1 //+build !noasm !appengine
4 // Minio Cloud Storage, (C) 2016 Minio, Inc.
6 // Licensed under the Apache License, Version 2.0 (the "License");
7 // you may not use this file except in compliance with the License.
8 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
20 // Based on SSE implementation from https://github.com/BLAKE2/BLAKE2/blob/master/sse/blake2b.c
22 // Use github.com/fwessels/asm2plan9s on this file to assemble instructions to their Plan9 equivalent
24 // Assembly code below essentially follows the ROUND macro (see blake2b-round.h) which is defined as:
26 // LOAD_MSG_ ##r ##_1(b0, b1); \
27 // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
28 // LOAD_MSG_ ##r ##_2(b0, b1); \
29 // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
30 // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); \
31 // LOAD_MSG_ ##r ##_3(b0, b1); \
32 // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
33 // LOAD_MSG_ ##r ##_4(b0, b1); \
34 // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); \
35 // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
37 // as well as the go equivalent in https://github.com/dchest/blake2b/blob/master/block.go
39 // As in the macro, G1/G2 in the 1st and 2nd half are identical (so literal copy of assembly)
41 // Rounds are also the same, except for the loading of the message (and rounds 1 & 11 and
42 // rounds 2 & 12 are identical)
46 \ // G1(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); first half of the G mixing step (rotations by 32 and 24). Registers: row1 = XMM0/XMM1, row2 = XMM2/XMM3, row3 = XMM4/XMM5, row4 = XMM6/XMM7, b0/b1 = XMM8/XMM9, XMM12 = byte-shuffle mask
47 LONG $0xd479c1c4; BYTE $0xc0 \ // VPADDQ XMM0,XMM0,XMM8 /* v0 += m[0], v1 += m[2] (message indices are those of the first round; XMM8 holds whatever the message-load code placed there) */
48 LONG $0xd471c1c4; BYTE $0xc9 \ // VPADDQ XMM1,XMM1,XMM9 /* v2 += m[4], v3 += m[6] (first-round indices; XMM9 is set by the message-load code) */
49 LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
50 LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
51 LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
52 LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
53 LONG $0xf670f9c5; BYTE $0xb1 \ // VPSHUFD XMM6,XMM6,0xb1 /* 0xb1 swaps the two dwords of each qword, i.e. rotate by 32: v12 = v12<<(64-32) | v12>>32, v13 = v13<<(64-32) | v13>>32 */
54 LONG $0xff70f9c5; BYTE $0xb1 \ // VPSHUFD XMM7,XMM7,0xb1 /* rotate by 32: v14 = v14<<(64-32) | v14>>32, v15 = v15<<(64-32) | v15>>32 */
55 LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
56 LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
57 LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
58 LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
59 LONG $0x0069c2c4; BYTE $0xd4 \ // VPSHUFB XMM2,XMM2,XMM12 /* byte shuffle through the mask in XMM12 (must hold the shuffle value at this point): v4 = v4<<(64-24) | v4>>24, v5 = v5<<(64-24) | v5>>24 */
60 LONG $0x0061c2c4; BYTE $0xdc // VPSHUFB XMM3,XMM3,XMM12 /* same mask: v6 = v6<<(64-24) | v6>>24, v7 = v7<<(64-24) | v7>>24 */
63 \ // G2(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h,b0,b1); second half of the G mixing step (rotations by 16 and 63). Same row registers as G1; b0/b1 = XMM10/XMM11; XMM15 is used as scratch
64 LONG $0xd479c1c4; BYTE $0xc2 \ // VPADDQ XMM0,XMM0,XMM10 /* v0 += m[1], v1 += m[3] (message indices are those of the first round; XMM10 holds whatever the message-load code placed there) */
65 LONG $0xd471c1c4; BYTE $0xcb \ // VPADDQ XMM1,XMM1,XMM11 /* v2 += m[5], v3 += m[7] (first-round indices; XMM11 is set by the message-load code) */
66 LONG $0xc2d4f9c5 \ // VPADDQ XMM0,XMM0,XMM2 /* v0 += v4, v1 += v5 */
67 LONG $0xcbd4f1c5 \ // VPADDQ XMM1,XMM1,XMM3 /* v2 += v6, v3 += v7 */
68 LONG $0xf0efc9c5 \ // VPXOR XMM6,XMM6,XMM0 /* v12 ^= v0, v13 ^= v1 */
69 LONG $0xf9efc1c5 \ // VPXOR XMM7,XMM7,XMM1 /* v14 ^= v2, v15 ^= v3 */
70 LONG $0xf670fbc5; BYTE $0x39 \ // VPSHUFLW XMM6,XMM6,0x39 /* rotates the four words of the low qword by one word; combined with next ... */
71 LONG $0xf670fac5; BYTE $0x39 \ // VPSHUFHW XMM6,XMM6,0x39 /* same for the high qword, giving v12 = v12<<(64-16) | v12>>16, v13 = v13<<(64-16) | v13>>16 */
72 LONG $0xff70fbc5; BYTE $0x39 \ // VPSHUFLW XMM7,XMM7,0x39 /* low qword; combined with next ... */
73 LONG $0xff70fac5; BYTE $0x39 \ // VPSHUFHW XMM7,XMM7,0x39 /* high qword: v14 = v14<<(64-16) | v14>>16, v15 = v15<<(64-16) | v15>>16 */
74 LONG $0xe6d4d9c5 \ // VPADDQ XMM4,XMM4,XMM6 /* v8 += v12, v9 += v13 */
75 LONG $0xefd4d1c5 \ // VPADDQ XMM5,XMM5,XMM7 /* v10 += v14, v11 += v15 */
76 LONG $0xd4efe9c5 \ // VPXOR XMM2,XMM2,XMM4 /* v4 ^= v8, v5 ^= v9 */
77 LONG $0xddefe1c5 \ // VPXOR XMM3,XMM3,XMM5 /* v6 ^= v10, v7 ^= v11 */
78 LONG $0xfad469c5 \ // VPADDQ XMM15,XMM2,XMM2 /* temp reg = reg*2, i.e. reg<<1 */
79 LONG $0xd273e9c5; BYTE $0x3f \ // VPSRLQ XMM2,XMM2,0x3f /* reg = reg>>63 */
80 LONG $0xef69c1c4; BYTE $0xd7 \ // VPXOR XMM2,XMM2,XMM15 /* reg<<1 and reg>>63 share no set bits, so XOR acts as OR: v4 = v4<<(64-63) | v4>>63, v5 = v5<<(64-63) | v5>>63 */
81 LONG $0xfbd461c5 \ // VPADDQ XMM15,XMM3,XMM3 /* temp reg = reg*2, i.e. reg<<1 */
82 LONG $0xd373e1c5; BYTE $0x3f \ // VPSRLQ XMM3,XMM3,0x3f /* reg = reg>>63 */
83 LONG $0xef61c1c4; BYTE $0xdf // VPXOR XMM3,XMM3,XMM15 /* XOR acts as OR (disjoint bits): v6 = v6<<(64-63) | v6>>63, v7 = v7<<(64-63) | v7>>63 */
86 \ // DIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h); re-arranges rows 2-4 across their register pairs: row3 halves are swapped, rows 2 and 4 are shifted by one qword in opposite directions. Clobbers XMM13 (t0), XMM14 (t1) and XMM15 (scratch)
87 MOVOU X6, X13 \ /* t0 = row4l (saved because X6 is reused as scratch below) */
88 MOVOU X2, X14 \ /* t1 = row2l */
89 MOVOU X4, X6 \ /* row4l = row3l: X6 temporarily holds row3l for the swap */
90 MOVOU X5, X4 \ /* row3l = row3h */
91 MOVOU X6, X5 \ /* row3h = row4l, which now holds the old row3l */
92 LONG $0x6c1141c4; BYTE $0xfd \ // VPUNPCKLQDQ XMM15, XMM13, XMM13 /* XMM15 = _mm_unpacklo_epi64(t0, t0) */
93 LONG $0x6d41c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM7, XMM15 /* row4l = _mm_unpackhi_epi64(row4h, XMM15): low qword = high of row4h, high qword = low of t0 */
94 LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* XMM15 = _mm_unpacklo_epi64(row4h, row4h) */
95 LONG $0x6d11c1c4; BYTE $0xff \ // VPUNPCKHQDQ XMM7, XMM13, XMM15 /* row4h = _mm_unpackhi_epi64(t0, XMM15): low qword = high of t0, high qword = low of old row4h */
96 LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* XMM15 = _mm_unpacklo_epi64(row2h, row2h) */
97 LONG $0x6d69c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM2, XMM15 /* row2l = _mm_unpackhi_epi64(row2l, XMM15): low qword = high of row2l, high qword = low of row2h */
98 LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* XMM15 = _mm_unpacklo_epi64(t1, t1) */
99 LONG $0x6d61c1c4; BYTE $0xdf // VPUNPCKHQDQ XMM3, XMM3, XMM15 /* row2h = _mm_unpackhi_epi64(row2h, XMM15): low qword = high of row2h, high qword = low of t1 (old row2l) */
// UNDIAGONALIZE re-arranges rows 2-4 of the state across their register pairs:
//   - row3 (X4/X5): the two halves are swapped;
//   - row2 (X2/X3) and row4 (X6/X7): each is shifted by one qword across the
//     pair, in opposite directions (see the per-instruction comments).
// row1 (X0/X1) is not touched. Clobbers X13 (t0), X14 (t1) and X15 (scratch).
101 #define UNDIAGONALIZE \
102 \ // UNDIAGONALIZE(row1l,row2l,row3l,row4l,row1h,row2h,row3h,row4h);
103 MOVOU X4, X13 \ /* t0 = row3l */
104 MOVOU X5, X4 \ /* row3l = row3h */
105 MOVOU X13, X5 \ /* row3h = t0 (completes the row3l/row3h swap) */
106 MOVOU X2, X13 \ /* t0 = row2l (needed after X2 is overwritten below) */
107 MOVOU X6, X14 \ /* t1 = row4l (needed after X6 is overwritten below) */
108 LONG $0xfa6c69c5 \ // VPUNPCKLQDQ XMM15, XMM2, XMM2 /* XMM15 = _mm_unpacklo_epi64(row2l, row2l) */
109 LONG $0x6d61c1c4; BYTE $0xd7 \ // VPUNPCKHQDQ XMM2, XMM3, XMM15 /* row2l = _mm_unpackhi_epi64(row2h, XMM15): low qword = high of row2h, high qword = low of old row2l */
110 LONG $0xfb6c61c5 \ // VPUNPCKLQDQ XMM15, XMM3, XMM3 /* XMM15 = _mm_unpacklo_epi64(row2h, row2h) */
111 LONG $0x6d11c1c4; BYTE $0xdf \ // VPUNPCKHQDQ XMM3, XMM13, XMM15 /* row2h = _mm_unpackhi_epi64(t0, XMM15): low qword = high of t0 (old row2l), high qword = low of old row2h */
112 LONG $0xff6c41c5 \ // VPUNPCKLQDQ XMM15, XMM7, XMM7 /* XMM15 = _mm_unpacklo_epi64(row4h, row4h) */
113 LONG $0x6d49c1c4; BYTE $0xf7 \ // VPUNPCKHQDQ XMM6, XMM6, XMM15 /* row4l = _mm_unpackhi_epi64(row4l, XMM15): low qword = high of row4l, high qword = low of row4h */
114 LONG $0x6c0941c4; BYTE $0xfe \ // VPUNPCKLQDQ XMM15, XMM14, XMM14 /* XMM15 = _mm_unpacklo_epi64(t1, t1) */
115 LONG $0x6d41c1c4; BYTE $0xff // VPUNPCKHQDQ XMM7, XMM7, XMM15 /* row4h = _mm_unpackhi_epi64(row4h, XMM15): low qword = high of row4h, high qword = low of t1 (old row4l) */
// LOAD_SHUFFLE loads the first 16 bytes of the shffle slice argument into X12,
// the register the VPSHUFB instructions in G1 use as their shuffle control.
// It addresses the argument through FP (shffle+120(FP)), so it is only valid
// inside a function with blockAVXLoop's argument layout. Clobbers SI and X12.
// NOTE(review): the message-load sequences in blockAVXLoop also use X12 as a
// temporary, so presumably this must be re-run before each G1 - confirm at the
// call sites.
117 #define LOAD_SHUFFLE \
118 \ // Load shuffle value
119 MOVQ shffle+120(FP), SI \ // SI: &shuffle (data pointer of the shffle slice)
120 MOVOU 0(SI), X12 // X12 = 03040506 07000102 0b0c0d0e 0f08090a
122 // func blockAVXLoop(p []uint8, in, iv, t, f, shffle, out []uint64)
123 TEXT ·blockAVXLoop(SB), 7, $0
126 // DX: message pointer
127 // SI: temp pointer for loading
129 // X8 - X11: m[0] - m[7]
130 // X12: shuffle value
131 // X13 - X15: temp registers
134 MOVQ in+24(FP), SI // SI: &in
135 MOVOU 0(SI), X0 // X0 = in[0]+in[1] /* row1l = LOAD( &S->h[0] ); */
136 MOVOU 16(SI), X1 // X1 = in[2]+in[3] /* row1h = LOAD( &S->h[2] ); */
137 MOVOU 32(SI), X2 // X2 = in[4]+in[5] /* row2l = LOAD( &S->h[4] ); */
138 MOVOU 48(SI), X3 // X3 = in[6]+in[7] /* row2h = LOAD( &S->h[6] ); */
140 // Already store digest into &out (so we can reload it later generically)
141 MOVQ out+144(FP), SI // SI: &out
142 MOVOU X0, 0(SI) // out[0]+out[1] = X0
143 MOVOU X1, 16(SI) // out[2]+out[3] = X1
144 MOVOU X2, 32(SI) // out[4]+out[5] = X2
145 MOVOU X3, 48(SI) // out[6]+out[7] = X3
147 // Initialize message pointer and loop counter
148 MOVQ message+0(FP), DX // DX: &p (message)
149 MOVQ message_len+8(FP), R8 // R8: len(message)
150 SHRQ $7, R8 // len(message) / 128
156 MOVQ t+72(FP), SI // SI: &t
158 ADDQ $128, R9 // /* d.t[0] += BlockSize */
160 CMPQ R9, $128 // /* if d.t[0] < BlockSize { */
163 ADDQ $1, R9 // /* d.t[1]++ */
167 // Load initialization vector
168 MOVQ iv+48(FP), SI // SI: &iv
169 MOVOU 0(SI), X4 // X4 = iv[0]+iv[1] /* row3l = LOAD( &blake2b_IV[0] ); */
170 MOVOU 16(SI), X5 // X5 = iv[2]+iv[3] /* row3h = LOAD( &blake2b_IV[2] ); */
171 MOVOU 32(SI), X6 // X6 = iv[4]+iv[5] /* LOAD( &blake2b_IV[4] ) */
172 MOVOU 48(SI), X7 // X7 = iv[6]+iv[7] /* LOAD( &blake2b_IV[6] ) */
173 MOVQ t+72(FP), SI // SI: &t
174 MOVOU 0(SI), X8 // X8 = t[0]+t[1] /* LOAD( &S->t[0] ) */
175 PXOR X8, X6 // X6 = X6 ^ X8 /* row4l = _mm_xor_si128( , ); */
176 MOVQ t+96(FP), SI // SI: &f
177 MOVOU 0(SI), X8 // X8 = f[0]+f[1] /* LOAD( &S->f[0] ) */
178 PXOR X8, X7 // X7 = X7 ^ X8 /* row4h = _mm_xor_si128( , ); */
180 ///////////////////////////////////////////////////////////////////////////
182 ///////////////////////////////////////////////////////////////////////////
184 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
185 MOVOU 0(DX), X12 // X12 = m[0]+m[1]
186 MOVOU 16(DX), X13 // X13 = m[2]+m[3]
187 MOVOU 32(DX), X14 // X14 = m[4]+m[5]
188 MOVOU 48(DX), X15 // X15 = m[6]+m[7]
189 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
190 LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
191 LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
192 LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
199 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
200 MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
201 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
202 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
203 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
204 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
205 LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
206 LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
207 LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
214 ///////////////////////////////////////////////////////////////////////////
216 ///////////////////////////////////////////////////////////////////////////
218 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
219 MOVOU 112(DX), X12 // X12 = m[14]+m[15]
220 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
221 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
222 MOVOU 96(DX), X15 // X15 = m[12]+m[13]
223 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
224 LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
225 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
226 MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
227 LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
228 LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
235 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
236 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
237 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
238 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
239 LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
240 LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
241 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
242 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
243 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
244 LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
245 LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
252 ///////////////////////////////////////////////////////////////////////////
254 ///////////////////////////////////////////////////////////////////////////
256 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
257 MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
258 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
259 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
260 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
261 LONG $0x0f0943c4; WORD $0x08c5 // VPALIGNR XMM8, XMM14, XMM13, 0x8 /* m[11], m[12] */
262 LONG $0x6d1941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM12, XMM15 /* m[5], m[15] */
263 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
264 MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
265 MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
266 LONG $0x6c0141c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM15, XMM12 /* m[8], m[0] */
267 LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
268 LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[2], ___ */
275 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
276 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
277 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
278 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
279 MOVOU 80(DX), X15 // X15 = m[10]+m[11]
280 LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[3] */
281 LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[10], ___ */
282 LONG $0x6d1141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM13, XMM14 /* m[7], m[9] */
283 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
284 MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
285 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
286 LONG $0x6c0141c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM15, XMM13 /* m[14], m[6] */
287 LONG $0x0f0943c4; WORD $0x08dc // VPALIGNR XMM11, XMM14, XMM12, 0x8 /* m[1], m[4] */
294 ///////////////////////////////////////////////////////////////////////////
296 ///////////////////////////////////////////////////////////////////////////
298 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
299 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
300 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
301 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
302 MOVOU 96(DX), X15 // X15 = m[12]+m[13]
303 LONG $0x6d1141c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM13, XMM12 /* m[7], m[3] */
304 LONG $0x6d0141c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM15, XMM14 /* m[13], m[11] */
305 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
306 MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
307 MOVOU 112(DX), X14 // X14 = m[14]+m[15]
308 LONG $0x6d1141c4; BYTE $0xd4 // VPUNPCKHQDQ XMM10, XMM13, XMM12 /* m[9], m[1] */
309 LONG $0x6c0141c4; BYTE $0xde // VPUNPCKLQDQ XMM11, XMM15, XMM14 /* m[12], m[14] */
316 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
317 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
318 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
319 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
320 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
321 LONG $0x6d1141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM13, XMM13 /* ___, m[5] */
322 LONG $0x6c1941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM12, XMM8 /* m[2], ____ */
323 LONG $0x6d0141c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM15, XMM15 /* ___, m[15] */
324 LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[4], ____ */
325 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
326 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
327 MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
328 LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[6], m[10] */
329 LONG $0x6c1941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM12, XMM15 /* m[0], m[8] */
336 ///////////////////////////////////////////////////////////////////////////
338 ///////////////////////////////////////////////////////////////////////////
340 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
341 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
342 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
343 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
344 MOVOU 80(DX), X15 // X15 = m[10]+m[11]
345 LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[9], m[5] */
346 LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[2], m[10] */
347 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
348 MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
349 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
350 LONG $0x6d0941c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM14, XMM14 /* ___, m[7] */
351 LONG $0x6c1941c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM12, XMM10 /* m[0], ____ */
352 LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[15] */
353 LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[4], ____ */
360 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
361 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
362 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
363 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
364 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
365 LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[11] */
366 LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[14], ____ */
367 LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
368 LONG $0x6c1141c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM13, XMM9 /* m[6], ____ */
369 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
370 MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
371 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
372 LONG $0x0f0943c4; WORD $0x08d4 // VPALIGNR XMM10, XMM14, XMM12, 0x8 /* m[1], m[12] */
373 LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[13] */
374 LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[8], ____ */
381 ///////////////////////////////////////////////////////////////////////////
383 ///////////////////////////////////////////////////////////////////////////
385 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
386 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
387 MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
388 MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
389 MOVOU 64(DX), X15 // X15 = m[8]+ m[9]
390 LONG $0x6c1141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM13, XMM14 /* m[2], m[6] */
391 LONG $0x6c1941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM12, XMM15 /* m[0], m[8] */
392 MOVOU 80(DX), X12 // X12 = m[10]+m[11]
393 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
394 LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[10] */
395 LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[11], m[3] */
402 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
403 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
404 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
405 MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
406 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
407 LONG $0x6d0941c4; BYTE $0xc6 // VPUNPCKHQDQ XMM8, XMM14, XMM14 /* ___, m[7] */
408 LONG $0x6c1141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM13, XMM8 /* m[4], ____ */
409 LONG $0x6d0141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM15, XMM12 /* m[15], m[1] */
410 MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
411 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
412 LONG $0x6d0941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM14, XMM13 /* m[13], m[5] */
413 LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[9] */
414 LONG $0x6c0141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM15, XMM11 /* m[14], ____ */
421 ///////////////////////////////////////////////////////////////////////////
423 ///////////////////////////////////////////////////////////////////////////
425 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
426 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
427 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
428 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
429 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
430 LONG $0x6d1941c4; BYTE $0xc4 // VPUNPCKHQDQ XMM8, XMM12, XMM12 /* ___, m[1] */
431 LONG $0x6c0941c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM14, XMM8 /* m[12], ____ */
432 LONG $0x6c0141c4; BYTE $0xcd // VPUNPCKLQDQ XMM9, XMM15, XMM13 /* m[14], m[4] */
433 MOVOU 80(DX), X12 // X12 = m[10]+m[11]
434 LONG $0x6d1141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM13, XMM15 /* m[5], m[15] */
435 LONG $0x0f1943c4; WORD $0x08de // VPALIGNR XMM11, XMM12, XMM14, 0x8 /* m[13], m[10] */
442 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
443 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
444 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
445 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
446 MOVOU 80(DX), X15 // X15 = m[10]+m[11]
447 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[6] */
448 LONG $0x0f0943c4; WORD $0x08ce // VPALIGNR XMM9, XMM14, XMM14, 0x8 /* m[9], m[8] */
449 MOVOU 16(DX), X14 // X14 = m[2]+ m[3]
450 LONG $0x6d1141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM13, XMM14 /* m[7], m[3] */
451 LONG $0x6d0141c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM15, XMM15 /* ___, m[11] */
452 LONG $0x6c0941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM14, XMM11 /* m[2], ____ */
459 ///////////////////////////////////////////////////////////////////////////
461 ///////////////////////////////////////////////////////////////////////////
463 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
464 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
465 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
466 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
467 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
468 LONG $0x6d0941c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM14, XMM13 /* m[13], m[7] */
469 LONG $0x6d1941c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM12, XMM12 /* ___, m[3] */
470 LONG $0x6c0941c4; BYTE $0xc9 // VPUNPCKLQDQ XMM9, XMM14, XMM9 /* m[12], ____ */
471 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
472 MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
473 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
474 LONG $0x0f0143c4; WORD $0x08d6 // VPALIGNR XMM10, XMM15, XMM14, 0x8 /* m[11], m[14] */
475 LONG $0x6d1941c4; BYTE $0xdd // VPUNPCKHQDQ XMM11, XMM12, XMM13 /* m[1], m[9] */
482 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
483 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
484 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
485 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
486 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
487 LONG $0x6d1141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM13, XMM15 /* m[5], m[15] */
488 LONG $0x6c0941c4; BYTE $0xcc // VPUNPCKLQDQ XMM9, XMM14, XMM12 /* m[8], m[2] */
489 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
490 MOVOU 48(DX), X14 // X14 = m[6]+ m[7]
491 MOVOU 80(DX), X15 // X15 = m[10]+m[11]
492 LONG $0x6c1941c4; BYTE $0xd5 // VPUNPCKLQDQ XMM10, XMM12, XMM13 /* m[0], m[4] */
493 LONG $0x6c0941c4; BYTE $0xdf // VPUNPCKLQDQ XMM11, XMM14, XMM15 /* m[6], m[10] */
500 ///////////////////////////////////////////////////////////////////////////
502 ///////////////////////////////////////////////////////////////////////////
504 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
505 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
506 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
507 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
508 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
509 LONG $0x6c1141c4; BYTE $0xc7 // VPUNPCKLQDQ XMM8, XMM13, XMM15 /* m[6], m[14] */
510 LONG $0x0f1943c4; WORD $0x08ce // VPALIGNR XMM9, XMM12, XMM14, 0x8 /* m[11], m[0] */
511 MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
512 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
513 LONG $0x6d0141c4; BYTE $0xd6 // VPUNPCKHQDQ XMM10, XMM15, XMM14 /* m[15], m[9] */
514 LONG $0x0f0943c4; WORD $0x08dd // VPALIGNR XMM11, XMM14, XMM13, 0x8 /* m[3], m[8] */
521 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
522 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
523 MOVOU 16(DX), X13 // X13 = m[2]+ m[3]
524 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
525 MOVOU 96(DX), X15 // X15 = m[12]+m[13]
526 LONG $0x6d0141c4; BYTE $0xc7 // VPUNPCKHQDQ XMM8, XMM15, XMM15 /* ___, m[13] */
527 LONG $0x6c0141c4; BYTE $0xc0 // VPUNPCKLQDQ XMM8, XMM15, XMM8 /* m[12], ____ */
528 LONG $0x0f0943c4; WORD $0x08cc // VPALIGNR XMM9, XMM14, XMM12, 0x8 /* m[1], m[10] */
529 MOVOU 32(DX), X12 // X12 = m[4]+ m[5]
530 MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
531 LONG $0x6d0141c4; BYTE $0xd7 // VPUNPCKHQDQ XMM10, XMM15, XMM15 /* ___, m[7] */
532 LONG $0x6c1141c4; BYTE $0xd2 // VPUNPCKLQDQ XMM10, XMM13, XMM10 /* m[2], ____ */
533 LONG $0x6d1941c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM12, XMM12 /* ___, m[5] */
534 LONG $0x6c1941c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM12, XMM11 /* m[4], ____ */
541 ///////////////////////////////////////////////////////////////////////////
543 ///////////////////////////////////////////////////////////////////////////
545 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
546 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
547 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
548 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
549 MOVOU 80(DX), X15 // X15 = m[10]+m[11]
550 LONG $0x6c0141c4; BYTE $0xc6 // VPUNPCKLQDQ XMM8, XMM15, XMM14 /* m[10], m[8] */
551 LONG $0x6d1141c4; BYTE $0xcc // VPUNPCKHQDQ XMM9, XMM13, XMM12 /* m[7], m[1] */
552 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
553 MOVOU 32(DX), X14 // X14 = m[4]+ m[5]
554 LONG $0x6c1941c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM12, XMM14 /* m[2], m[4] */
555 LONG $0x6d0941c4; BYTE $0xde // VPUNPCKHQDQ XMM11, XMM14, XMM14 /* ___, m[5] */
556 LONG $0x6c1141c4; BYTE $0xdb // VPUNPCKLQDQ XMM11, XMM13, XMM11 /* m[6], ____ */
563 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
564 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
565 MOVOU 64(DX), X13 // X13 = m[8]+ m[9]
566 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
567 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
568 LONG $0x6d0141c4; BYTE $0xc5 // VPUNPCKHQDQ XMM8, XMM15, XMM13 /* m[15], m[9] */
569 LONG $0x6d1941c4; BYTE $0xce // VPUNPCKHQDQ XMM9, XMM12, XMM14 /* m[3], m[13] */
570 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
571 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
572 LONG $0x0f0143c4; WORD $0x08d5 // VPALIGNR XMM10, XMM15, XMM13, 0x8 /* m[11], m[14] */
573 LONG $0x6c0941c4; BYTE $0xdc // VPUNPCKLQDQ XMM11, XMM14, XMM12 /* m[12], m[0] */
580 ///////////////////////////////////////////////////////////////////////////
582 ///////////////////////////////////////////////////////////////////////////
584 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
585 MOVOU 0(DX), X12 // X12 = m[0]+m[1]
586 MOVOU 16(DX), X13 // X13 = m[2]+m[3]
587 MOVOU 32(DX), X14 // X14 = m[4]+m[5]
588 MOVOU 48(DX), X15 // X15 = m[6]+m[7]
589 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[0], m[2] */
590 LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[4], m[6] */
591 LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[1], m[3] */
592 LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[5], m[7] */
599 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
600 MOVOU 64(DX), X12 // X12 = m[8]+ m[9]
601 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
602 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
603 MOVOU 112(DX), X15 // X15 = m[14]+m[15]
604 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[8],m[10] */
605 LONG $0x6c0941c4; BYTE $0xcf // VPUNPCKLQDQ XMM9, XMM14, XMM15 /* m[12],m[14] */
606 LONG $0x6d1941c4; BYTE $0xd5 // VPUNPCKHQDQ XMM10, XMM12, XMM13 /* m[9],m[11] */
607 LONG $0x6d0941c4; BYTE $0xdf // VPUNPCKHQDQ XMM11, XMM14, XMM15 /* m[13],m[15] */
614 ///////////////////////////////////////////////////////////////////////////
616 ///////////////////////////////////////////////////////////////////////////
618 // LOAD_MSG_ ##r ##_1 / ##_2(b0, b1); (X12 is temp register)
619 MOVOU 112(DX), X12 // X12 = m[14]+m[15]
620 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
621 MOVOU 64(DX), X14 // X14 = m[8]+ m[9]
622 MOVOU 96(DX), X15 // X15 = m[12]+m[13]
623 LONG $0x6c1941c4; BYTE $0xc5 // VPUNPCKLQDQ XMM8, XMM12, XMM13 /* m[14], m[4] */
624 LONG $0x6d0941c4; BYTE $0xcf // VPUNPCKHQDQ XMM9, XMM14, XMM15 /* m[9], m[13] */
625 MOVOU 80(DX), X13 // X13 = m[10]+m[11]
626 MOVOU 48(DX), X15 // X15 = m[6]+ m[7]
627 LONG $0x6c1141c4; BYTE $0xd6 // VPUNPCKLQDQ XMM10, XMM13, XMM14 /* m[10], m[8] */
628 LONG $0x0f0143c4; WORD $0x08dc // VPALIGNR XMM11, XMM15, XMM12, 0x8 /* m[15], m[6] */
635 // LOAD_MSG_ ##r ##_3 / ##_4(b0, b1); (X12 is temp register)
636 MOVOU 0(DX), X12 // X12 = m[0]+ m[1]
637 MOVOU 32(DX), X13 // X13 = m[4]+ m[5]
638 MOVOU 80(DX), X14 // X14 = m[10]+m[11]
639 LONG $0x0f1943c4; WORD $0x08c4 // VPALIGNR XMM8, XMM12, XMM12, 0x8 /* m[1], m[0] */
640 LONG $0x6d0941c4; BYTE $0xcd // VPUNPCKHQDQ XMM9, XMM14, XMM13 /* m[11], m[5] */
641 MOVOU 16(DX), X12 // X12 = m[2]+ m[3]
642 MOVOU 48(DX), X13 // X13 = m[6]+ m[7]
643 MOVOU 96(DX), X14 // X14 = m[12]+m[13]
644 LONG $0x6c0941c4; BYTE $0xd4 // VPUNPCKLQDQ XMM10, XMM14, XMM12 /* m[12], m[2] */
645 LONG $0x6d1141c4; BYTE $0xdc // VPUNPCKHQDQ XMM11, XMM13, XMM12 /* m[7], m[3] */
652 // Reload digest (most current value store in &out)
653 MOVQ out+144(FP), SI // SI: &out
654 MOVOU 0(SI), X12 // X12 = out[0]+out[1] /* LOAD( &S->h[0] ) */
655 MOVOU 16(SI), X13 // X13 = out[2]+out[3] /* LOAD( &S->h[2] ) */
656 MOVOU 32(SI), X14 // X14 = out[4]+out[5] /* LOAD( &S->h[4] ) */
657 MOVOU 48(SI), X15 // X15 = out[6]+out[7] /* LOAD( &S->h[6] ) */
659 // Final computations and prepare for storing
660 PXOR X4, X0 // X0 = X0 ^ X4 /* row1l = _mm_xor_si128( row3l, row1l ); */
661 PXOR X5, X1 // X1 = X1 ^ X5 /* row1h = _mm_xor_si128( row3h, row1h ); */
662 PXOR X12, X0 // X0 = X0 ^ X12 /* STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) ); */
663 PXOR X13, X1 // X1 = X1 ^ X13 /* STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) ); */
664 PXOR X6, X2 // X2 = X2 ^ X6 /* row2l = _mm_xor_si128( row4l, row2l ); */
665 PXOR X7, X3 // X3 = X3 ^ X7 /* row2h = _mm_xor_si128( row4h, row2h ); */
666 PXOR X14, X2 // X2 = X2 ^ X14 /* STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) ); */
667 PXOR X15, X3 // X3 = X3 ^ X15 /* STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) ); */
669 // Store digest into &out
670 MOVQ out+144(FP), SI // SI: &out
671 MOVOU X0, 0(SI) // out[0]+out[1] = X0
672 MOVOU X1, 16(SI) // out[2]+out[3] = X1
673 MOVOU X2, 32(SI) // out[4]+out[5] = X2
674 MOVOU X3, 48(SI) // out[6]+out[7] = X3
676 // Increment message pointer and check if there's more to do
677 ADDQ $128, DX // message += 128