// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build 386,!gccgo,!appengine
// iv0 holds the first four 32-bit words of the BLAKE2s initialization
// vector (the same constants as the SHA-256 initial hash values h0..h3).
DATA iv0<>+0x00(SB)/4, $0x6a09e667
DATA iv0<>+0x04(SB)/4, $0xbb67ae85
DATA iv0<>+0x08(SB)/4, $0x3c6ef372
DATA iv0<>+0x0c(SB)/4, $0xa54ff53a
GLOBL iv0<>(SB), (NOPTR+RODATA), $16
// iv1 holds the last four 32-bit words of the BLAKE2s initialization
// vector (the same constants as the SHA-256 initial hash values h4..h7).
DATA iv1<>+0x00(SB)/4, $0x510e527f
DATA iv1<>+0x04(SB)/4, $0x9b05688c
DATA iv1<>+0x08(SB)/4, $0x1f83d9ab
DATA iv1<>+0x0c(SB)/4, $0x5be0cd19
GLOBL iv1<>(SB), (NOPTR+RODATA), $16
// rol16 is a PSHUFB shuffle mask that rotates each 32-bit lane by 16 bits:
// within every 4-byte lane the mask selects source bytes 2,3,0,1.
DATA rol16<>+0x00(SB)/8, $0x0504070601000302
DATA rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A
GLOBL rol16<>(SB), (NOPTR+RODATA), $16
// rol8 is a PSHUFB shuffle mask that rotates each 32-bit lane right by
// 8 bits (left by 24): within every 4-byte lane the mask selects source
// bytes 1,2,3,0.
DATA rol8<>+0x00(SB)/8, $0x0407060500030201
DATA rol8<>+0x08(SB)/8, $0x0C0F0E0D080B0A09
GLOBL rol8<>(SB), (NOPTR+RODATA), $16
// counter holds the per-block increment for the message byte counter:
// 0x40 = 64, the BLAKE2s block size in bytes.
// NOTE(review): the counter-update code is not visible in this chunk —
// confirm the increment is applied once per processed block.
DATA counter<>+0x00(SB)/8, $0x40
DATA counter<>+0x08(SB)/8, $0x0
GLOBL counter<>(SB), (NOPTR+RODATA), $16
33 #define ROTL_SSE2(n, t, v) \
39 #define ROTL_SSSE3(c, v) \
42 #define ROUND_SSE2(v0, v1, v2, v3, m0, m1, m2, m3, t) \
46 ROTL_SSE2(16, t, v3); \
49 ROTL_SSE2(20, t, v1); \
53 ROTL_SSE2(24, t, v3); \
56 ROTL_SSE2(25, t, v1); \
57 PSHUFL $0x39, v1, v1; \
58 PSHUFL $0x4E, v2, v2; \
59 PSHUFL $0x93, v3, v3; \
63 ROTL_SSE2(16, t, v3); \
66 ROTL_SSE2(20, t, v1); \
70 ROTL_SSE2(24, t, v3); \
73 ROTL_SSE2(25, t, v1); \
74 PSHUFL $0x39, v3, v3; \
75 PSHUFL $0x4E, v2, v2; \
78 #define ROUND_SSSE3(v0, v1, v2, v3, m0, m1, m2, m3, t, c16, c8) \
82 ROTL_SSSE3(c16, v3); \
85 ROTL_SSE2(20, t, v1); \
92 ROTL_SSE2(25, t, v1); \
93 PSHUFL $0x39, v1, v1; \
94 PSHUFL $0x4E, v2, v2; \
95 PSHUFL $0x93, v3, v3; \
99 ROTL_SSSE3(c16, v3); \
102 ROTL_SSE2(20, t, v1); \
106 ROTL_SSSE3(c8, v3); \
109 ROTL_SSE2(25, t, v1); \
110 PSHUFL $0x39, v3, v3; \
111 PSHUFL $0x4E, v2, v2; \
114 #define PRECOMPUTE(dst, off, src, t) \
116 MOVL t, 0*4+off+0(dst); \
117 MOVL t, 9*4+off+64(dst); \
118 MOVL t, 5*4+off+128(dst); \
119 MOVL t, 14*4+off+192(dst); \
120 MOVL t, 4*4+off+256(dst); \
121 MOVL t, 2*4+off+320(dst); \
122 MOVL t, 8*4+off+384(dst); \
123 MOVL t, 12*4+off+448(dst); \
124 MOVL t, 3*4+off+512(dst); \
125 MOVL t, 15*4+off+576(dst); \
127 MOVL t, 4*4+off+0(dst); \
128 MOVL t, 8*4+off+64(dst); \
129 MOVL t, 14*4+off+128(dst); \
130 MOVL t, 5*4+off+192(dst); \
131 MOVL t, 12*4+off+256(dst); \
132 MOVL t, 11*4+off+320(dst); \
133 MOVL t, 1*4+off+384(dst); \
134 MOVL t, 6*4+off+448(dst); \
135 MOVL t, 10*4+off+512(dst); \
136 MOVL t, 3*4+off+576(dst); \
138 MOVL t, 1*4+off+0(dst); \
139 MOVL t, 13*4+off+64(dst); \
140 MOVL t, 6*4+off+128(dst); \
141 MOVL t, 8*4+off+192(dst); \
142 MOVL t, 2*4+off+256(dst); \
143 MOVL t, 0*4+off+320(dst); \
144 MOVL t, 14*4+off+384(dst); \
145 MOVL t, 11*4+off+448(dst); \
146 MOVL t, 12*4+off+512(dst); \
147 MOVL t, 4*4+off+576(dst); \
149 MOVL t, 5*4+off+0(dst); \
150 MOVL t, 15*4+off+64(dst); \
151 MOVL t, 9*4+off+128(dst); \
152 MOVL t, 1*4+off+192(dst); \
153 MOVL t, 11*4+off+256(dst); \
154 MOVL t, 7*4+off+320(dst); \
155 MOVL t, 13*4+off+384(dst); \
156 MOVL t, 3*4+off+448(dst); \
157 MOVL t, 6*4+off+512(dst); \
158 MOVL t, 10*4+off+576(dst); \
160 MOVL t, 2*4+off+0(dst); \
161 MOVL t, 1*4+off+64(dst); \
162 MOVL t, 15*4+off+128(dst); \
163 MOVL t, 10*4+off+192(dst); \
164 MOVL t, 6*4+off+256(dst); \
165 MOVL t, 8*4+off+320(dst); \
166 MOVL t, 3*4+off+384(dst); \
167 MOVL t, 13*4+off+448(dst); \
168 MOVL t, 14*4+off+512(dst); \
169 MOVL t, 5*4+off+576(dst); \
171 MOVL t, 6*4+off+0(dst); \
172 MOVL t, 11*4+off+64(dst); \
173 MOVL t, 2*4+off+128(dst); \
174 MOVL t, 9*4+off+192(dst); \
175 MOVL t, 1*4+off+256(dst); \
176 MOVL t, 13*4+off+320(dst); \
177 MOVL t, 4*4+off+384(dst); \
178 MOVL t, 8*4+off+448(dst); \
179 MOVL t, 15*4+off+512(dst); \
180 MOVL t, 7*4+off+576(dst); \
182 MOVL t, 3*4+off+0(dst); \
183 MOVL t, 7*4+off+64(dst); \
184 MOVL t, 13*4+off+128(dst); \
185 MOVL t, 12*4+off+192(dst); \
186 MOVL t, 10*4+off+256(dst); \
187 MOVL t, 1*4+off+320(dst); \
188 MOVL t, 9*4+off+384(dst); \
189 MOVL t, 14*4+off+448(dst); \
190 MOVL t, 0*4+off+512(dst); \
191 MOVL t, 6*4+off+576(dst); \
193 MOVL t, 7*4+off+0(dst); \
194 MOVL t, 14*4+off+64(dst); \
195 MOVL t, 10*4+off+128(dst); \
196 MOVL t, 0*4+off+192(dst); \
197 MOVL t, 5*4+off+256(dst); \
198 MOVL t, 9*4+off+320(dst); \
199 MOVL t, 12*4+off+384(dst); \
200 MOVL t, 1*4+off+448(dst); \
201 MOVL t, 13*4+off+512(dst); \
202 MOVL t, 2*4+off+576(dst); \
204 MOVL t, 8*4+off+0(dst); \
205 MOVL t, 5*4+off+64(dst); \
206 MOVL t, 4*4+off+128(dst); \
207 MOVL t, 15*4+off+192(dst); \
208 MOVL t, 14*4+off+256(dst); \
209 MOVL t, 3*4+off+320(dst); \
210 MOVL t, 11*4+off+384(dst); \
211 MOVL t, 10*4+off+448(dst); \
212 MOVL t, 7*4+off+512(dst); \
213 MOVL t, 1*4+off+576(dst); \
215 MOVL t, 12*4+off+0(dst); \
216 MOVL t, 2*4+off+64(dst); \
217 MOVL t, 11*4+off+128(dst); \
218 MOVL t, 4*4+off+192(dst); \
219 MOVL t, 0*4+off+256(dst); \
220 MOVL t, 15*4+off+320(dst); \
221 MOVL t, 10*4+off+384(dst); \
222 MOVL t, 7*4+off+448(dst); \
223 MOVL t, 5*4+off+512(dst); \
224 MOVL t, 9*4+off+576(dst); \
226 MOVL t, 9*4+off+0(dst); \
227 MOVL t, 4*4+off+64(dst); \
228 MOVL t, 8*4+off+128(dst); \
229 MOVL t, 13*4+off+192(dst); \
230 MOVL t, 3*4+off+256(dst); \
231 MOVL t, 5*4+off+320(dst); \
232 MOVL t, 7*4+off+384(dst); \
233 MOVL t, 15*4+off+448(dst); \
234 MOVL t, 11*4+off+512(dst); \
235 MOVL t, 0*4+off+576(dst); \
237 MOVL t, 13*4+off+0(dst); \
238 MOVL t, 10*4+off+64(dst); \
239 MOVL t, 0*4+off+128(dst); \
240 MOVL t, 3*4+off+192(dst); \
241 MOVL t, 9*4+off+256(dst); \
242 MOVL t, 6*4+off+320(dst); \
243 MOVL t, 15*4+off+384(dst); \
244 MOVL t, 4*4+off+448(dst); \
245 MOVL t, 2*4+off+512(dst); \
246 MOVL t, 12*4+off+576(dst); \
248 MOVL t, 10*4+off+0(dst); \
249 MOVL t, 12*4+off+64(dst); \
250 MOVL t, 1*4+off+128(dst); \
251 MOVL t, 6*4+off+192(dst); \
252 MOVL t, 13*4+off+256(dst); \
253 MOVL t, 4*4+off+320(dst); \
254 MOVL t, 0*4+off+384(dst); \
255 MOVL t, 2*4+off+448(dst); \
256 MOVL t, 8*4+off+512(dst); \
257 MOVL t, 14*4+off+576(dst); \
259 MOVL t, 14*4+off+0(dst); \
260 MOVL t, 3*4+off+64(dst); \
261 MOVL t, 7*4+off+128(dst); \
262 MOVL t, 2*4+off+192(dst); \
263 MOVL t, 15*4+off+256(dst); \
264 MOVL t, 12*4+off+320(dst); \
265 MOVL t, 6*4+off+384(dst); \
266 MOVL t, 0*4+off+448(dst); \
267 MOVL t, 9*4+off+512(dst); \
268 MOVL t, 11*4+off+576(dst); \
270 MOVL t, 11*4+off+0(dst); \
271 MOVL t, 0*4+off+64(dst); \
272 MOVL t, 12*4+off+128(dst); \
273 MOVL t, 7*4+off+192(dst); \
274 MOVL t, 8*4+off+256(dst); \
275 MOVL t, 14*4+off+320(dst); \
276 MOVL t, 2*4+off+384(dst); \
277 MOVL t, 5*4+off+448(dst); \
278 MOVL t, 1*4+off+512(dst); \
279 MOVL t, 13*4+off+576(dst); \
281 MOVL t, 15*4+off+0(dst); \
282 MOVL t, 6*4+off+64(dst); \
283 MOVL t, 3*4+off+128(dst); \
284 MOVL t, 11*4+off+192(dst); \
285 MOVL t, 7*4+off+256(dst); \
286 MOVL t, 10*4+off+320(dst); \
287 MOVL t, 5*4+off+384(dst); \
288 MOVL t, 9*4+off+448(dst); \
289 MOVL t, 4*4+off+512(dst); \
290 MOVL t, 8*4+off+576(dst)
// func hashBlocksSSE2(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
293 TEXT ·hashBlocksSSE2(SB), 0, $672-24 // frame = 656 + 16 byte alignment
297 MOVL blocks_base+12(FP), SI
298 MOVL blocks_len+16(FP), DX
316 MOVOU counter<>(SB), X2
329 PRECOMPUTE(SP, 16, SI, CX)
330 ROUND_SSE2(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3)
331 ROUND_SSE2(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3)
332 ROUND_SSE2(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3)
333 ROUND_SSE2(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3)
334 ROUND_SSE2(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3)
335 ROUND_SSE2(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3)
336 ROUND_SSE2(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3)
337 ROUND_SSE2(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3)
338 ROUND_SSE2(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3)
339 ROUND_SSE2(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3)
// func hashBlocksSSSE3(h *[8]uint32, c *[2]uint32, flag uint32, blocks []byte)
362 TEXT ·hashBlocksSSSE3(SB), 0, $704-24 // frame = 688 + 16 byte alignment
366 MOVL blocks_base+12(FP), SI
367 MOVL blocks_len+16(FP), DX
385 MOVOU counter<>(SB), X2
400 MOVOU rol16<>(SB), X0
403 PRECOMPUTE(SP, 16, SI, CX)
404 ROUND_SSSE3(X4, X5, X6, X7, 16(SP), 32(SP), 48(SP), 64(SP), X3, X0, X1)
405 ROUND_SSSE3(X4, X5, X6, X7, 16+64(SP), 32+64(SP), 48+64(SP), 64+64(SP), X3, X0, X1)
406 ROUND_SSSE3(X4, X5, X6, X7, 16+128(SP), 32+128(SP), 48+128(SP), 64+128(SP), X3, X0, X1)
407 ROUND_SSSE3(X4, X5, X6, X7, 16+192(SP), 32+192(SP), 48+192(SP), 64+192(SP), X3, X0, X1)
408 ROUND_SSSE3(X4, X5, X6, X7, 16+256(SP), 32+256(SP), 48+256(SP), 64+256(SP), X3, X0, X1)
409 ROUND_SSSE3(X4, X5, X6, X7, 16+320(SP), 32+320(SP), 48+320(SP), 64+320(SP), X3, X0, X1)
410 ROUND_SSSE3(X4, X5, X6, X7, 16+384(SP), 32+384(SP), 48+384(SP), 64+384(SP), X3, X0, X1)
411 ROUND_SSSE3(X4, X5, X6, X7, 16+448(SP), 32+448(SP), 48+448(SP), 64+448(SP), X3, X0, X1)
412 ROUND_SSSE3(X4, X5, X6, X7, 16+512(SP), 32+512(SP), 48+512(SP), 64+512(SP), X3, X0, X1)
413 ROUND_SSSE3(X4, X5, X6, X7, 16+576(SP), 32+576(SP), 48+576(SP), 64+576(SP), X3, X0, X1)
// func supportSSSE3() bool
438 TEXT ·supportSSSE3(SB), 4, $0-1
442 ANDL $0x1, BX // supports SSE3
444 ANDL $0x200, CX // supports SSSE3
// func supportSSE2() bool
454 TEXT ·supportSSE2(SB), 4, $0-1
458 ANDL $1, DX // DX != 0 if support SSE2