1 .file "cpu_features.sx"
3 * Initialization procedure for identification of CPU supported features.
7 * Written by Keith Marshall <keith@users.osdn.me>
8 * Copyright (C) 2017, MinGW.org Project
10 * Adapted from an original C language implementation.
11 * Written by Danny Smith <dannysmith@users.sourceforge.net>
12 * Copyright (C) 2006, 2008, 2009, MinGW.org Project
15 * Permission is hereby granted, free of charge, to any person obtaining a
16 * copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
22 * The above copyright notice and this permission notice (including the next
23 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
26 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
27 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
28 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
29 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
30 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
31 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
32 * DEALINGS IN THE SOFTWARE.
35 .intel_syntax noprefix
37 #include "cpu_features.h"
/* Single-bit mask helpers: RX_FLAG(n) yields bit n, as tested in a full
 * 32-bit register or in its low byte (CL, DL), while RH_FLAG(n) rebases
 * a bit index in the range 8..15 by eight, so that the same CPUID bit may
 * be tested through the corresponding high-byte register (CH, DH).
 */
39 #define RX_FLAG(BITNUM) (1 << (BITNUM))
40 #define RH_FLAG(BITNUM) (1 << ((BITNUM) - 8))
42 #define CPUID_FLAG RX_FLAG(21) /* EFLAGS bit 21 */
/* Parameters for the FXSAVE probe in ___cpu_features_init:
 *   FXSAVE_BUFSIZ     is subtracted from ESP to reserve the save buffer.
 *   FXSAVE_BUF_ALIGN  is ANDed with ESP, clearing its four low-order bits
 *                     to place the buffer on a sixteen byte boundary.
 *   FXTEST_BITS       is the pattern XORed into the dword at offset 200 of
 *                     the buffer, and later compared with the bits which
 *                     were found to have changed.
 */
44 #define FXSAVE_BUFSIZ 512
45 #define FXSAVE_BUF_ALIGN 0xFFFFFFF0
46 #define FXTEST_BITS 0x0013C0DE
48 /* FIXME: is this optimization really worthwhile here? It breaks,
49 * with older GAS versions, (such as that commonly deployed in the
50 * GCC-3.4.5 era, and earlier)!
52 * GCC, (since GCC-4), emits "repz ret" rather than single-byte "ret",
53 * when optimizing with "generic" tuning, and the return opcode would
54 * otherwise become a branch destination, or is the fall-through for a
55 * conditional branch which is not taken. This opcode sequence, (which
56 * appears as if it should be illegal), is a work-around for an AMD K8,
57 * Athlon, and AMD10 family branch predictor bug; it is decoded as being
58 * effectively equivalent to a 2-byte "ret" instruction, (equivalent to
59 * preceding the "ret" with a "nop", but without incurring additional
60 * overhead to decode the "nop" instruction).
/* Exported feature word: four bytes of storage, aligned by .align 4, into
 * which ___cpu_features_init first stores the flags accumulated from the
 * standard CPUID level one tests, and then ORs those accumulated from the
 * extended feature tests.
 */
65 .globl ___cpu_features; .align 4
66 ___cpu_features: .space 4
/* ___cpu_features_init: probe the host CPU by means of the CPUID
 * instruction, and record the features found, as _CRT_xxx bit flags, in
 * the global ___cpu_features word.
 *
 * Target:   32-bit x86, GAS Intel syntax, symbol declared as an external
 *           function by the COFF .def, .scl and .type directives below.
 * Inputs:   none are read by the visible code.
 * Outputs:  ___cpu_features is written, (stored at local label 20, then
 *           ORed with the extended feature flags).
 * Regs:     EAX, ECX and EDX are used as scratch, EBX is pushed before the
 *           first CPUID, and the entry EFLAGS image is restored by popf
 *           once the CPUID_FLAG probe is complete.
 *
 * NOTE(review): this listing appears to have lost a number of short lines,
 * including the entry label itself, local labels 80 and 90, the body of
 * the chk macro, and the final ret, confirm against the original source.
 */
69 .globl ___cpu_features_init; .p2align 4,,15
70 .def ___cpu_features_init; .scl 2; .type 32; .endef
75 /* Initialization requires use of the CPUID instruction; to check if it is
76 * supported by the host CPU, we try to toggle the CPUID flag bit within the
77 * EFLAGS register, (ultimately leaving it unchanged).
 * Two copies of EFLAGS are pushed: the first remains on the stack until the
 * final popf restores it, while the second is popped into EAX, copied to
 * EDX for later comparison, and modified before being written back.
79 pushf /* save original flags state */
80 pushf /* duplicate them in both... */
81 pop eax /* ...the EAX, and... */
82 mov edx, eax /* ...the EDX registers */
83 xor eax, CPUID_FLAG /* flip the CPUID_FLAG bit */
84 push eax /* try to toggle the bit... */
85 popf /* ...within EFLAGS itself */
86 pushf /* capture the result... */
87 pop eax /* ...in the EAX register */
88 popf /* restore original flags */
90 /* The preceding code leaves all EFLAGS in their original state, as at
91 * procedure entry, with this state replicated in EDX, while EAX reflects
92 * their state after attempting to toggle the CPUID_FLAG bit; thus, if the
93 * CPU supports the CPUID instruction, EAX and EDX must now reflect
94 * differing states of this bit, and thus...
 * NOTE(review): local label 90, the target of the early exit taken below
 * when the bit could not be toggled, is not visible in this listing.
96 xor eax, edx /* isolate CPUID_FLAG state */
97 test eax, CPUID_FLAG /* did it change? */
98 je 90f /* no: quit immediately */
100 /* If we're still here, then we may safely interrogate the CPU, using
101 * the CPUID instruction, to identify various CPU features which may, or
102 * may not, be supported, but first...
 * (the CFI directive which follows the push records that the CFA offset
 * has become 8, that is, four bytes more than the offset of 4 which the
 * closing CFI directive of this procedure records).
104 push ebx /* ...we MUST preserve this! */
106 .cfi_def_cfa_offset 8
109 /* First, we must perform a level zero CPUID enquiry, to determine the
110 * maximum level of interrogation which is supported.
 * CPUID overwrites EAX, EBX, ECX and EDX, (hence the need to preserve EBX
 * above), and the level zero enquiry reports the maximum level in EAX.
 * NOTE(review): local label 80, the target of the exit taken below when no
 * further levels are supported, is not visible in this listing.
112 xor eax, eax /* zero request level code */
113 cpuid /* get max supported level */
114 test eax, eax /* is it greater than zero? */
115 je 80f /* no: we can do no more */
117 /* If we're still here, we may progress to a level one (supported features)
 * enquiry, the results of which are reported in ECX and EDX.
120 mov eax, 1 /* select level one enquiry */
121 cpuid /* get level one response */
123 /* Evaluate CPU capabilities (available features), accumulating flags for
124 * each in EAX, for eventual update of the global ___cpu_features variable.
 * Clearing EAX at this point loses nothing, since the level one feature
 * bits which are to be tested are held in ECX and EDX.
126 xor eax, eax /* start with a clean slate */
128 /* The CPUID level one features, in which we are interested, are reported
129 * in the ECX and EDX registers, using the following single bit flags for
130 * each feature; (note that, for code size efficiency, for flags expressed
131 * using bits 0..7 we interrogate only CL or DL, and for bits 8..15, only
132 * the CH or DH sub-registers, as appropriate).
 * Each of the macros which follow expands to a register and mask operand
 * pair, and CPUID_CAP(FLG) appends the matching _CRT_FLG value, to yield
 * the first three arguments of the chk macro, (rx, cond, and feature).
 * NOTE(review): only the header line of the chk macro is visible in this
 * listing, its body is presumably a test of rx against cond, a branch to
 * the label given by next when the test fails, and an OR of feature into
 * EAX otherwise, confirm against the original source.
134 #define CPUID_SSE3_FLAG cl, RX_FLAG(0)
135 #define CPUID_CMPXCHG16B_FLAG ch, RH_FLAG(13)
137 #define CPUID_CMPXCHG8B_FLAG dh, RH_FLAG(8)
138 #define CPUID_CMOV_FLAG dh, RH_FLAG(15)
140 #define CPUID_MMX_FLAG edx, RX_FLAG(23)
141 #define CPUID_FXSR_FLAG edx, RX_FLAG(24)
142 #define CPUID_SSE_FLAG edx, RX_FLAG(25)
143 #define CPUID_SSE2_FLAG edx, RX_FLAG(26)
145 .macro chk rx, cond, feature, next=15f
151 #define CPUID_CAP(FLG) CPUID_##FLG##_FLAG, _CRT_##FLG
153 chk CPUID_CAP(CMPXCHG8B)
154 chk CPUID_CAP(CMPXCHG16B)
158 /* Even if CPUID feature tests indicate that SSE instructions are available,
159 * the underlying operating system may not support them, and any attempt to
160 * use them may raise unhandled exceptions; (this is most likely to arise in
161 * the case of a legacy version of Windows, running on modern hardware). To
162 * avoid this issue, provided that the FXSAVE and FXRSTOR instructions are
163 * supported, we may use them to predict the likelihood of this issue
164 * arising, and consequently bypass SSE detection.
 * The FXSR test below passes local label 20 as the next argument of chk,
 * in place of the default, so that, presumably, an absent FXSR capability
 * skips both the FXSAVE probe and the SSE detection which depends on it.
166 chk CPUID_CAP(FXSR), 20f
168 /* We must create a local stack frame, with the stack pointer aligned to a
169 * sixteen byte boundary, in which to allocate an FXSAVE buffer; (failure to
170 * align this correctly will raise an unhandled exception, and GCC cannot be
171 * trusted to get this right in C language code).
 * NOTE(review): only the reservation and alignment of the buffer are
 * visible here, no instruction which saves the original ESP, nor any which
 * later discards the frame, appears in this listing, (the embedded listing
 * numbers jump from 171 to 179), confirm against the original source.
179 sub esp, FXSAVE_BUFSIZ
180 and esp, FXSAVE_BUF_ALIGN
182 /* Save the FPU state, and immediately attempt to restore it with some of
183 * the SSE specific control flags inverted.
 * NOTE(review): the FXSAVE and FXRSTOR instructions described by this and
 * the following three comments are not visible in this listing, (whose
 * embedded numbering skips the lines concerned), only the accesses to the
 * dword at buffer offset 200 remain, confirm against the original source.
 * NOTE(review): in the FXSAVE image layout documented by Intel, offset 200
 * appears to fall within the XMM2 save slot, rather than within MXCSR, so
 * "control flags" may be loose wording, confirm.
186 mov ebx, DWORD PTR 200[esp]
187 xor DWORD PTR 200[esp], FXTEST_BITS
190 /* Return the FXSAVE buffer to its original state, then overwrite it with
191 * the state just restored.
193 mov DWORD PTR 200[esp], ebx
196 /* Explicitly restore the original FPU state, while noting (in EBX) the
197 * state of those SSE control flags, as retrieved from the FPU itself,
198 * after the attempt to change them.
200 xchg DWORD PTR 200[esp], ebx
203 /* Check if the operating system actually allowed the requested change of
204 * the SSE control flags, then discard the local stack frame.
 * After the XOR below, EBX holds exactly those bits which differ between
 * the original and the read-back values, so that equality with FXTEST_BITS
 * means that every one of the inverted bits survived the round trip.
206 xor ebx, DWORD PTR 200[esp]
212 cmp ebx, FXTEST_BITS /* SSE flags were changed? */
213 jne 20f /* no: skip SSE detection */
215 /* If we're still here, then the operating system should support SSE;
216 * proceed to check whether the CPU does so.
 * NOTE(review): the SSE feature tests which this comment introduces are
 * not visible in this listing, although the CPUID_SSE_FLAG, CPUID_SSE2_FLAG
 * and CPUID_SSE3_FLAG macros are defined above, confirm against the
 * original source.
222 /* Before we move on to extended feature tests, we must store the feature
223 * test flags which we have accumulated so far...
225 20: mov DWORD PTR ___cpu_features, eax
227 /* ...so that EAX becomes available for us to, first confirm that extended
228 * feature tests are supported...
 * (the comparison below is unsigned, and the exit is taken unless the
 * maximum extended level reported in EAX exceeds 0x80000000).
230 mov eax, 0x80000000 /* select extended features */
231 cpuid /* get maximum support level */
232 cmp eax, 0x80000000 /* extended features okay? */
233 jbe 80f /* no: exit now */
235 /* ...and, when so, request the extended feature test flags.
237 mov eax, 0x80000001 /* select extended level 1 */
238 cpuid /* get extended features */
240 /* Initially, we will accumulate the extended feature flags, in which we are
241 * interested, separately from those already accumulated, so...
243 xor eax, eax /* ...clean the slate again */
245 /* Of the extended feature tests, we are interested in the following:
247 #define CPUID_3DNOWP_FLAG edx, RX_FLAG(30)
248 #define CPUID_3DNOW_FLAG edx, RX_FLAG(31)
250 /* Since the CPUID_3DNOW_FLAG maps directly to the sign bit of EDX, rather
251 * than the obvious feature test:
253 * chk CPUID_CAP(3DNOW)
255 * it is more efficient to use...
257 test edx, edx /* is the sign bit set? */
258 jns 30f /* no: we don't have 3DNOW */
259 mov eax, _CRT_3DNOW /* yes: note that we do */
261 /* ...whereas, for other extended feature tests, we revert to use of
 * the chk macro.
264 30: chk CPUID_CAP(3DNOWP)
266 /* Finally, we combine the extended feature test flags with those which we
267 * had previously accumulated from the regular feature tests, before...
269 or DWORD PTR ___cpu_features, eax
271 /* ...we restore the preserved state of the EBX register...
 * NOTE(review): neither the pop of EBX, nor local label 80 to which the
 * early exits branch, is visible in this listing, only the CFI directive
 * which resets the CFA offset to 4 remains, confirm against the original
 * source.
276 .cfi_def_cfa_offset 4
278 /* ...and return to the C runtime initialization procedure.
284 /* $RCSfile$: end of file */