//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation ------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
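  // Pointer-sized integer type for this target (i32 in 32-bit mode, i64 in
  // 64-bit mode); used below whenever an action depends on the pointer width.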
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS , MVT::i16 , Custom);
    setOperationAction(ISD::ABS , MVT::i32 , Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS , MVT::i64 , Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
  setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);

    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
  setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
    setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
  setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else {
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
    }

    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
    setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
  setOperationAction(ISD::BR_JT , MVT::Other, Expand);
  setOperationAction(ISD::BRCOND , MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
  setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);

  setOperationAction(ISD::FREM , MVT::f32 , Expand);
  setOperationAction(ISD::FREM , MVT::f64 , Expand);
  setOperationAction(ISD::FREM , MVT::f80 , Expand);
  setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);

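  // Population count: with POPCNT the i8 form is promoted to i32 for a
  // shorter encoding; without POPCNT we fall back to the generic expansion.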
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
  }

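  // READCYCLECOUNTER is custom lowered so it can be expanded to RDTSC.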
  setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP , MVT::i16 , Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT , MVT::i1 , Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool , VT, Custom);
    setOperationAction(ISD::JumpTable , VT, Custom);
    setOperationAction(ISD::GlobalAddress , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol , VT, Custom);
    setOperationAction(ISD::BlockAddress , VT, Custom);
  }
  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH , MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

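  // A 128-bit compare-and-swap is only available with the CMPXCHG16B
  // instruction.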
  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART , MVT::Other, Custom);
  setOperationAction(ISD::VAEND , MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN , VT, Expand);
      setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS , MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG , MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN , MVT::f32, Expand);
    setOperationAction(ISD::FCOS , MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN , MVT::f64, Expand);
    setOperationAction(ISD::FCOS , MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN , VT, Expand);
      setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS , MVT::f128, Custom);
      setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN , MVT::f80, Expand);
    setOperationAction(ISD::FCOS , MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW , MVT::f32 , Expand);
  setOperationAction(ISD::FPOW , MVT::f64 , Expand);
  setOperationAction(ISD::FPOW , MVT::f80 , Expand);

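  // These have no dedicated f80 lowering; Expand lets the legalizer turn them
  // into libcalls or generic expansions.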
  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

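    // Extending loads from v2f32 are legal: cvtps2pd widens two floats to two
    // doubles directly.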
    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::SINT_TO_FP, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::UINT_TO_FP, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
    }

    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

    if (!Subtarget.hasVLX()) {
      // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
      // to 512-bit rather than use the AVX2 instructions so that we can use
      // k-masks.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
    }

    // Need to promote to 64-bit even though we have 32-bit masked instructions
    // because the IR optimizers rearrange bitcasts around logic ops leaving
    // too many variations to handle if we don't promote them.
    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1313 if (Subtarget.hasDQI()) {
1314 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1315 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1316 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1317 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1320 if (Subtarget.hasCDI()) {
1321 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1322 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1323 setOperationAction(ISD::CTLZ, VT, Legal);
1324 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1326 } // Subtarget.hasCDI()
1328 if (Subtarget.hasVPOPCNTDQ()) {
1329 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1330 setOperationAction(ISD::CTPOP, VT, Legal);
1333 // Extract subvector is special because the value type
1334 // (result) is 256-bit but the source is 512-bit wide.
1335 // 128-bit was made Legal under AVX1.
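// For example (illustrative, assuming a v16i32 value %z is live here): a
// (v8i32 (extract_subvector %z, 8)) node is already legal and is expected to
// map to a 256-bit extract such as VEXTRACTI64X4, with no custom lowering.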
1336 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1337 MVT::v8f32, MVT::v4f64 })
1338 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1340 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1341 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1343 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1344 setOperationAction(ISD::VSELECT, VT, Custom);
1345 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1346 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1347 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1348 setOperationAction(ISD::MLOAD, VT, Legal);
1349 setOperationAction(ISD::MSTORE, VT, Legal);
1350 setOperationAction(ISD::MGATHER, VT, Custom);
1351 setOperationAction(ISD::MSCATTER, VT, Custom);
1353 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1354 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1355 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1359 if (!Subtarget.useSoftFloat() &&
1360 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1361 // These operations are handled on non-VLX by artificially widening in
1362 // isel patterns.
1363 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1365 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1366 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1367 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1368 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1369 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1371 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1372 setOperationAction(ISD::SMAX, VT, Legal);
1373 setOperationAction(ISD::UMAX, VT, Legal);
1374 setOperationAction(ISD::SMIN, VT, Legal);
1375 setOperationAction(ISD::UMIN, VT, Legal);
1376 setOperationAction(ISD::ABS, VT, Legal);
1379 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1380 setOperationAction(ISD::ROTL, VT, Custom);
1381 setOperationAction(ISD::ROTR, VT, Custom);
1384 // Custom legalize 2x32 to get a little better code.
1385 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1386 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1388 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1389 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1390 setOperationAction(ISD::MSCATTER, VT, Custom);
1392 if (Subtarget.hasDQI()) {
1393 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1394 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1395 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1396 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1397 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1401 if (Subtarget.hasCDI()) {
1402 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1403 setOperationAction(ISD::CTLZ, VT, Legal);
1404 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1406 } // Subtarget.hasCDI()
1408 if (Subtarget.hasVPOPCNTDQ()) {
1409 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1410 setOperationAction(ISD::CTPOP, VT, Legal);
1414 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1415 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1416 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1418 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1419 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1421 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1422 setOperationAction(ISD::ADD, VT, Custom);
1423 setOperationAction(ISD::SUB, VT, Custom);
1424 setOperationAction(ISD::MUL, VT, Custom);
1425 setOperationAction(ISD::VSELECT, VT, Expand);
1427 setOperationAction(ISD::TRUNCATE, VT, Custom);
1428 setOperationAction(ISD::SETCC, VT, Custom);
1429 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1430 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1431 setOperationAction(ISD::SELECT, VT, Custom);
1432 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1433 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1436 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1438 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1439 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1440 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1441 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1443 // Extends from v32i1 masks to 256-bit vectors.
1444 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1445 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1446 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1447 // Extends from v64i1 masks to 512-bit vectors.
1448 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1449 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1450 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1452 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1453 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1454 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1455 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1456 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1457 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1458 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1459 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1460 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1461 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1462 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1463 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1464 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1465 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1466 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1467 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1468 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1469 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1470 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1471 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1472 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1473 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1474 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1478 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1480 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1481 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1482 setOperationAction(ISD::VSELECT, VT, Custom);
1483 setOperationAction(ISD::ABS, VT, Legal);
1484 setOperationAction(ISD::SRL, VT, Custom);
1485 setOperationAction(ISD::SHL, VT, Custom);
1486 setOperationAction(ISD::SRA, VT, Custom);
1487 setOperationAction(ISD::MLOAD, VT, Legal);
1488 setOperationAction(ISD::MSTORE, VT, Legal);
1489 setOperationAction(ISD::CTPOP, VT, Custom);
1490 setOperationAction(ISD::CTTZ, VT, Custom);
1491 setOperationAction(ISD::CTLZ, VT, Custom);
1492 setOperationAction(ISD::SMAX, VT, Legal);
1493 setOperationAction(ISD::UMAX, VT, Legal);
1494 setOperationAction(ISD::SMIN, VT, Legal);
1495 setOperationAction(ISD::UMIN, VT, Legal);
1497 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1498 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1499 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1502 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1503 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1506 if (Subtarget.hasBITALG()) {
1507 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1508 setOperationAction(ISD::CTPOP, VT, Legal);
1512 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
1513 (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
1514 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1515 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1516 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1519 // These operations are handled on non-VLX by artificially widening in
1520 // isel patterns.
1521 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1523 if (Subtarget.hasBITALG()) {
1524 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1525 setOperationAction(ISD::CTPOP, VT, Legal);
1529 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1530 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1531 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1532 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1533 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1534 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1536 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1537 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1538 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1539 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1540 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1542 if (Subtarget.hasDQI()) {
1543 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1544 // v2f32 UINT_TO_FP is already custom under SSE2.
1545 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1546 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1547 "Unexpected operation action!");
1548 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1549 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1550 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1553 if (Subtarget.hasBWI()) {
1554 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1555 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1559 // We want to custom lower some of our intrinsics.
1560 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1561 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1562 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1563 if (!Subtarget.is64Bit()) {
1564 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1565 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1568 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1569 // handle type legalization for these operations here.
1571 // FIXME: We really should do custom legalization for addition and
1572 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1573 // than generic legalization for 64-bit multiplication-with-overflow, though.
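// Rough sketch of what the custom lowering below produces: an overflow
// intrinsic such as
//   %r = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
// becomes an X86 add that also defines EFLAGS, with the i1 overflow bit
// materialized from the carry flag (e.g. via SETB) instead of being
// recomputed with an extra compare.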
1574 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1575 if (VT == MVT::i64 && !Subtarget.is64Bit())
1576 continue;
1577 // Add/Sub/Mul with overflow operations are custom lowered.
1578 setOperationAction(ISD::SADDO, VT, Custom);
1579 setOperationAction(ISD::UADDO, VT, Custom);
1580 setOperationAction(ISD::SSUBO, VT, Custom);
1581 setOperationAction(ISD::USUBO, VT, Custom);
1582 setOperationAction(ISD::SMULO, VT, Custom);
1583 setOperationAction(ISD::UMULO, VT, Custom);
1585 // Support carry in as value rather than glue.
1586 setOperationAction(ISD::ADDCARRY, VT, Custom);
1587 setOperationAction(ISD::SUBCARRY, VT, Custom);
1588 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1591 if (!Subtarget.is64Bit()) {
1592 // These libcalls are not available in 32-bit.
1593 setLibcallName(RTLIB::SHL_I128, nullptr);
1594 setLibcallName(RTLIB::SRL_I128, nullptr);
1595 setLibcallName(RTLIB::SRA_I128, nullptr);
1596 setLibcallName(RTLIB::MUL_I128, nullptr);
1599 // Combine sin / cos into _sincos_stret if it is available.
1600 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1601 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1602 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1603 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1606 if (Subtarget.isTargetWin64()) {
1607 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1608 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1609 setOperationAction(ISD::SREM, MVT::i128, Custom);
1610 setOperationAction(ISD::UREM, MVT::i128, Custom);
1611 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1612 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1615 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1616 // is. We should promote the value to 64-bits to solve this.
1617 // This is what the CRT headers do - `fmodf` is an inline header
1618 // function casting to f64 and calling `fmod`.
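// In effect (illustrative only), a call such as fmodf(x, y) on these targets
// is emitted as (float)fmod((double)x, (double)y), matching what the CRT's
// inline fmodf wrapper would do anyway.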
1619 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1620 Subtarget.isTargetWindowsItanium()))
1621 for (ISD::NodeType Op :
1622 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1623 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1624 if (isOperationExpand(Op, MVT::f32))
1625 setOperationAction(Op, MVT::f32, Promote);
1627 // We have target-specific dag combine patterns for the following nodes:
1628 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1629 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1630 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1631 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1632 setTargetDAGCombine(ISD::BITCAST);
1633 setTargetDAGCombine(ISD::VSELECT);
1634 setTargetDAGCombine(ISD::SELECT);
1635 setTargetDAGCombine(ISD::SHL);
1636 setTargetDAGCombine(ISD::SRA);
1637 setTargetDAGCombine(ISD::SRL);
1638 setTargetDAGCombine(ISD::OR);
1639 setTargetDAGCombine(ISD::AND);
1640 setTargetDAGCombine(ISD::ADD);
1641 setTargetDAGCombine(ISD::FADD);
1642 setTargetDAGCombine(ISD::FSUB);
1643 setTargetDAGCombine(ISD::FNEG);
1644 setTargetDAGCombine(ISD::FMA);
1645 setTargetDAGCombine(ISD::FMINNUM);
1646 setTargetDAGCombine(ISD::FMAXNUM);
1647 setTargetDAGCombine(ISD::SUB);
1648 setTargetDAGCombine(ISD::LOAD);
1649 setTargetDAGCombine(ISD::MLOAD);
1650 setTargetDAGCombine(ISD::STORE);
1651 setTargetDAGCombine(ISD::MSTORE);
1652 setTargetDAGCombine(ISD::TRUNCATE);
1653 setTargetDAGCombine(ISD::ZERO_EXTEND);
1654 setTargetDAGCombine(ISD::ANY_EXTEND);
1655 setTargetDAGCombine(ISD::SIGN_EXTEND);
1656 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1657 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1658 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1659 setTargetDAGCombine(ISD::SINT_TO_FP);
1660 setTargetDAGCombine(ISD::UINT_TO_FP);
1661 setTargetDAGCombine(ISD::SETCC);
1662 setTargetDAGCombine(ISD::MUL);
1663 setTargetDAGCombine(ISD::XOR);
1664 setTargetDAGCombine(ISD::MSCATTER);
1665 setTargetDAGCombine(ISD::MGATHER);
1667 computeRegisterProperties(Subtarget.getRegisterInfo());
1669 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1670 MaxStoresPerMemsetOptSize = 8;
1671 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1672 MaxStoresPerMemcpyOptSize = 4;
1673 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1674 MaxStoresPerMemmoveOptSize = 4;
1676 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1677 // that needs to be benchmarked and balanced with the potential use of vector
1678 // load/store types (PR33329, PR33914).
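// For example, with MaxLoadsPerMemcmp == 2 a memcmp(a, b, 16) on a 64-bit
// target can be expanded by CGP into two 8-byte loads from each buffer plus
// compares instead of a library call (a sketch of the intent, not a guarantee
// for every subtarget).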
1679 MaxLoadsPerMemcmp = 2;
1680 MaxLoadsPerMemcmpOptSize = 2;
1682 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1683 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
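// With the default value of 4 this requests 2^4 = 16-byte alignment for loop
// headers; passing e.g. 5 on the command line would request 32-byte alignment.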
1685 // An out-of-order CPU can speculatively execute past a predictable branch,
1686 // but a conditional move could be stalled by an expensive earlier operation.
1687 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1688 EnableExtLdPromotion = true;
1689 setPrefFunctionAlignment(4); // 2^4 bytes.
1691 verifyIntrinsicTables();
1694 // This has so far only been implemented for 64-bit MachO.
1695 bool X86TargetLowering::useLoadStackGuardNode() const {
1696 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1699 bool X86TargetLowering::useStackGuardXorFP() const {
1700 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1701 return Subtarget.getTargetTriple().isOSMSVCRT();
1704 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1705 const SDLoc &DL) const {
1706 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1707 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1708 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1709 return SDValue(Node, 0);
1712 TargetLoweringBase::LegalizeTypeAction
1713 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1714 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1715 return TypeSplitVector;
1717 if (ExperimentalVectorWideningLegalization &&
1718 VT.getVectorNumElements() != 1 &&
1719 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1720 return TypeWidenVector;
1722 return TargetLoweringBase::getPreferredVectorAction(VT);
1725 MVT X86TargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
1726 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1728 return TargetLowering::getRegisterTypeForCallingConv(VT);
1731 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1733 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1735 return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
1738 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1740 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1742 return TargetLowering::getNumRegistersForCallingConv(Context, VT);
1745 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1746 LLVMContext& Context,
1751 if (Subtarget.hasAVX512()) {
1752 const unsigned NumElts = VT.getVectorNumElements();
1754 // Figure out what this type will be legalized to.
1756 while (getTypeAction(Context, LegalVT) != TypeLegal)
1757 LegalVT = getTypeToTransformTo(Context, LegalVT);
1759 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1760 if (LegalVT.getSimpleVT().is512BitVector())
1761 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1763 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1764 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1765 // compare for vXi32/vXi64 for sure. If we have BWI we will also support a
1766 // vXi1 compare for vXi16/vXi8.
1767 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1768 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1769 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1773 return VT.changeVectorElementTypeToInteger();
1776 /// Helper for getByValTypeAlignment to determine
1777 /// the desired ByVal argument alignment.
1778 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1781 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1782 if (VTy->getBitWidth() == 128)
1784 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1785 unsigned EltAlign = 0;
1786 getMaxByValAlign(ATy->getElementType(), EltAlign);
1787 if (EltAlign > MaxAlign)
1788 MaxAlign = EltAlign;
1789 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1790 for (auto *EltTy : STy->elements()) {
1791 unsigned EltAlign = 0;
1792 getMaxByValAlign(EltTy, EltAlign);
1793 if (EltAlign > MaxAlign)
1794 MaxAlign = EltAlign;
1801 /// Return the desired alignment for ByVal aggregate
1802 /// function arguments in the caller parameter area. For X86, aggregates
1803 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1804 /// are at 4-byte boundaries.
1805 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1806 const DataLayout &DL) const {
1807 if (Subtarget.is64Bit()) {
1808 // Max of 8 and alignment of type.
1809 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1816 if (Subtarget.hasSSE1())
1817 getMaxByValAlign(Ty, Align);
1821 /// Returns the target specific optimal type for load
1822 /// and store operations as a result of memset, memcpy, and memmove
1823 /// lowering. If DstAlign is zero, it is safe to assume the destination
1824 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
1825 /// is no need to check it against an alignment requirement,
1826 /// probably because the source does not need to be loaded. If 'IsMemset' is
1827 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1828 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1829 /// source is constant so it does not need to be loaded.
1830 /// It returns EVT::Other if the type should be determined using generic
1831 /// target-independent logic.
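/// For example (a sketch of the common cases, not an exhaustive contract):
/// a large memset on an AVX subtarget with fast unaligned accesses is
/// typically given a 256-bit vector type, an SSE2 subtarget a 128-bit one,
/// and targets without usable vectors or with slow misaligned accesses fall
/// back to i64/i32.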
1833 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1834 unsigned DstAlign, unsigned SrcAlign,
1835 bool IsMemset, bool ZeroMemset,
1837 MachineFunction &MF) const {
1838 const Function &F = MF.getFunction();
1839 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1841 (!Subtarget.isUnalignedMem16Slow() ||
1842 ((DstAlign == 0 || DstAlign >= 16) &&
1843 (SrcAlign == 0 || SrcAlign >= 16)))) {
1844 // FIXME: Check if unaligned 32-byte accesses are slow.
1845 if (Size >= 32 && Subtarget.hasAVX()) {
1846 // Although this isn't a well-supported type for AVX1, we'll let
1847 // legalization and shuffle lowering produce the optimal codegen. If we
1848 // choose an optimal type with a vector element larger than a byte,
1849 // getMemsetStores() may create an intermediate splat (using an integer
1850 // multiply) before we splat as a vector.
1853 if (Subtarget.hasSSE2())
1855 // TODO: Can SSE1 handle a byte vector?
1856 if (Subtarget.hasSSE1())
1858 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1859 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1860 // Do not use f64 to lower memcpy if source is string constant. It's
1861 // better to use i32 to avoid the loads.
1862 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1863 // The gymnastics of splatting a byte value into an XMM register and then
1864 // only using 8-byte stores (because this is a CPU with slow unaligned
1865 // 16-byte accesses) makes that a loser.
1869 // This is a compromise. If we reach here, unaligned accesses may be slow on
1870 // this target. However, creating smaller, aligned accesses could be even
1871 // slower and would certainly be a lot more code.
1872 if (Subtarget.is64Bit() && Size >= 8)
1877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1878 if (VT == MVT::f32)
1879 return X86ScalarSSEf32;
1880 else if (VT == MVT::f64)
1881 return X86ScalarSSEf64;
1886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1891 switch (VT.getSizeInBits()) {
1892 default:
1893 // 8-byte and under are always assumed to be fast.
1894 *Fast = true;
1895 break;
1896 case 128:
1897 *Fast = !Subtarget.isUnalignedMem16Slow();
1898 break;
1899 case 256:
1900 *Fast = !Subtarget.isUnalignedMem32Slow();
1901 break;
1902 // TODO: What about AVX-512 (512-bit) accesses?
1905 // Misaligned accesses of any size are always allowed.
1909 /// Return the entry encoding for a jump table in the
1910 /// current function. The returned value is a member of the
1911 /// MachineJumpTableInfo::JTEntryKind enum.
1912 unsigned X86TargetLowering::getJumpTableEncoding() const {
1913 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1914 // symbol.
1915 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1916 return MachineJumpTableInfo::EK_Custom32;
1918 // Otherwise, use the normal jump table encoding heuristics.
1919 return TargetLowering::getJumpTableEncoding();
1922 bool X86TargetLowering::useSoftFloat() const {
1923 return Subtarget.useSoftFloat();
1926 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1927 ArgListTy &Args) const {
1929 // Only relabel X86-32 for C / Stdcall CCs.
1930 if (Subtarget.is64Bit())
1932 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1934 unsigned ParamRegs = 0;
1935 if (auto *M = MF->getFunction().getParent())
1936 ParamRegs = M->getNumberRegisterParameters();
1938 // Mark the first N integer arguments as being passed in registers (inreg).
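// For example (illustrative), if the module was built with -mregparm=2, a
// libcall taking (i32, i32, i32) gets its first two arguments marked inreg
// below and the third stays on the stack.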
1939 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1940 Type *T = Args[Idx].Ty;
1941 if (T->isPointerTy() || T->isIntegerTy())
1942 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1943 unsigned numRegs = 1;
1944 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1946 if (ParamRegs < numRegs)
1948 ParamRegs -= numRegs;
1949 Args[Idx].IsInReg = true;
1955 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1956 const MachineBasicBlock *MBB,
1957 unsigned uid,MCContext &Ctx) const{
1958 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1959 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1960 // entries.
1961 return MCSymbolRefExpr::create(MBB->getSymbol(),
1962 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1965 /// Returns relocation base for the given PIC jumptable.
1966 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1967 SelectionDAG &DAG) const {
1968 if (!Subtarget.is64Bit())
1969 // This doesn't have SDLoc associated with it, but is not really the
1970 // same as a Register.
1971 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1972 getPointerTy(DAG.getDataLayout()));
1976 /// This returns the relocation base for the given PIC jumptable,
1977 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1978 const MCExpr *X86TargetLowering::
1979 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1980 MCContext &Ctx) const {
1981 // X86-64 uses RIP relative addressing based on the jump table label.
1982 if (Subtarget.isPICStyleRIPRel())
1983 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1985 // Otherwise, the reference is relative to the PIC base.
1986 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1989 std::pair<const TargetRegisterClass *, uint8_t>
1990 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1992 const TargetRegisterClass *RRC = nullptr;
1994 switch (VT.SimpleTy) {
1996 return TargetLowering::findRepresentativeClass(TRI, VT);
1997 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1998 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2001 RRC = &X86::VR64RegClass;
2003 case MVT::f32: case MVT::f64:
2004 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2005 case MVT::v4f32: case MVT::v2f64:
2006 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2007 case MVT::v8f32: case MVT::v4f64:
2008 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2009 case MVT::v16f32: case MVT::v8f64:
2010 RRC = &X86::VR128XRegClass;
2013 return std::make_pair(RRC, Cost);
2016 unsigned X86TargetLowering::getAddressSpace() const {
2017 if (Subtarget.is64Bit())
2018 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2022 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2023 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2024 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2027 static Constant* SegmentOffset(IRBuilder<> &IRB,
2028 unsigned Offset, unsigned AddressSpace) {
2029 return ConstantExpr::getIntToPtr(
2030 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2031 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2034 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2035 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2036 // tcbhead_t; use it instead of the usual global variable (see
2037 // sysdeps/{i386,x86_64}/nptl/tls.h)
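// For example (a sketch; the exact IR spelling may differ): on x86-64 Linux
// the value returned below is the constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. %fs:0x28, which the caller then loads to obtain the stack guard.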
2038 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2039 if (Subtarget.isTargetFuchsia()) {
2040 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2041 return SegmentOffset(IRB, 0x10, getAddressSpace());
2043 // %fs:0x28, unless we're using a Kernel code model, in which case
2044 // it's %gs:0x28. On i386 it's %gs:0x14.
2045 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2046 return SegmentOffset(IRB, Offset, getAddressSpace());
2050 return TargetLowering::getIRStackGuard(IRB);
2053 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2054 // MSVC CRT provides functionalities for stack protection.
2055 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2056 // MSVC CRT has a global variable holding security cookie.
2057 M.getOrInsertGlobal("__security_cookie",
2058 Type::getInt8PtrTy(M.getContext()));
2060 // MSVC CRT has a function to validate security cookie.
2061 auto *SecurityCheckCookie = cast<Function>(
2062 M.getOrInsertFunction("__security_check_cookie",
2063 Type::getVoidTy(M.getContext()),
2064 Type::getInt8PtrTy(M.getContext())));
2065 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2066 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2069 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2070 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2072 TargetLowering::insertSSPDeclarations(M);
2075 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2076 // MSVC CRT has a global variable holding security cookie.
2077 if (Subtarget.getTargetTriple().isOSMSVCRT())
2078 return M.getGlobalVariable("__security_cookie");
2079 return TargetLowering::getSDagStackGuard(M);
2082 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2083 // MSVC CRT has a function to validate security cookie.
2084 if (Subtarget.getTargetTriple().isOSMSVCRT())
2085 return M.getFunction("__security_check_cookie");
2086 return TargetLowering::getSSPStackGuardCheck(M);
2089 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2090 if (Subtarget.getTargetTriple().isOSContiki())
2091 return getDefaultSafeStackPointerLocation(IRB, false);
2093 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2094 // definition of TLS_SLOT_SAFESTACK in
2095 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2096 if (Subtarget.isTargetAndroid()) {
2097 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2098 // %gs:0x48.
2099 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2100 return SegmentOffset(IRB, Offset, getAddressSpace());
2103 // Fuchsia is similar.
2104 if (Subtarget.isTargetFuchsia()) {
2105 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2106 return SegmentOffset(IRB, 0x18, getAddressSpace());
2109 return TargetLowering::getSafeStackPointerLocation(IRB);
2112 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2113 unsigned DestAS) const {
2114 assert(SrcAS != DestAS && "Expected different address spaces!");
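// Address spaces 256 (GS), 257 (FS) and 258 (SS) are segment-relative on x86,
// so only casts where both sides are in the flat (< 256) range are no-ops.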
2116 return SrcAS < 256 && DestAS < 256;
2119 //===----------------------------------------------------------------------===//
2120 // Return Value Calling Convention Implementation
2121 //===----------------------------------------------------------------------===//
2123 #include "X86GenCallingConv.inc"
2125 bool X86TargetLowering::CanLowerReturn(
2126 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2127 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2128 SmallVector<CCValAssign, 16> RVLocs;
2129 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2130 return CCInfo.CheckReturn(Outs, RetCC_X86);
2133 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2134 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2138 /// Lowers mask values (v*i1) to the local register values.
2139 /// \returns the DAG node after lowering to the register type.
2140 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2141 const SDLoc &Dl, SelectionDAG &DAG) {
2142 EVT ValVT = ValArg.getValueType();
2144 if (ValVT == MVT::v1i1)
2145 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2146 DAG.getIntPtrConstant(0, Dl));
2148 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2149 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2150 // Two stage lowering might be required
2151 // bitcast: v8i1 -> i8 / v16i1 -> i16
2152 // anyextend: i8 -> i32 / i16 -> i32
2153 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2154 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2155 if (ValLoc == MVT::i32)
2156 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2160 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2161 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2162 // One stage lowering is required
2163 // bitcast: v32i1 -> i32 / v64i1 -> i64
2164 return DAG.getBitcast(ValLoc, ValArg);
2167 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2170 /// Breaks a v64i1 value into two i32 registers and adds the new nodes to the DAG.
2171 static void Passv64i1ArgInRegs(
2172 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2173 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2174 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2175 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2176 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2177 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2178 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2179 "The value should reside in two registers");
2181 // Before splitting the value we cast it to i64
2182 Arg = DAG.getBitcast(MVT::i64, Arg);
2184 // Splitting the value into two i32 types
2186 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2187 DAG.getConstant(0, Dl, MVT::i32));
2188 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2189 DAG.getConstant(1, Dl, MVT::i32));
2191 // Pass the two i32 halves in their corresponding registers.
2192 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2193 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2197 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2199 const SmallVectorImpl<ISD::OutputArg> &Outs,
2200 const SmallVectorImpl<SDValue> &OutVals,
2201 const SDLoc &dl, SelectionDAG &DAG) const {
2202 MachineFunction &MF = DAG.getMachineFunction();
2203 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2205 // In some cases we need to disable registers from the default CSR list.
2206 // For example, when they are used for argument passing.
2207 bool ShouldDisableCalleeSavedRegister =
2208 CallConv == CallingConv::X86_RegCall ||
2209 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2211 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2212 report_fatal_error("X86 interrupts may not return any value");
2214 SmallVector<CCValAssign, 16> RVLocs;
2215 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2216 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2219 SmallVector<SDValue, 6> RetOps;
2220 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2221 // Operand #1 = Bytes To Pop
2222 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2225 // Copy the result values into the output registers.
2226 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2228 CCValAssign &VA = RVLocs[I];
2229 assert(VA.isRegLoc() && "Can only return in registers!");
2231 // Add the register to the CalleeSaveDisableRegs list.
2232 if (ShouldDisableCalleeSavedRegister)
2233 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2235 SDValue ValToCopy = OutVals[OutsIndex];
2236 EVT ValVT = ValToCopy.getValueType();
2238 // Promote values to the appropriate types.
2239 if (VA.getLocInfo() == CCValAssign::SExt)
2240 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2241 else if (VA.getLocInfo() == CCValAssign::ZExt)
2242 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2243 else if (VA.getLocInfo() == CCValAssign::AExt) {
2244 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2245 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2247 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2249 else if (VA.getLocInfo() == CCValAssign::BCvt)
2250 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2252 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2253 "Unexpected FP-extend for return value.");
2255 // If this is x86-64, and we disabled SSE, we can't return FP values,
2256 // or SSE or MMX vectors.
2257 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2258 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2259 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2260 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2261 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2262 } else if (ValVT == MVT::f64 &&
2263 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2264 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2265 // llvm-gcc has never done it right and no one has noticed, so this
2266 // should be OK for now.
2267 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2268 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2271 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2272 // the RET instruction and handled by the FP Stackifier.
2273 if (VA.getLocReg() == X86::FP0 ||
2274 VA.getLocReg() == X86::FP1) {
2275 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2276 // change the value to the FP stack register class.
2277 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2278 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2279 RetOps.push_back(ValToCopy);
2280 // Don't emit a copytoreg.
2284 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2285 // which is returned in RAX / RDX.
2286 if (Subtarget.is64Bit()) {
2287 if (ValVT == MVT::x86mmx) {
2288 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2289 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2290 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2292 // If we don't have SSE2 available, convert to v4f32 so the generated
2293 // register is legal.
2294 if (!Subtarget.hasSSE2())
2295 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2300 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2302 if (VA.needsCustom()) {
2303 assert(VA.getValVT() == MVT::v64i1 &&
2304 "Currently the only custom case is when we split v64i1 to 2 regs");
2306 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2309 assert(2 == RegsToPass.size() &&
2310 "Expecting two registers after Pass64BitArgInRegs");
2312 // Add the second register to the CalleeSaveDisableRegs list.
2313 if (ShouldDisableCalleeSavedRegister)
2314 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2316 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2319 // Add nodes to the DAG and add the values into the RetOps list
2320 for (auto &Reg : RegsToPass) {
2321 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2322 Flag = Chain.getValue(1);
2323 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2327 // The Swift calling convention does not require us to copy the sret argument
2328 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2330 // All x86 ABIs require that for returning structs by value we copy
2331 // the sret argument into %rax/%eax (depending on ABI) for the return.
2332 // We saved the argument into a virtual register in the entry block,
2333 // so now we copy the value out and into %rax/%eax.
2335 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2336 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2337 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2338 // either case FuncInfo->setSRetReturnReg() will have been called.
2339 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2340 // When we have both sret and another return value, we should use the
2341 // original Chain stored in RetOps[0], instead of the current Chain updated
2342 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2344 // For the case of sret and another return value, we have
2345 // Chain_0 at the function entry
2346 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2347 // If we use Chain_1 in getCopyFromReg, we will have
2348 // Val = getCopyFromReg(Chain_1)
2349 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2351 // getCopyToReg(Chain_0) will be glued together with
2352 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2353 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2354 // Data dependency from Unit B to Unit A due to usage of Val in
2355 // getCopyToReg(Chain_1, Val)
2356 // Chain dependency from Unit A to Unit B
2358 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2359 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2360 getPointerTy(MF.getDataLayout()));
2363 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2364 X86::RAX : X86::EAX;
2365 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2366 Flag = Chain.getValue(1);
2368 // RAX/EAX now acts like a return value.
2370 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2372 // Add the returned register to the CalleeSaveDisableRegs list.
2373 if (ShouldDisableCalleeSavedRegister)
2374 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2377 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2378 const MCPhysReg *I =
2379 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2382 if (X86::GR64RegClass.contains(*I))
2383 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2385 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2389 RetOps[0] = Chain; // Update chain.
2391 // Add the flag if we have it.
2393 RetOps.push_back(Flag);
2395 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2396 if (CallConv == CallingConv::X86_INTR)
2397 opcode = X86ISD::IRET;
2398 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2401 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2402 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2405 SDValue TCChain = Chain;
2406 SDNode *Copy = *N->use_begin();
2407 if (Copy->getOpcode() == ISD::CopyToReg) {
2408 // If the copy has a glue operand, we conservatively assume it isn't safe to
2409 // perform a tail call.
2410 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2412 TCChain = Copy->getOperand(0);
2413 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2416 bool HasRet = false;
2417 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2419 if (UI->getOpcode() != X86ISD::RET_FLAG)
2421 // If we are returning more than one value, we can definitely
2422 // not make a tail call; see PR19530.
2423 if (UI->getNumOperands() > 4)
2425 if (UI->getNumOperands() == 4 &&
2426 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2438 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2439 ISD::NodeType ExtendKind) const {
2440 MVT ReturnMVT = MVT::i32;
2442 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2443 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2444 // The ABI does not require i1, i8 or i16 to be extended.
2446 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2447 // always extending i8/i16 return values, so keep doing that for now.
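// For example, an i16 return value is widened to i32 when targeting Darwin,
// but is returned as i16 elsewhere since the ABI does not require the
// extension.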
2449 ReturnMVT = MVT::i8;
2452 EVT MinVT = getRegisterType(Context, ReturnMVT);
2453 return VT.bitsLT(MinVT) ? MinVT : VT;
2456 /// Reads two 32 bit registers and creates a 64 bit mask value.
2457 /// \param VA The current 32 bit value that needs to be assigned.
2458 /// \param NextVA The next 32 bit value that needs to be assigned.
2459 /// \param Root The parent DAG node.
2460 /// \param [in,out] InFlag Represents the SDValue used for glue purposes in
2461 /// the parent DAG node. If the DAG already uses a
2462 /// physical register instead of a virtual one, the new
2463 /// SDValue should be glued to the InFlag SDValue.
2464 /// \return a new 64-bit SDValue.
2465 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2466 SDValue &Root, SelectionDAG &DAG,
2467 const SDLoc &Dl, const X86Subtarget &Subtarget,
2468 SDValue *InFlag = nullptr) {
2469 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2470 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2471 assert(VA.getValVT() == MVT::v64i1 &&
2472 "Expecting first location of 64 bit width type");
2473 assert(NextVA.getValVT() == VA.getValVT() &&
2474 "The locations should have the same type");
2475 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2476 "The values should reside in two registers");
2480 SDValue ArgValueLo, ArgValueHi;
2482 MachineFunction &MF = DAG.getMachineFunction();
2483 const TargetRegisterClass *RC = &X86::GR32RegClass;
2485 // Read a 32 bit value from the registers
2486 if (nullptr == InFlag) {
2487 // When no physical register is present,
2488 // create an intermediate virtual register
2489 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2490 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2491 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2492 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2494 // When a physical register is available read the value from it and glue
2495 // the reads together.
2497 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2498 *InFlag = ArgValueLo.getValue(2);
2500 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2501 *InFlag = ArgValueHi.getValue(2);
2504 // Convert the i32 type into v32i1 type
2505 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2507 // Convert the i32 type into v32i1 type
2508 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2510 // Concatenate the two values together
2511 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2514 /// The function will lower a register of various sizes (8/16/32/64)
2515 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2516 /// \returns a DAG node containing the operand after lowering to the mask type.
2517 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2518 const EVT &ValLoc, const SDLoc &Dl,
2519 SelectionDAG &DAG) {
2520 SDValue ValReturned = ValArg;
2522 if (ValVT == MVT::v1i1)
2523 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2525 if (ValVT == MVT::v64i1) {
2526 // On 32-bit targets this case is handled by getv64i1Argument.
2527 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2528 // On 64-bit targets there is no need to truncate the value, only to bitcast it.
2531 switch (ValVT.getSimpleVT().SimpleTy) {
2542 llvm_unreachable("Expecting a vector of i1 types");
2545 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2547 return DAG.getBitcast(ValVT, ValReturned);
2550 /// Lower the result values of a call into the
2551 /// appropriate copies out of the corresponding physical registers.
2553 SDValue X86TargetLowering::LowerCallResult(
2554 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2555 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2556 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2557 uint32_t *RegMask) const {
2559 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2560 // Assign locations to each value returned by this call.
2561 SmallVector<CCValAssign, 16> RVLocs;
2562 bool Is64Bit = Subtarget.is64Bit();
2563 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2565 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2567 // Copy all of the result registers out of their specified physreg.
2568 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2570 CCValAssign &VA = RVLocs[I];
2571 EVT CopyVT = VA.getLocVT();
2573 // In some calling conventions we need to remove the used registers
2574 // from the register mask.
2576 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2577 SubRegs.isValid(); ++SubRegs)
2578 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2581 // If this is x86-64, and we disabled SSE, we can't return FP values
2582 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2583 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2584 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2585 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2588 // If we prefer to use the value in xmm registers, copy it out as f80 and
2589 // use a truncate to move it from fp stack reg to xmm reg.
2590 bool RoundAfterCopy = false;
2591 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2592 isScalarFPTypeInSSEReg(VA.getValVT())) {
2593 if (!Subtarget.hasX87())
2594 report_fatal_error("X87 register return with X87 disabled");
2596 RoundAfterCopy = (CopyVT != VA.getLocVT());
2600 if (VA.needsCustom()) {
2601 assert(VA.getValVT() == MVT::v64i1 &&
2602 "Currently the only custom case is when we split v64i1 to 2 regs");
2604 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2606 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2608 Val = Chain.getValue(0);
2609 InFlag = Chain.getValue(2);
2613 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2614 // This truncation won't change the value.
2615 DAG.getIntPtrConstant(1, dl));
2617 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2618 if (VA.getValVT().isVector() &&
2619 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2620 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2621 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2622 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2624 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2627 InVals.push_back(Val);
2633 //===----------------------------------------------------------------------===//
2634 // C & StdCall & Fast Calling Convention implementation
2635 //===----------------------------------------------------------------------===//
2636 // The StdCall calling convention is the standard for many Windows API
2637 // routines. It differs from the C calling convention only slightly: the
2638 // callee cleans up the stack rather than the caller, and symbols are
2639 // decorated in a particular way. It doesn't support any vector arguments.
2640 // For info on fast calling convention see Fast Calling Convention (tail call)
2641 // implementation LowerX86_32FastCCCallTo.
2643 /// CallIsStructReturn - Determines whether a call uses struct return
2644 /// semantics.
2645 enum StructReturnType {
2646 NotStructReturn,
2647 RegStructReturn,
2648 StackStructReturn
2649 };
2650 static StructReturnType
2651 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2653 return NotStructReturn;
2655 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2656 if (!Flags.isSRet())
2657 return NotStructReturn;
2658 if (Flags.isInReg() || IsMCU)
2659 return RegStructReturn;
2660 return StackStructReturn;
2663 /// Determines whether a function uses struct return semantics.
2664 static StructReturnType
2665 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2667 return NotStructReturn;
2669 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2670 if (!Flags.isSRet())
2671 return NotStructReturn;
2672 if (Flags.isInReg() || IsMCU)
2673 return RegStructReturn;
2674 return StackStructReturn;
2677 /// Make a copy of an aggregate at address specified by "Src" to address
2678 /// "Dst" with size and alignment information specified by the specific
2679 /// parameter attribute. The copy will be passed as a byval function parameter.
2680 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2681 SDValue Chain, ISD::ArgFlagsTy Flags,
2682 SelectionDAG &DAG, const SDLoc &dl) {
2683 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2685 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2686 /*isVolatile*/false, /*AlwaysInline=*/true,
2687 /*isTailCall*/false,
2688 MachinePointerInfo(), MachinePointerInfo());
2691 /// Return true if the calling convention is one that we can guarantee TCO for.
2692 static bool canGuaranteeTCO(CallingConv::ID CC) {
2693 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2694 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2695 CC == CallingConv::HHVM);
2698 /// Return true if we might ever do TCO for calls with this calling convention.
2699 static bool mayTailCallThisCC(CallingConv::ID CC) {
2701 // C calling conventions:
2702 case CallingConv::C:
2703 case CallingConv::Win64:
2704 case CallingConv::X86_64_SysV:
2705 // Callee pop conventions:
2706 case CallingConv::X86_ThisCall:
2707 case CallingConv::X86_StdCall:
2708 case CallingConv::X86_VectorCall:
2709 case CallingConv::X86_FastCall:
2712 return canGuaranteeTCO(CC);
2716 /// Return true if the function is being made into a tailcall target by
2717 /// changing its ABI.
2718 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2719 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2722 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2724 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2725 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2728 ImmutableCallSite CS(CI);
2729 CallingConv::ID CalleeCC = CS.getCallingConv();
2730 if (!mayTailCallThisCC(CalleeCC))
2737 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2738 const SmallVectorImpl<ISD::InputArg> &Ins,
2739 const SDLoc &dl, SelectionDAG &DAG,
2740 const CCValAssign &VA,
2741 MachineFrameInfo &MFI, unsigned i) const {
2742 // Create the nodes corresponding to a load from this parameter slot.
2743 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2744 bool AlwaysUseMutable = shouldGuaranteeTCO(
2745 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2746 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2748 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2750 // If the value is passed by pointer, the address is passed instead of the
2751 // value itself. No need to extend if the mask value and location share the
2752 // same size.
2753 bool ExtendedInMem =
2754 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2755 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2757 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2758 ValVT = VA.getLocVT();
2760 ValVT = VA.getValVT();
2762 // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2763 // normally taken by a return address.
2765 if (CallConv == CallingConv::X86_INTR) {
2766 // X86 interrupts may take one or two arguments.
2767 // Unlike a regular call, there is no return address on the stack.
2768 // The offset of the last argument needs to be set to -4/-8 bytes, and the
2769 // offset of the first argument (when there are two) to 0 bytes.
2770 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2771 if (Subtarget.is64Bit() && Ins.size() == 2) {
2772 // The stack pointer needs to be realigned for 64 bit handlers with error
2773 // code, so the argument offset changes by 8 bytes.
2778 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2779 // changed with more analysis.
2780 // In case of tail call optimization, mark all arguments mutable, since they
2781 // could be overwritten by the lowering of arguments in case of a tail call.
2782 if (Flags.isByVal()) {
2783 unsigned Bytes = Flags.getByValSize();
2784 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2785 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2786 // Adjust SP offset of interrupt parameter.
2787 if (CallConv == CallingConv::X86_INTR) {
2788 MFI.setObjectOffset(FI, Offset);
2790 return DAG.getFrameIndex(FI, PtrVT);
2793 // This is an argument in memory. We might be able to perform copy elision.
2794 if (Flags.isCopyElisionCandidate()) {
2795 EVT ArgVT = Ins[i].ArgVT;
2797 if (Ins[i].PartOffset == 0) {
2798 // If this is a one-part value or the first part of a multi-part value,
2799 // create a stack object for the entire argument value type and return a
2800 // load from our portion of it. This assumes that if the first part of an
2801 // argument is in memory, the rest will also be in memory.
2802 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2803 /*Immutable=*/false);
2804 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2806 ValVT, dl, Chain, PartAddr,
2807 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2809 // This is not the first piece of an argument in memory. See if there is
2810 // already a fixed stack object including this offset. If so, assume it
2811 // was created by the PartOffset == 0 branch above and create a load from
2812 // the appropriate offset into it.
2813 int64_t PartBegin = VA.getLocMemOffset();
2814 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2815 int FI = MFI.getObjectIndexBegin();
2816 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2817 int64_t ObjBegin = MFI.getObjectOffset(FI);
2818 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
  if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
    break;
}
if (MFI.isFixedObjectIndex(FI)) {
  SDValue Addr =
      DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                  DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
  return DAG.getLoad(
      ValVT, dl, Chain, Addr,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                        Ins[i].PartOffset));
}
}
}
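// Illustrative example of the copy-elision path above (hypothetical values):
// on a 32-bit target an i64 argument may be split into two i32 parts passed
// on the stack. The PartOffset == 0 branch creates a single 8-byte fixed
// object for the whole value, and the second part is then loaded from that
// same object at PartOffset 4.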
2834 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2835 VA.getLocMemOffset(), isImmutable);
2837 // Set SExt or ZExt flag.
2838 if (VA.getLocInfo() == CCValAssign::ZExt) {
2839 MFI.setObjectZExt(FI, true);
2840 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2841 MFI.setObjectSExt(FI, true);
2844 // Adjust SP offset of interrupt parameter.
2845 if (CallConv == CallingConv::X86_INTR) {
2846 MFI.setObjectOffset(FI, Offset);
2849 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2850 SDValue Val = DAG.getLoad(
2851 ValVT, dl, Chain, FIN,
2852 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2853 return ExtendedInMem
2854 ? (VA.getValVT().isVector()
2855 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
           : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
     : Val;
}
2860 // FIXME: Get this from tablegen.
2861 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2862 const X86Subtarget &Subtarget) {
2863 assert(Subtarget.is64Bit());
2865 if (Subtarget.isCallingConvWin64(CallConv)) {
2866 static const MCPhysReg GPR64ArgRegsWin64[] = {
2867 X86::RCX, X86::RDX, X86::R8, X86::R9
2869 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2872 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2873 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2875 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2878 // FIXME: Get this from tablegen.
2879 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2880 CallingConv::ID CallConv,
2881 const X86Subtarget &Subtarget) {
2882 assert(Subtarget.is64Bit());
2883 if (Subtarget.isCallingConvWin64(CallConv)) {
2884 // The XMM registers which might contain var arg parameters are shadowed
// in their paired GPR. So we only need to save the GPR to their home
// slots.
// TODO: __vectorcall will change this.
return None;
}
2891 const Function &F = MF.getFunction();
2892 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2893 bool isSoftFloat = Subtarget.useSoftFloat();
2894 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2895 "SSE register cannot be used when SSE is disabled!");
2896 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
// Kernel mode asks for SSE to be disabled, so there are no XMM argument
// registers.
return None;
2901 static const MCPhysReg XMMArgRegs64Bit[] = {
2902 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2903 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2905 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2909 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2910 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2911 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2912 return A.getValNo() < B.getValNo();
2917 SDValue X86TargetLowering::LowerFormalArguments(
2918 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2919 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2920 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2921 MachineFunction &MF = DAG.getMachineFunction();
2922 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2923 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2925 const Function &F = MF.getFunction();
2926 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
2927 F.getName() == "main")
2928 FuncInfo->setForceFramePointer(true);
2930 MachineFrameInfo &MFI = MF.getFrameInfo();
2931 bool Is64Bit = Subtarget.is64Bit();
2932 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
       "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2938 if (CallConv == CallingConv::X86_INTR) {
2939 bool isLegal = Ins.size() == 1 ||
2940 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2941 (!Is64Bit && Ins[1].VT == MVT::i32)));
if (!isLegal)
  report_fatal_error("X86 interrupts may take one or two arguments");
}
2946 // Assign locations to all of the incoming arguments.
2947 SmallVector<CCValAssign, 16> ArgLocs;
2948 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2950 // Allocate shadow area for Win64.
if (IsWin64)
  CCInfo.AllocateStack(32, 8);
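// Note: the 32 bytes reserved above are the four 8-byte "home" slots the
// Win64 ABI requires the caller to allocate for RCX, RDX, R8 and R9, directly
// above the return address.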
2954 CCInfo.AnalyzeArguments(Ins, CC_X86);
2956 // In vectorcall calling convention a second pass is required for the HVA
2958 if (CallingConv::X86_VectorCall == CallConv) {
2959 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
// The next loop assumes that the locations are in the same order as the
// input arguments.
2964 assert(isSortedByValueNo(ArgLocs) &&
2965 "Argument Location list must be sorted before lowering");
2968 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2970 assert(InsIndex < Ins.size() && "Invalid Ins index");
2971 CCValAssign &VA = ArgLocs[I];
2973 if (VA.isRegLoc()) {
2974 EVT RegVT = VA.getLocVT();
2975 if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
       "Currently the only custom case is when we split v64i1 to 2 regs");
2980 // v64i1 values, in regcall calling convention, that are
2981 // compiled to 32 bit arch, are split up into two registers.
  ArgValue =
      getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
} else {
2985 const TargetRegisterClass *RC;
2986 if (RegVT == MVT::i32)
2987 RC = &X86::GR32RegClass;
2988 else if (Is64Bit && RegVT == MVT::i64)
2989 RC = &X86::GR64RegClass;
2990 else if (RegVT == MVT::f32)
2991 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2992 else if (RegVT == MVT::f64)
2993 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2994 else if (RegVT == MVT::f80)
2995 RC = &X86::RFP80RegClass;
2996 else if (RegVT == MVT::f128)
2997 RC = &X86::FR128RegClass;
2998 else if (RegVT.is512BitVector())
2999 RC = &X86::VR512RegClass;
3000 else if (RegVT.is256BitVector())
3001 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3002 else if (RegVT.is128BitVector())
3003 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3004 else if (RegVT == MVT::x86mmx)
3005 RC = &X86::VR64RegClass;
3006 else if (RegVT == MVT::v1i1)
3007 RC = &X86::VK1RegClass;
3008 else if (RegVT == MVT::v8i1)
3009 RC = &X86::VK8RegClass;
3010 else if (RegVT == MVT::v16i1)
3011 RC = &X86::VK16RegClass;
3012 else if (RegVT == MVT::v32i1)
3013 RC = &X86::VK32RegClass;
3014 else if (RegVT == MVT::v64i1)
3015 RC = &X86::VK64RegClass;
else
  llvm_unreachable("Unknown argument type!");
3019 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3020 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3023 // If this is an 8 or 16-bit value, it is really passed promoted to 32
// bits. Insert an assert[sz]ext to capture this, then truncate to the
// right size.
3026 if (VA.getLocInfo() == CCValAssign::SExt)
3027 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3028 DAG.getValueType(VA.getValVT()));
3029 else if (VA.getLocInfo() == CCValAssign::ZExt)
3030 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3031 DAG.getValueType(VA.getValVT()));
3032 else if (VA.getLocInfo() == CCValAssign::BCvt)
3033 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3035 if (VA.isExtInLoc()) {
3036 // Handle MMX values passed in XMM regs.
3037 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3038 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3039 else if (VA.getValVT().isVector() &&
3040 VA.getValVT().getScalarType() == MVT::i1 &&
3041 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3042 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3043 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3044 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3046 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3049 assert(VA.isMemLoc());
  ArgValue =
      LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
}
3054 // If value is passed via pointer - do a load.
3055 if (VA.getLocInfo() == CCValAssign::Indirect)
3057 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3059 InVals.push_back(ArgValue);
3062 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3063 // Swift calling convention does not require we copy the sret argument
3064 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3065 if (CallConv == CallingConv::Swift)
3068 // All x86 ABIs require that for returning structs by value we copy the
3069 // sret argument into %rax/%eax (depending on ABI) for the return. Save
// the argument into a virtual register so that we can access it from the
// return points.
3072 if (Ins[I].Flags.isSRet()) {
3073 unsigned Reg = FuncInfo->getSRetReturnReg();
3075 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3076 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3077 FuncInfo->setSRetReturnReg(Reg);
3079 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3080 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3085 unsigned StackSize = CCInfo.getNextStackOffset();
3086 // Align stack specially for tail calls.
3087 if (shouldGuaranteeTCO(CallConv,
3088 MF.getTarget().Options.GuaranteedTailCallOpt))
3089 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3091 // If the function takes variable number of arguments, make a frame index for
3092 // the start of the first vararg value... for expansion of llvm.va_start. We
3093 // can skip this if there are no va_start calls.
3094 if (MFI.hasVAStart() &&
3095 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3096 CallConv != CallingConv::X86_ThisCall))) {
3097 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3100 // Figure out if XMM registers are in use.
3101 assert(!(Subtarget.useSoftFloat() &&
3102 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3103 "SSE register cannot be used when SSE is disabled!");
3105 // 64-bit calling conventions support varargs and register parameters, so we
3106 // have to do extra work to spill them in the prologue.
3107 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3108 // Find the first unallocated argument registers.
3109 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3110 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3111 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3112 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3113 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3114 "SSE register cannot be used when SSE is disabled!");
3116 // Gather all the live in physical registers.
3117 SmallVector<SDValue, 6> LiveGPRs;
3118 SmallVector<SDValue, 8> LiveXMMRegs;
3120 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3121 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3123 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3125 if (!ArgXMMs.empty()) {
3126 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3127 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3128 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3129 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3130 LiveXMMRegs.push_back(
3131 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3136 // Get to the caller-allocated home save location. Add 8 to account
3137 // for the return address.
3138 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3139 FuncInfo->setRegSaveFrameIndex(
3140 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3141 // Fixup to set vararg frame on shadow area (4 x i64).
3143 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3145 // For X86-64, if there are vararg parameters that are passed via
3146 // registers, then we must store them to their spots on the stack so
3147 // they may be loaded by dereferencing the result of va_next.
3148 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3149 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3150 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
    ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
}
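// For the non-Win64 path above this reserves the full SysV register save
// area: 6 GPRs * 8 bytes + 8 XMM registers * 16 bytes = 176 bytes (assuming
// the XMM argument registers are available), regardless of how many of those
// registers actually carry named arguments.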
3154 // Store the integer parameter registers.
3155 SmallVector<SDValue, 8> MemOps;
3156 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3157 getPointerTy(DAG.getDataLayout()));
3158 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3159 for (SDValue Val : LiveGPRs) {
3160 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3161 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3163 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3164 MachinePointerInfo::getFixedStack(
3165 DAG.getMachineFunction(),
3166 FuncInfo->getRegSaveFrameIndex(), Offset));
3167 MemOps.push_back(Store);
3171 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3172 // Now store the XMM (fp + vector) parameter registers.
3173 SmallVector<SDValue, 12> SaveXMMOps;
3174 SaveXMMOps.push_back(Chain);
3175 SaveXMMOps.push_back(ALVal);
3176 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3177 FuncInfo->getRegSaveFrameIndex(), dl));
3178 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3179 FuncInfo->getVarArgsFPOffset(), dl));
3180 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3182 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3183 MVT::Other, SaveXMMOps));
3186 if (!MemOps.empty())
3187 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3190 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3191 // Find the largest legal vector type.
3192 MVT VecVT = MVT::Other;
3193 // FIXME: Only some x86_32 calling conventions support AVX512.
3194 if (Subtarget.hasAVX512() &&
3195 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3196 CallConv == CallingConv::Intel_OCL_BI)))
3197 VecVT = MVT::v16f32;
else if (Subtarget.hasAVX())
  VecVT = MVT::v8f32;
else if (Subtarget.hasSSE2())
  VecVT = MVT::v4f32;
3203 // We forward some GPRs and some vector types.
3204 SmallVector<MVT, 2> RegParmTypes;
3205 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3206 RegParmTypes.push_back(IntVT);
3207 if (VecVT != MVT::Other)
3208 RegParmTypes.push_back(VecVT);
3210 // Compute the set of forwarded registers. The rest are scratch.
3211 SmallVectorImpl<ForwardedRegister> &Forwards =
3212 FuncInfo->getForwardedMustTailRegParms();
3213 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3215 // Conservatively forward AL on x86_64, since it might be used for varargs.
3216 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3217 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3218 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3221 // Copy all forwards from physical to virtual registers.
3222 for (ForwardedRegister &F : Forwards) {
3223 // FIXME: Can we use a less constrained schedule?
3224 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3225 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3226 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3230 // Some CCs need callee pop.
3231 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3232 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3233 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3234 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
// X86 interrupts must pop the error code (and the alignment padding) if
// present.
3237 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
} else {
  FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3240 // If this is an sret function, the return should pop the hidden pointer.
3241 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3242 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3243 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3244 FuncInfo->setBytesToPopOnReturn(4);
3248 // RegSaveFrameIndex is X86-64 only.
3249 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3250 if (CallConv == CallingConv::X86_FastCall ||
3251 CallConv == CallingConv::X86_ThisCall)
3252 // fastcc functions can't have varargs.
3253 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3256 FuncInfo->setArgumentStackSize(StackSize);
3258 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3259 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3260 if (Personality == EHPersonality::CoreCLR) {
3262 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3263 // that we'd prefer this slot be allocated towards the bottom of the frame
3264 // (i.e. near the stack pointer after allocating the frame). Every
3265 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3266 // offset from the bottom of this and each funclet's frame must be the
3267 // same, so the size of funclets' (mostly empty) frames is dictated by
3268 // how far this slot is from the bottom (since they allocate just enough
3269 // space to accommodate holding this slot at the correct offset).
3270 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3271 EHInfo->PSPSymFrameIdx = PSPSymFI;
3275 if (CallConv == CallingConv::X86_RegCall ||
3276 F.hasFnAttribute("no_caller_saved_registers")) {
3277 MachineRegisterInfo &MRI = MF.getRegInfo();
3278 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
    MRI.disableCalleeSavedRegister(Pair.first);
}

return Chain;
}
3285 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3286 SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
3289 ISD::ArgFlagsTy Flags) const {
3290 unsigned LocMemOffset = VA.getLocMemOffset();
3291 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                     StackPtr, PtrOff);
3294 if (Flags.isByVal())
3295 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3297 return DAG.getStore(
3298 Chain, dl, Arg, PtrOff,
3299 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3302 /// Emit a load of return address if tail call
3303 /// optimization is performed and it is required.
3304 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3305 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3306 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3307 // Adjust the Return address stack slot.
3308 EVT VT = getPointerTy(DAG.getDataLayout());
3309 OutRetAddr = getReturnAddressFrameIndex(DAG);
3311 // Load the "old" Return address.
3312 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3313 return SDValue(OutRetAddr.getNode(), 1);
3316 /// Emit a store of the return address if tail call
3317 /// optimization is performed and it is required (FPDiff!=0).
3318 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3319 SDValue Chain, SDValue RetAddrFrIdx,
3320 EVT PtrVT, unsigned SlotSize,
3321 int FPDiff, const SDLoc &dl) {
3322 // Store the return address to the appropriate stack slot.
3323 if (!FPDiff) return Chain;
3324 // Calculate the new stack slot for the return address.
3325 int NewReturnAddrFI =
3326 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3328 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3329 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3330 MachinePointerInfo::getFixedStack(
                         DAG.getMachineFunction(), NewReturnAddrFI));
return Chain;
}
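// A rough worked example of the FPDiff mechanism handled above (values are
// illustrative, not from the original source): if the caller pops 24 bytes of
// its own incoming arguments on return but the tail callee needs 40 bytes of
// outgoing arguments, FPDiff is 24 - 40 = -16, and the return address is
// re-stored into a fixed object at offset FPDiff - SlotSize so that it sits
// just below the enlarged argument area.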
3335 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3336 /// operation of specified width.
3337 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3339 unsigned NumElems = VT.getVectorNumElements();
3340 SmallVector<int, 8> Mask;
3341 Mask.push_back(NumElems);
for (unsigned i = 1; i != NumElems; ++i)
  Mask.push_back(i);
return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}
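// For example, for v4f32 getMOVL builds the mask <4, 1, 2, 3>: element 0 is
// taken from V2 and the remaining elements from V1, which is the
// MOVSS/MOVSD-style "insert low element" pattern.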
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3349 SmallVectorImpl<SDValue> &InVals) const {
3350 SelectionDAG &DAG = CLI.DAG;
3352 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3353 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3354 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3355 SDValue Chain = CLI.Chain;
3356 SDValue Callee = CLI.Callee;
3357 CallingConv::ID CallConv = CLI.CallConv;
3358 bool &isTailCall = CLI.IsTailCall;
3359 bool isVarArg = CLI.IsVarArg;
3361 MachineFunction &MF = DAG.getMachineFunction();
3362 bool Is64Bit = Subtarget.is64Bit();
3363 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3364 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3365 bool IsSibcall = false;
3366 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3367 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3368 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3369 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3370 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3371 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3373 if (CallConv == CallingConv::X86_INTR)
3374 report_fatal_error("X86 interrupts may not be called directly");
if (Attr.getValueAsString() == "true")
  isTailCall = false;
3379 if (Subtarget.isPICStyleGOT() &&
3380 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3381 // If we are using a GOT, disable tail calls to external symbols with
3382 // default visibility. Tail calling such a symbol requires using a GOT
3383 // relocation, which forces early binding of the symbol. This breaks code
3384 // that require lazy function symbol resolution. Using musttail or
3385 // GuaranteedTailCallOpt will override this.
3386 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3387 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3388 G->getGlobal()->hasDefaultVisibility()))
3392 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
  // Force this to be a tail call. The verifier rules are enough to ensure
  // that we can lower this successfully without moving the return address
  // around.
  isTailCall = true;
3398 } else if (isTailCall) {
3399 // Check if it's really possible to do a tail call.
3400 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3401 isVarArg, SR != NotStructReturn,
3402 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3403 Outs, OutVals, Ins, DAG);
  // Sibcalls are automatically detected tailcalls which do not require
  // ABI changes.
  if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
    IsSibcall = true;

  if (isTailCall)
    ++NumTailCalls;
}
3414 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3415 "Var args not supported with calling convention fastcc, ghc or hipe");
3417 // Analyze operands of the call, assigning locations to each operand.
3418 SmallVector<CCValAssign, 16> ArgLocs;
3419 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3421 // Allocate shadow area for Win64.
3423 CCInfo.AllocateStack(32, 8);
3425 CCInfo.AnalyzeArguments(Outs, CC_X86);
3427 // In vectorcall calling convention a second pass is required for the HVA
3429 if (CallingConv::X86_VectorCall == CallConv) {
3430 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3433 // Get a count of how many bytes are to be pushed on the stack.
3434 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
if (IsSibcall)
  // This is a sibcall. The memory operands are available in the caller's
  // own caller's stack.
  NumBytes = 0;
3439 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3440 canGuaranteeTCO(CallConv))
3441 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
int FPDiff = 0;
if (isTailCall && !IsSibcall && !IsMustTail) {
3445 // Lower arguments at fp - stackoffset + fpdiff.
3446 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3448 FPDiff = NumBytesCallerPushed - NumBytes;
3450 // Set the delta of movement of the returnaddr stackslot.
3451 // But only set if delta is greater than previous delta.
3452 if (FPDiff < X86Info->getTCReturnAddrDelta())
3453 X86Info->setTCReturnAddrDelta(FPDiff);
3456 unsigned NumBytesToPush = NumBytes;
3457 unsigned NumBytesToPop = NumBytes;
3459 // If we have an inalloca argument, all stack space has already been allocated
3460 // for us and be right at the top of the stack. We don't support multiple
3461 // arguments passed in memory when using inalloca.
3462 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3464 if (!ArgLocs.back().isMemLoc())
3465 report_fatal_error("cannot use inalloca attribute on a register "
3467 if (ArgLocs.back().getLocMemOffset() != 0)
3468 report_fatal_error("any parameter with the inalloca attribute must be "
3469 "the only memory argument");
3473 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3474 NumBytes - NumBytesToPush, dl);
3476 SDValue RetAddrFrIdx;
3477 // Load return address for tail calls.
3478 if (isTailCall && FPDiff)
3479 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3480 Is64Bit, FPDiff, dl);
3482 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3483 SmallVector<SDValue, 8> MemOpChains;
// The next loop assumes that the locations are in the same order as the
// output arguments.
3488 assert(isSortedByValueNo(ArgLocs) &&
3489 "Argument Location list must be sorted before lowering");
3491 // Walk the register/memloc assignments, inserting copies/loads. In the case
// of tail call optimization, arguments are handled later.
3493 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3494 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3496 assert(OutIndex < Outs.size() && "Invalid Out index");
3497 // Skip inalloca arguments, they have already been written.
3498 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3499 if (Flags.isInAlloca())
3502 CCValAssign &VA = ArgLocs[I];
3503 EVT RegVT = VA.getLocVT();
3504 SDValue Arg = OutVals[OutIndex];
3505 bool isByVal = Flags.isByVal();
3507 // Promote the value if needed.
3508 switch (VA.getLocInfo()) {
3509 default: llvm_unreachable("Unknown loc info!");
3510 case CCValAssign::Full: break;
3511 case CCValAssign::SExt:
3512 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3514 case CCValAssign::ZExt:
3515 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3517 case CCValAssign::AExt:
3518 if (Arg.getValueType().isVector() &&
3519 Arg.getValueType().getVectorElementType() == MVT::i1)
3520 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3521 else if (RegVT.is128BitVector()) {
3522 // Special case: passing MMX values in XMM registers.
3523 Arg = DAG.getBitcast(MVT::i64, Arg);
3524 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3525 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3527 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3529 case CCValAssign::BCvt:
3530 Arg = DAG.getBitcast(RegVT, Arg);
3532 case CCValAssign::Indirect: {
3533 // Store the argument.
3534 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3535 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3536 Chain = DAG.getStore(
3537 Chain, dl, Arg, SpillSlot,
3538 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3544 if (VA.needsCustom()) {
3545 assert(VA.getValVT() == MVT::v64i1 &&
3546 "Currently the only custom case is when we split v64i1 to 2 regs");
3547 // Split v64i1 value into two registers
3548 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3550 } else if (VA.isRegLoc()) {
3551 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3552 if (isVarArg && IsWin64) {
3553 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3554 // shadow reg if callee is a varargs function.
3555 unsigned ShadowReg = 0;
3556 switch (VA.getLocReg()) {
3557 case X86::XMM0: ShadowReg = X86::RCX; break;
3558 case X86::XMM1: ShadowReg = X86::RDX; break;
3559 case X86::XMM2: ShadowReg = X86::R8; break;
3560 case X86::XMM3: ShadowReg = X86::R9; break;
}
if (ShadowReg)
  RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
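// For example, a double passed in XMM1 to a varargs callee is also made
// available in RDX (its paired GPR) by the code above, so the callee's
// va_arg handling can find it in the integer register area.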
3565 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3566 assert(VA.isMemLoc());
3567 if (!StackPtr.getNode())
3568 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3569 getPointerTy(DAG.getDataLayout()));
3570 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3571 dl, DAG, VA, Flags));
3575 if (!MemOpChains.empty())
3576 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3578 if (Subtarget.isPICStyleGOT()) {
3579 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3582 RegsToPass.push_back(std::make_pair(
3583 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3584 getPointerTy(DAG.getDataLayout()))));
3586 // If we are tail calling and generating PIC/GOT style code load the
3587 // address of the callee into ECX. The value in ecx is used as target of
3588 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3589 // for tail calls on PIC/GOT architectures. Normally we would just put the
3590 // address of GOT into ebx and then call target@PLT. But for tail calls
3591 // ebx would be restored (since ebx is callee saved) before jumping to the
3594 // Note: The actual moving to ECX is done further down.
3595 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3596 if (G && !G->getGlobal()->hasLocalLinkage() &&
3597 G->getGlobal()->hasDefaultVisibility())
3598 Callee = LowerGlobalAddress(Callee, DAG);
3599 else if (isa<ExternalSymbolSDNode>(Callee))
3600 Callee = LowerExternalSymbol(Callee, DAG);
3604 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3605 // From AMD64 ABI document:
3606 // For calls that may call functions that use varargs or stdargs
3607 // (prototype-less calls or calls to functions containing ellipsis (...) in
3608 // the declaration) %al is used as hidden argument to specify the number
3609 // of SSE registers used. The contents of %al do not need to match exactly
// the number of registers, but must be an upper bound on the number of SSE
3611 // registers used and is in the range 0 - 8 inclusive.
3613 // Count the number of XMM registers allocated.
3614 static const MCPhysReg XMMArgRegs[] = {
3615 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3616 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3618 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3619 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3620 && "SSE registers cannot be used when SSE is disabled");
3622 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                    DAG.getConstant(NumXMMRegs, dl,
                                                    MVT::i8)));
}
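// For example, a call such as printf("%f\n", X) that places one argument in
// XMM0 ends up with AL = 1 here, an upper bound on the number of vector
// registers the callee's va_start handling may need to spill.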
3627 if (isVarArg && IsMustTail) {
3628 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3629 for (const auto &F : Forwards) {
3630 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3631 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3635 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3636 // don't need this because the eligibility check rejects calls that require
3637 // shuffling arguments passed in memory.
3638 if (!IsSibcall && isTailCall) {
3639 // Force all the incoming stack arguments to be loaded from the stack
3640 // before any new outgoing arguments are stored to the stack, because the
3641 // outgoing stack slots may alias the incoming argument stack slots, and
3642 // the alias isn't otherwise explicit. This is slightly more conservative
3643 // than necessary, because it means that each store effectively depends
3644 // on every argument instead of just those arguments it would clobber.
3645 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3647 SmallVector<SDValue, 8> MemOpChains2;
3650 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3652 CCValAssign &VA = ArgLocs[I];
3654 if (VA.isRegLoc()) {
3655 if (VA.needsCustom()) {
3656 assert((CallConv == CallingConv::X86_RegCall) &&
3657 "Expecting custom case only in regcall calling convention");
3658 // This means that we are in special case where one argument was
3659 // passed through two register locations - Skip the next location
3666 assert(VA.isMemLoc());
3667 SDValue Arg = OutVals[OutsIndex];
3668 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3669 // Skip inalloca arguments. They don't require any work.
3670 if (Flags.isInAlloca())
3672 // Create frame index.
3673 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3674 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3675 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3676 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3678 if (Flags.isByVal()) {
3679 // Copy relative to framepointer.
3680 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3681 if (!StackPtr.getNode())
3682 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3683 getPointerTy(DAG.getDataLayout()));
3684 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3687 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3691 // Store relative to framepointer.
3692 MemOpChains2.push_back(DAG.getStore(
3693 ArgChain, dl, Arg, FIN,
3694 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3698 if (!MemOpChains2.empty())
3699 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3701 // Store the return address to the appropriate stack slot.
3702 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3703 getPointerTy(DAG.getDataLayout()),
3704 RegInfo->getSlotSize(), FPDiff, dl);
3707 // Build a sequence of copy-to-reg nodes chained together with token chain
3708 // and flag operands which copy the outgoing args into registers.
3710 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3711 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3712 RegsToPass[i].second, InFlag);
3713 InFlag = Chain.getValue(1);
3716 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3717 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3718 // In the 64-bit large code model, we have to make all calls
3719 // through a register, since the call instruction's 32-bit
// pc-relative offset may not be large enough to hold the whole
// address.
3722 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3723 // If the callee is a GlobalAddress node (quite common, every direct call
// is), turn it into a TargetGlobalAddress node so that legalize doesn't hack
// it.
3726 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3728 // We should use extra load for direct calls to dllimported functions in
3730 const GlobalValue *GV = G->getGlobal();
3731 if (!GV->hasDLLImportStorageClass()) {
3732 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3734 Callee = DAG.getTargetGlobalAddress(
3735 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3737 if (OpFlags == X86II::MO_GOTPCREL) {
3739 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3740 getPointerTy(DAG.getDataLayout()), Callee);
3741 // Add extra indirection
3742 Callee = DAG.getLoad(
3743 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3744 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3747 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3748 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3749 unsigned char OpFlags =
3750 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3752 Callee = DAG.getTargetExternalSymbol(
3753 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3754 } else if (Subtarget.isTarget64BitILP32() &&
3755 Callee->getValueType(0) == MVT::i32) {
3756 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3757 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3760 // Returns a chain & a flag for retval copy to use.
3761 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3762 SmallVector<SDValue, 8> Ops;
3764 if (!IsSibcall && isTailCall) {
3765 Chain = DAG.getCALLSEQ_END(Chain,
3766 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3767 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3768 InFlag = Chain.getValue(1);
3771 Ops.push_back(Chain);
3772 Ops.push_back(Callee);
3775 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3777 // Add argument registers to the end of the list so that they are known live
3779 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3780 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3781 RegsToPass[i].second.getValueType()));
3783 // Add a register mask operand representing the call-preserved registers.
3784 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3785 // set X86_INTR calling convention because it has the same CSR mask
3786 // (same preserved registers).
3787 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3788 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3789 assert(Mask && "Missing call preserved mask for calling convention");
3791 // If this is an invoke in a 32-bit function using a funclet-based
3792 // personality, assume the function clobbers all registers. If an exception
3793 // is thrown, the runtime will not restore CSRs.
3794 // FIXME: Model this more precisely so that we can register allocate across
3795 // the normal edge and spill and fill across the exceptional edge.
3796 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3797 const Function &CallerFn = MF.getFunction();
3798 EHPersonality Pers =
3799 CallerFn.hasPersonalityFn()
3800 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3801 : EHPersonality::Unknown;
3802 if (isFuncletEHPersonality(Pers))
3803 Mask = RegInfo->getNoPreservedMask();
3806 // Define a new register mask from the existing mask.
3807 uint32_t *RegMask = nullptr;
3809 // In some calling conventions we need to remove the used physical registers
3810 // from the reg mask.
3811 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3812 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3814 // Allocate a new Reg Mask and copy Mask.
3815 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3816 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3817 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3819 // Make sure all sub registers of the argument registers are reset
3821 for (auto const &RegPair : RegsToPass)
3822 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3823 SubRegs.isValid(); ++SubRegs)
3824 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
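// For example, if EDI carries an argument, the loop above clears EDI as well
// as its sub-registers DI and DIL from the preserved mask, so the register
// allocator treats them as clobbered by the call.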
3826 // Create the RegMask Operand according to our updated mask.
3827 Ops.push_back(DAG.getRegisterMask(RegMask));
3829 // Create the RegMask Operand according to the static mask.
3830 Ops.push_back(DAG.getRegisterMask(Mask));
3833 if (InFlag.getNode())
3834 Ops.push_back(InFlag);
if (isTailCall) {
  // We used to do:
  //// If this is the first return lowered for this function, add the regs
3839 //// to the liveout set for the function.
3840 // This isn't right, although it's probably harmless on x86; liveouts
3841 // should be computed from returns not tail calls. Consider a void
3842 // function making a tail call to a function returning int.
3843 MF.getFrameInfo().setHasTailCall();
  return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
}
3847 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3848 InFlag = Chain.getValue(1);
3850 // Create the CALLSEQ_END node.
3851 unsigned NumBytesForCalleeToPop;
3852 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3853 DAG.getTarget().Options.GuaranteedTailCallOpt))
3854 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3855 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3856 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3857 SR == StackStructReturn)
3858 // If this is a call to a struct-return function, the callee
3859 // pops the hidden struct pointer, so we have to push it back.
3860 // This is common for Darwin/X86, Linux & Mingw32 targets.
3861 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3862 NumBytesForCalleeToPop = 4;
3864 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3866 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3867 // No need to reset the stack after the call if the call doesn't return. To
3868 // make the MI verify, we'll pretend the callee does it for us.
3869 NumBytesForCalleeToPop = NumBytes;
3872 // Returns a flag for retval copy to use.
if (!IsSibcall) {
  Chain = DAG.getCALLSEQ_END(Chain,
                             DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                             DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                   true),
                             InFlag, dl);
  InFlag = Chain.getValue(1);
}
// Handle result values, copying them out of physregs into vregs that we
// return.
return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                       InVals, RegMask);
}
3888 //===----------------------------------------------------------------------===//
3889 // Fast Calling Convention (tail call) implementation
3890 //===----------------------------------------------------------------------===//
// Like stdcall, the callee cleans up the arguments, except that ECX is
// reserved for storing the address of the tail-called function. Only 2
// registers are free for argument passing (inreg). Tail call optimization
// is performed provided:
3896 // * tailcallopt is enabled
3897 // * caller/callee are fastcc
3898 // On X86_64 architecture with GOT-style position independent code only local
3899 // (within module) calls are supported at the moment.
3900 // To keep the stack aligned according to platform abi the function
3901 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
3902 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3903 // If a tail called function callee has more arguments than the caller the
3904 // caller needs to make sure that there is room to move the RETADDR to. This is
3905 // achieved by reserving an area the size of the argument delta right after the
3906 // original RETADDR, but before the saved framepointer or the spilled registers
3907 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
/// requirement.
3922 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3923 SelectionDAG& DAG) const {
3924 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3925 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3926 unsigned StackAlignment = TFI.getStackAlignment();
3927 uint64_t AlignMask = StackAlignment - 1;
3928 int64_t Offset = StackSize;
3929 unsigned SlotSize = RegInfo->getSlotSize();
3930 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3931 // Number smaller than 12 so just add the difference.
  Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
} else {
3934 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3935 Offset = ((~AlignMask) & Offset) + StackAlignment +
           (StackAlignment - SlotSize);
}
return Offset;
}
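// A worked example with a 16-byte stack alignment and 4-byte slots: a
// StackSize of 22 satisfies (22 & 15) <= 12 and becomes 22 + (12 - 6) = 28,
// i.e. 16n + 12, while a StackSize of 30 takes the other branch and becomes
// (30 & ~15) + 16 + 12 = 44.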
3941 /// Return true if the given stack call argument is already available in the
3942 /// same position (relatively) of the caller's incoming argument stack.
3944 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3945 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3946 const X86InstrInfo *TII, const CCValAssign &VA) {
3947 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3950 // Look through nodes that don't alter the bits of the incoming value.
3951 unsigned Op = Arg.getOpcode();
3952 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3953 Arg = Arg.getOperand(0);
3956 if (Op == ISD::TRUNCATE) {
3957 const SDValue &TruncInput = Arg.getOperand(0);
3958 if (TruncInput.getOpcode() == ISD::AssertZext &&
3959 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3960 Arg.getValueType()) {
3961 Arg = TruncInput.getOperand(0);
3969 if (Arg.getOpcode() == ISD::CopyFromReg) {
3970 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3971 if (!TargetRegisterInfo::isVirtualRegister(VR))
3973 MachineInstr *Def = MRI->getVRegDef(VR);
3976 if (!Flags.isByVal()) {
3977 if (!TII->isLoadFromStackSlot(*Def, FI))
3980 unsigned Opcode = Def->getOpcode();
3981 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3982 Opcode == X86::LEA64_32r) &&
3983 Def->getOperand(1).isFI()) {
3984 FI = Def->getOperand(1).getIndex();
3985 Bytes = Flags.getByValSize();
3989 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3990 if (Flags.isByVal())
3991 // ByVal argument is passed in as a pointer but it's now being
3992 // dereferenced. e.g.
3993 // define @foo(%struct.X* %A) {
3994 // tail call @bar(%struct.X* byval %A)
3997 SDValue Ptr = Ld->getBasePtr();
3998 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4001 FI = FINode->getIndex();
4002 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4003 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4004 FI = FINode->getIndex();
4005 Bytes = Flags.getByValSize();
4009 assert(FI != INT_MAX);
4010 if (!MFI.isFixedObjectIndex(FI))
4013 if (Offset != MFI.getObjectOffset(FI))
4016 // If this is not byval, check that the argument stack object is immutable.
4017 // inalloca and argument copy elision can create mutable argument stack
// objects. Byval objects can be mutated, but a byval call intends to pass the
// mutated memory.
if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
  return false;
4023 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4024 // If the argument location is wider than the argument type, check that any
4025 // extension flags match.
4026 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4027 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4032 return Bytes == MFI.getObjectSize(FI);
4035 /// Check whether the call is eligible for tail call optimization. Targets
4036 /// that want to do tail call optimization should implement this function.
4037 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4038 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4039 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4040 const SmallVectorImpl<ISD::OutputArg> &Outs,
4041 const SmallVectorImpl<SDValue> &OutVals,
4042 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4043 if (!mayTailCallThisCC(CalleeCC))
4046 // If -tailcallopt is specified, make fastcc functions tail-callable.
4047 MachineFunction &MF = DAG.getMachineFunction();
4048 const Function &CallerF = MF.getFunction();
4050 // If the function return type is x86_fp80 and the callee return type is not,
4051 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4052 // perform a tailcall optimization here.
4053 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4056 CallingConv::ID CallerCC = CallerF.getCallingConv();
4057 bool CCMatch = CallerCC == CalleeCC;
4058 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4059 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4061 // Win64 functions have extra shadow space for argument homing. Don't do the
4062 // sibcall if the caller and callee have mismatched expectations for this
if (IsCalleeWin64 != IsCallerWin64)
  return false;

if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
  if (canGuaranteeTCO(CalleeCC) && CCMatch)
    return true;
  return false;
}
4073 // Look for obvious safe cases to perform tail call optimization that do not
4074 // require ABI changes. This is what gcc calls sibcall.
4076 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4077 // emit a special epilogue.
4078 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4079 if (RegInfo->needsStackRealignment(MF))
4082 // Also avoid sibcall optimization if either caller or callee uses struct
4083 // return semantics.
4084 if (isCalleeStructRet || isCallerStructRet)
// Do not sibcall optimize vararg calls unless all arguments are passed via
// registers.
4089 LLVMContext &C = *DAG.getContext();
4090 if (isVarArg && !Outs.empty()) {
4091 // Optimizing for varargs on Win64 is unlikely to be safe without
4092 // additional testing.
4093 if (IsCalleeWin64 || IsCallerWin64)
4096 SmallVector<CCValAssign, 16> ArgLocs;
4097 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4099 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4100 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4101 if (!ArgLocs[i].isRegLoc())
4105 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4106 // stack. Therefore, if it's not used by the call it is not safe to optimize
4107 // this into a sibcall.
4108 bool Unused = false;
4109 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4116 SmallVector<CCValAssign, 16> RVLocs;
4117 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4118 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4119 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4120 CCValAssign &VA = RVLocs[i];
4121 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4126 // Check that the call results are passed in the same way.
4127 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4128 RetCC_X86, RetCC_X86))
4130 // The callee has to preserve all registers the caller needs to preserve.
4131 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4132 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4134 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4135 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4139 unsigned StackArgsSize = 0;
// If the callee takes no arguments then go on to check the results of the
// call.
4143 if (!Outs.empty()) {
4144 // Check if stack adjustment is needed. For now, do not do this if any
4145 // argument is passed on the stack.
4146 SmallVector<CCValAssign, 16> ArgLocs;
4147 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4149 // Allocate shadow area for Win64
4151 CCInfo.AllocateStack(32, 8);
4153 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4154 StackArgsSize = CCInfo.getNextStackOffset();
4156 if (CCInfo.getNextStackOffset()) {
4157 // Check if the arguments are already laid out in the right way as
4158 // the caller's fixed stack objects.
4159 MachineFrameInfo &MFI = MF.getFrameInfo();
4160 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4161 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4162 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4163 CCValAssign &VA = ArgLocs[i];
4164 SDValue Arg = OutVals[i];
4165 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4166 if (VA.getLocInfo() == CCValAssign::Indirect)
4168 if (!VA.isRegLoc()) {
4169 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4176 bool PositionIndependent = isPositionIndependent();
4177 // If the tailcall address may be in a register, then make sure it's
4178 // possible to register allocate for it. In 32-bit, the call address can
4179 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4180 // callee-saved registers are restored. These happen to be the same
4181 // registers used to pass 'inreg' arguments so watch out for those.
4182 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4183 !isa<ExternalSymbolSDNode>(Callee)) ||
4184 PositionIndependent)) {
4185 unsigned NumInRegs = 0;
4186 // In PIC we need an extra register to formulate the address computation
4188 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4190 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4191 CCValAssign &VA = ArgLocs[i];
4194 unsigned Reg = VA.getLocReg();
4197 case X86::EAX: case X86::EDX: case X86::ECX:
4198 if (++NumInRegs == MaxInRegs)
4205 const MachineRegisterInfo &MRI = MF.getRegInfo();
4206 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4210 bool CalleeWillPop =
4211 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4212 MF.getTarget().Options.GuaranteedTailCallOpt);
4214 if (unsigned BytesToPop =
4215 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4216 // If we have bytes to pop, the callee must pop them.
4217 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
  if (!CalleePopMatches)
    return false;
} else if (CalleeWillPop && StackArgsSize > 0) {
  // If we don't have bytes to pop, make sure the callee doesn't pop any.
  return false;
}

return true;
}
FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4230 const TargetLibraryInfo *libInfo) const {
4231 return X86::createFastISel(funcInfo, libInfo);
4234 //===----------------------------------------------------------------------===//
4235 // Other Lowering Hooks
4236 //===----------------------------------------------------------------------===//
4238 static bool MayFoldLoad(SDValue Op) {
4239 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4242 static bool MayFoldIntoStore(SDValue Op) {
4243 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4246 static bool MayFoldIntoZeroExtend(SDValue Op) {
4247 if (Op.hasOneUse()) {
4248 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4249 return (ISD::ZERO_EXTEND == Opcode);
4254 static bool isTargetShuffle(unsigned Opcode) {
switch (Opcode) {
default: return false;
4257 case X86ISD::BLENDI:
4258 case X86ISD::PSHUFB:
4259 case X86ISD::PSHUFD:
4260 case X86ISD::PSHUFHW:
4261 case X86ISD::PSHUFLW:
4263 case X86ISD::INSERTPS:
4264 case X86ISD::EXTRQI:
4265 case X86ISD::INSERTQI:
4266 case X86ISD::PALIGNR:
4267 case X86ISD::VSHLDQ:
4268 case X86ISD::VSRLDQ:
4269 case X86ISD::MOVLHPS:
4270 case X86ISD::MOVHLPS:
4271 case X86ISD::MOVLPS:
4272 case X86ISD::MOVLPD:
4273 case X86ISD::MOVSHDUP:
4274 case X86ISD::MOVSLDUP:
4275 case X86ISD::MOVDDUP:
4278 case X86ISD::UNPCKL:
4279 case X86ISD::UNPCKH:
4280 case X86ISD::VBROADCAST:
4281 case X86ISD::VPERMILPI:
4282 case X86ISD::VPERMILPV:
4283 case X86ISD::VPERM2X128:
4284 case X86ISD::VPERMIL2:
4285 case X86ISD::VPERMI:
4286 case X86ISD::VPPERM:
4287 case X86ISD::VPERMV:
4288 case X86ISD::VPERMV3:
4289 case X86ISD::VPERMIV3:
4290 case X86ISD::VZEXT_MOVL:
4295 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4297 default: return false;
4299 case X86ISD::PSHUFB:
4300 case X86ISD::VPERMILPV:
4301 case X86ISD::VPERMIL2:
4302 case X86ISD::VPPERM:
4303 case X86ISD::VPERMV:
4304 case X86ISD::VPERMV3:
4305 case X86ISD::VPERMIV3:
4307 // 'Faux' Target Shuffles.
4314 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4315 MachineFunction &MF = DAG.getMachineFunction();
4316 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4317 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4318 int ReturnAddrIndex = FuncInfo->getRAIndex();
4320 if (ReturnAddrIndex == 0) {
4321 // Set up a frame object for the return address.
4322 unsigned SlotSize = RegInfo->getSlotSize();
  ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                        -(int64_t)SlotSize,
                                                        false);
  FuncInfo->setRAIndex(ReturnAddrIndex);
}
4329 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4332 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4333 bool hasSymbolicDisplacement) {
4334 // Offset should fit into 32 bit immediate field.
4335 if (!isInt<32>(Offset))
4338 // If we don't have a symbolic displacement - we don't have any extra
4340 if (!hasSymbolicDisplacement)
4343 // FIXME: Some tweaks might be needed for medium code model.
4344 if (M != CodeModel::Small && M != CodeModel::Kernel)
// For the small code model we assume that the latest object is 16MB before
// the end of the 31-bit boundary. We may also accept pretty large negative
// constants knowing that all objects are in the positive half of the address
// space.
if (M == CodeModel::Small && Offset < 16*1024*1024)
  return true;
// For the kernel code model we know that all objects reside in the negative
// half of the 32-bit address space. We may not accept negative offsets, since
// they may be just off and we may accept pretty large positive ones.
if (M == CodeModel::Kernel && Offset >= 0)
  return true;

return false;
}
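// For example, under the small code model a symbolic displacement of 0xFFFFFF
// (just under 16MB) is accepted while 0x1000000 is rejected; under the kernel
// code model any non-negative displacement that fits in 32 bits is accepted.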
4362 /// Determines whether the callee is required to pop its own arguments.
4363 /// Callee pop is necessary to support tail calls.
4364 bool X86::isCalleePop(CallingConv::ID CallingConv,
4365 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4366 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4367 // can guarantee TCO.
4368 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4371 switch (CallingConv) {
4374 case CallingConv::X86_StdCall:
4375 case CallingConv::X86_FastCall:
4376 case CallingConv::X86_ThisCall:
4377 case CallingConv::X86_VectorCall:
4382 /// \brief Return true if the condition is an unsigned comparison operation.
4383 static bool isX86CCUnsigned(unsigned X86CC) {
4386 llvm_unreachable("Invalid integer condition!");
4402 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4403 switch (SetCCOpcode) {
4404 default: llvm_unreachable("Invalid integer condition!");
4405 case ISD::SETEQ: return X86::COND_E;
4406 case ISD::SETGT: return X86::COND_G;
4407 case ISD::SETGE: return X86::COND_GE;
4408 case ISD::SETLT: return X86::COND_L;
4409 case ISD::SETLE: return X86::COND_LE;
4410 case ISD::SETNE: return X86::COND_NE;
4411 case ISD::SETULT: return X86::COND_B;
4412 case ISD::SETUGT: return X86::COND_A;
4413 case ISD::SETULE: return X86::COND_BE;
4414 case ISD::SETUGE: return X86::COND_AE;
4418 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4419 /// condition code, returning the condition code and the LHS/RHS of the
4420 /// comparison to make.
4421 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4422 bool isFP, SDValue &LHS, SDValue &RHS,
4423 SelectionDAG &DAG) {
4425 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4426 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4427 // X > -1 -> X == 0, jump !sign.
4428 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4429 return X86::COND_NS;
4431 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4432 // X < 0 -> X == 0, jump on sign.
4435 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
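// X < 1 is rewritten as X <= 0 below: the RHS becomes 0 and a signed
// less-or-equal condition is returned.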
4437 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4438 return X86::COND_LE;
4442 return TranslateIntegerX86CC(SetCCOpcode);
4445 // First determine if it is required or is profitable to flip the operands.
4447 // If LHS is a foldable load, but RHS is not, flip the condition.
4448 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4449 !ISD::isNON_EXTLoad(RHS.getNode())) {
4450 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4451 std::swap(LHS, RHS);
4454 switch (SetCCOpcode) {
4460 std::swap(LHS, RHS);
4464 // On a floating point condition, the flags are set as follows:
4465 //  ZF | PF | CF | op
4466 //   0 |  0 |  0 | X > Y
4467 //   0 |  0 |  1 | X < Y
4468 //   1 |  0 |  0 | X == Y
4469 //   1 |  1 |  1 | unordered
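// Since an unordered compare sets ZF, PF and CF all to 1, only the CF/ZF-based
// "unsigned" condition codes (A/AE/B/BE/E/NE, plus P/NP for parity) compose
// safely with this encoding; the SETOLT/SETOLE/SETUGT/SETUGE cases below
// therefore rely on the operand swap performed above.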
4470 switch (SetCCOpcode) {
4471 default: llvm_unreachable("Condcode should be pre-legalized away");
4473 case ISD::SETEQ: return X86::COND_E;
4474 case ISD::SETOLT: // flipped
4476 case ISD::SETGT: return X86::COND_A;
4477 case ISD::SETOLE: // flipped
4479 case ISD::SETGE: return X86::COND_AE;
4480 case ISD::SETUGT: // flipped
4482 case ISD::SETLT: return X86::COND_B;
4483 case ISD::SETUGE: // flipped
4485 case ISD::SETLE: return X86::COND_BE;
4487 case ISD::SETNE: return X86::COND_NE;
4488 case ISD::SETUO: return X86::COND_P;
4489 case ISD::SETO: return X86::COND_NP;
4491 case ISD::SETUNE: return X86::COND_INVALID;
4495 /// Is there a floating point cmov for the specific X86 condition code?
4496 /// The current x86 ISA includes the following FP cmov instructions:
4497 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4498 static bool hasFPCMov(unsigned X86CC) {
4515 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4517 MachineFunction &MF,
4518 unsigned Intrinsic) const {
4520 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4524 Info.opc = ISD::INTRINSIC_W_CHAIN;
4525 Info.flags = MachineMemOperand::MONone;
4528 switch (IntrData->Type) {
4529 case EXPAND_FROM_MEM: {
4530 Info.ptrVal = I.getArgOperand(0);
4531 Info.memVT = MVT::getVT(I.getType());
4533 Info.flags |= MachineMemOperand::MOLoad;
4536 case COMPRESS_TO_MEM: {
4537 Info.ptrVal = I.getArgOperand(0);
4538 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4540 Info.flags |= MachineMemOperand::MOStore;
4543 case TRUNCATE_TO_MEM_VI8:
4544 case TRUNCATE_TO_MEM_VI16:
4545 case TRUNCATE_TO_MEM_VI32: {
4546 Info.ptrVal = I.getArgOperand(0);
4547 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4548 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4549 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4551 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4552 ScalarVT = MVT::i16;
4553 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4554 ScalarVT = MVT::i32;
4556 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4558 Info.flags |= MachineMemOperand::MOStore;
4568 /// Returns true if the target can instruction select the
4569 /// specified FP immediate natively. If false, the legalizer will
4570 /// materialize the FP immediate as a load from a constant pool.
4571 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4572 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4573 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4579 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4580 ISD::LoadExtType ExtTy,
4582 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4583 // relocations target a movq or addq instruction: don't let the load shrink.
4584 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4585 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4586 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4587 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4591 /// \brief Returns true if it is beneficial to convert a load of a constant
4592 /// to just the constant itself.
4593 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4595 assert(Ty->isIntegerTy());
4597 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4598 if (BitSize == 0 || BitSize > 64)
4603 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4604 // TODO: It might be a win to ease or lift this restriction, but the generic
4605 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4606 if (VT.isVector() && Subtarget.hasAVX512())
4612 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4613 unsigned Index) const {
4614 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4617 // Mask vectors support all subregister combinations and operations that
4618 // extract half of a vector.
4619 if (ResVT.getVectorElementType() == MVT::i1)
4620 return Index == 0 || ((SrcVT.getSizeInBits() == ResVT.getSizeInBits()*2) &&
4621 (Index == ResVT.getVectorNumElements()));
4623 return (Index % ResVT.getVectorNumElements()) == 0;
4626 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4627 // Speculate cttz only if we can directly use TZCNT.
4628 return Subtarget.hasBMI();
4631 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4632 // Speculate ctlz only if we can directly use LZCNT.
4633 return Subtarget.hasLZCNT();
4636 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4637 EVT BitcastVT) const {
4638 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4641 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4644 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4645 const SelectionDAG &DAG) const {
4646 // Do not merge to float value size (128 bits) if no implicit
4647 // float attribute is set.
4648 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4649 Attribute::NoImplicitFloat);
4652 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4653 return (MemVT.getSizeInBits() <= MaxIntSize);
4658 bool X86TargetLowering::isCtlzFast() const {
4659 return Subtarget.hasFastLZCNT();
4662 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4663 const Instruction &AndI) const {
4667 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4668 if (!Subtarget.hasBMI())
4671 // There are only 32-bit and 64-bit forms for 'andn'.
4672 EVT VT = Y.getValueType();
4673 if (VT != MVT::i32 && VT != MVT::i64)
4679 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4680 MVT VT = MVT::getIntegerVT(NumBits);
4681 if (isTypeLegal(VT))
4684 // PMOVMSKB can handle this.
4685 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4688 // VPMOVMSKB can handle this.
4689 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4692 // TODO: Allow 64-bit type for 32-bit target.
4693 // TODO: 512-bit types should be allowed, but make sure that those
4694 // cases are handled in combineVectorSizedSetCCEquality().
4696 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4699 /// Val is the undef sentinel value or equal to the specified value.
4700 static bool isUndefOrEqual(int Val, int CmpVal) {
4701 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4704 /// Val is either the undef or zero sentinel value.
4705 static bool isUndefOrZero(int Val) {
4706 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4709 /// Return true if every element in Mask, beginning
4710 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4711 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4712 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4713 if (Mask[i] != SM_SentinelUndef)
4718 /// Return true if Val is undef or if its value falls within the
4719 /// specified range [Low, Hi).
4720 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4721 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4724 /// Return true if every element in Mask is undef or if its value
4725 /// falls within the specified range [Low, Hi).
4726 static bool isUndefOrInRange(ArrayRef<int> Mask,
4729 if (!isUndefOrInRange(M, Low, Hi))
4734 /// Return true if Val is undef, zero or if its value falls within the
4735 /// specified range [Low, Hi).
4736 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4737 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4740 /// Return true if every element in Mask is undef, zero or if its value
4741 /// falls within the specified range [Low, Hi).
4742 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4744 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4749 /// Return true if every element in Mask, beginning
4750 /// from position Pos and ending in Pos+Size, falls within the specified
4751 /// sequential range [Low, Low+Size), or is undef.
4752 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4753 unsigned Pos, unsigned Size, int Low) {
4754 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4755 if (!isUndefOrEqual(Mask[i], Low))
4760 /// Return true if every element in Mask, beginning
4761 /// from position Pos and ending in Pos+Size, falls within the specified
4762 /// sequential range [Low, Low+Size), or is undef or is zero.
4763 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4764 unsigned Size, int Low) {
4765 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4766 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4771 /// Return true if every element in Mask, beginning
4772 /// from position Pos and ending in Pos+Size is undef or is zero.
4773 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4775 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4776 if (!isUndefOrZero(Mask[i]))
4781 /// \brief Helper function to test whether a shuffle mask could be
4782 /// simplified by widening the elements being shuffled.
4784 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4785 /// leaves it in an unspecified state.
4787 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4788 /// shuffle masks. The latter have the special property of a '-2' representing
4789 /// a zeroed lane of a vector.
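///
/// For example (a sketch of the cases handled below): <0,1,6,7> widens to
/// <0,3>, <-1,3,2,-1> widens to <1,1>, and a pair of SM_SentinelZero values
/// widens to a single SM_SentinelZero element.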
4790 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4791 SmallVectorImpl<int> &WidenedMask) {
4792 WidenedMask.assign(Mask.size() / 2, 0);
4793 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4795 int M1 = Mask[i + 1];
4797 // If both elements are undef, its trivial.
4798 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4799 WidenedMask[i / 2] = SM_SentinelUndef;
4803 // Check for an undef mask and a mask value properly aligned to fit with
4804 // a pair of values. If we find such a case, use the non-undef mask's value.
4805 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4806 WidenedMask[i / 2] = M1 / 2;
4809 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4810 WidenedMask[i / 2] = M0 / 2;
4814 // When zeroing, we need to spread the zeroing across both lanes to widen.
4815 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4816 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4817 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4818 WidenedMask[i / 2] = SM_SentinelZero;
4824 // Finally check if the two mask values are adjacent and aligned with
4826 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4827 WidenedMask[i / 2] = M0 / 2;
4831 // Otherwise we can't safely widen the elements used in this shuffle.
4834 assert(WidenedMask.size() == Mask.size() / 2 &&
4835 "Incorrect size of mask after widening the elements!");
4840 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4841 bool X86::isZeroNode(SDValue Elt) {
4842 return isNullConstant(Elt) || isNullFPConstant(Elt);
4845 // Build a vector of constants.
4846 // Use an UNDEF node if MaskElt == -1.
4847 // Split 64-bit constants into two 32-bit halves in 32-bit mode.
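// For example (illustrative only): Values = {0, -1, 2, 3} with VT = v4i32 and
// IsMask = true builds the vector <0, undef, 2, 3>.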
4848 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4849 const SDLoc &dl, bool IsMask = false) {
4851 SmallVector<SDValue, 32> Ops;
4854 MVT ConstVecVT = VT;
4855 unsigned NumElts = VT.getVectorNumElements();
4856 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4857 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4858 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4862 MVT EltVT = ConstVecVT.getVectorElementType();
4863 for (unsigned i = 0; i < NumElts; ++i) {
4864 bool IsUndef = Values[i] < 0 && IsMask;
4865 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4866 DAG.getConstant(Values[i], dl, EltVT);
4867 Ops.push_back(OpNode);
4869 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4870 DAG.getConstant(0, dl, EltVT));
4872 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4874 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4878 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4879 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4880 assert(Bits.size() == Undefs.getBitWidth() &&
4881 "Unequal constant and undef arrays");
4882 SmallVector<SDValue, 32> Ops;
4885 MVT ConstVecVT = VT;
4886 unsigned NumElts = VT.getVectorNumElements();
4887 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4888 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4889 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4893 MVT EltVT = ConstVecVT.getVectorElementType();
4894 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4896 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4899 const APInt &V = Bits[i];
4900 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4902 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4903 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4904 } else if (EltVT == MVT::f32) {
4905 APFloat FV(APFloat::IEEEsingle(), V);
4906 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4907 } else if (EltVT == MVT::f64) {
4908 APFloat FV(APFloat::IEEEdouble(), V);
4909 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4911 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4915 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4916 return DAG.getBitcast(VT, ConstsNode);
4919 /// Returns a vector of the specified type with all zero elements.
4920 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4921 SelectionDAG &DAG, const SDLoc &dl) {
4922 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4923 VT.getVectorElementType() == MVT::i1) &&
4924 "Unexpected vector type");
4926 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4927 // type. This ensures they get CSE'd. But if the integer type is not
4928 // available, use a floating-point +0.0 instead.
4930 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4931 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4932 } else if (VT.getVectorElementType() == MVT::i1) {
4933 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4934 "Unexpected vector type");
4935 Vec = DAG.getConstant(0, dl, VT);
4937 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4938 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4940 return DAG.getBitcast(VT, Vec);
4943 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4944 const SDLoc &dl, unsigned vectorWidth) {
4945 EVT VT = Vec.getValueType();
4946 EVT ElVT = VT.getVectorElementType();
4947 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4948 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4949 VT.getVectorNumElements()/Factor);
4951 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4952 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4953 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4955 // This is the index of the first element of the vectorWidth-bit chunk
4956 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
4957 IdxVal &= ~(ElemsPerChunk - 1);
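// For example, with a v8i32 source and vectorWidth == 128, ElemsPerChunk is 4,
// so an IdxVal of 6 is rounded down to 4 and the upper 128-bit half is taken.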
4959 // If the input is a buildvector just emit a smaller one.
4960 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4961 return DAG.getBuildVector(ResultVT, dl,
4962 Vec->ops().slice(IdxVal, ElemsPerChunk));
4964 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4965 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4968 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4969 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4970 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4971 /// instructions or a simple subregister reference. Idx is an index in the
4972 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4973 /// lowering EXTRACT_VECTOR_ELT operations easier.
4974 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4975 SelectionDAG &DAG, const SDLoc &dl) {
4976 assert((Vec.getValueType().is256BitVector() ||
4977 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4978 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4981 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4982 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4983 SelectionDAG &DAG, const SDLoc &dl) {
4984 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4985 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4988 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4989 SelectionDAG &DAG, const SDLoc &dl,
4990 unsigned vectorWidth) {
4991 assert((vectorWidth == 128 || vectorWidth == 256) &&
4992 "Unsupported vector width");
4993 // Inserting an UNDEF subvector simply yields Result.
4996 EVT VT = Vec.getValueType();
4997 EVT ElVT = VT.getVectorElementType();
4998 EVT ResultVT = Result.getValueType();
5000 // Insert the relevant vectorWidth bits.
5001 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5002 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5004 // This is the index of the first element of the vectorWidth-bit chunk
5005 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5006 IdxVal &= ~(ElemsPerChunk - 1);
5008 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5009 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5012 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5013 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5014 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5015 /// simple superregister reference. Idx is an index in the 128 bits
5016 /// we want. It need not be aligned to a 128-bit boundary. That makes
5017 /// lowering INSERT_VECTOR_ELT operations easier.
5018 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5019 SelectionDAG &DAG, const SDLoc &dl) {
5020 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5021 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5024 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5025 SelectionDAG &DAG, const SDLoc &dl) {
5026 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5027 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5030 // Return true if the instruction zeroes the unused upper part of the
5031 // destination and accepts a mask.
5032 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5038 case X86ISD::CMPM_RND:
5043 /// Insert i1-subvector to i1-vector.
5044 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5045 const X86Subtarget &Subtarget) {
5048 SDValue Vec = Op.getOperand(0);
5049 SDValue SubVec = Op.getOperand(1);
5050 SDValue Idx = Op.getOperand(2);
5052 if (!isa<ConstantSDNode>(Idx))
5055 // Inserting undef is a nop. We can just return the original vector.
5056 if (SubVec.isUndef())
5059 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5060 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5063 MVT OpVT = Op.getSimpleValueType();
5064 unsigned NumElems = OpVT.getVectorNumElements();
5066 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5068 // Extend to natively supported kshift.
5069 MVT WideOpVT = OpVT;
5070 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5071 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5073 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5075 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5076 // May need to promote to a legal type.
5077 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5078 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5080 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5083 MVT SubVecVT = SubVec.getSimpleValueType();
5084 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5086 assert(IdxVal + SubVecNumElems <= NumElems &&
5087 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5088 "Unexpected index value in INSERT_SUBVECTOR");
5090 SDValue Undef = DAG.getUNDEF(WideOpVT);
5093 // Zero lower bits of the Vec
5094 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5095 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5097 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5098 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5099 // Merge them together, SubVec should be zero extended.
5100 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5101 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5103 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5104 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5107 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5108 Undef, SubVec, ZeroIdx);
5110 if (Vec.isUndef()) {
5111 assert(IdxVal != 0 && "Unexpected index");
5112 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5113 DAG.getConstant(IdxVal, dl, MVT::i8));
5114 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5117 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5118 assert(IdxVal != 0 && "Unexpected index");
5119 NumElems = WideOpVT.getVectorNumElements();
5120 unsigned ShiftLeft = NumElems - SubVecNumElems;
5121 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5122 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5123 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5124 if (ShiftRight != 0)
5125 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5126 DAG.getConstant(ShiftRight, dl, MVT::i8));
5127 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5130 // Simple case: the subvector goes into the upper part.
5131 if (IdxVal + SubVecNumElems == NumElems) {
5132 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5133 DAG.getConstant(IdxVal, dl, MVT::i8));
5134 if (SubVecNumElems * 2 == NumElems) {
5135 // Special case, use legal zero extending insert_subvector. This allows
5136 // isel to optimize when bits are known zero.
5137 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5138 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5139 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5142 // Otherwise use explicit shifts to zero the bits.
5143 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5144 Undef, Vec, ZeroIdx);
5145 NumElems = WideOpVT.getVectorNumElements();
5146 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5147 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5148 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5150 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5151 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5154 // Inserting into the middle is more complicated.
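// Sketch of the splice performed below: shift the existing bits at IdxVal down
// to the lsbs, XOR them with SubVec to form the difference, shift that
// difference back into position (zeroing everything else), and XOR it into the
// original vector so that only the inserted range changes.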
5156 NumElems = WideOpVT.getVectorNumElements();
5158 // Widen the vector if needed.
5159 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5160 // Move the current value of the bits being replaced down to the lsbs.
5161 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5162 DAG.getConstant(IdxVal, dl, MVT::i8));
5163 // Xor with the new bit.
5164 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5165 // Shift to MSB, filling bottom bits with 0.
5166 unsigned ShiftLeft = NumElems - SubVecNumElems;
5167 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5168 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5169 // Shift to the final position, filling upper bits with 0.
5170 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5171 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5172 DAG.getConstant(ShiftRight, dl, MVT::i8));
5173 // Xor with original vector leaving the new value.
5174 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5175 // Reduce to original width if needed.
5176 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5179 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5180 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5181 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5182 /// large BUILD_VECTORS.
5183 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5184 unsigned NumElems, SelectionDAG &DAG,
5186 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5187 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5190 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5191 unsigned NumElems, SelectionDAG &DAG,
5193 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5194 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5197 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5198 unsigned NumElems, SelectionDAG &DAG,
5199 const SDLoc &dl, unsigned VectorWidth) {
5200 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5201 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5204 /// Returns a vector of the specified type with all bits set.
5205 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5206 /// Then bitcast to their original type, ensuring they get CSE'd.
5207 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5208 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5209 "Expected a 128/256/512-bit vector type");
5211 APInt Ones = APInt::getAllOnesValue(32);
5212 unsigned NumElts = VT.getSizeInBits() / 32;
5213 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5214 return DAG.getBitcast(VT, Vec);
5217 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5218 SelectionDAG &DAG) {
5219 EVT InVT = In.getValueType();
5220 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5222 if (VT.is128BitVector() && InVT.is128BitVector())
5223 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5224 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5226 // For 256-bit vectors, we only need the lower (128-bit) input half.
5227 // For 512-bit vectors, we only need the lower input half or quarter.
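// For example, for VT = v4i64 and a 256-bit v8i32 input (Scale = 2), only the
// low 128 bits of the input are extracted below.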
5228 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5229 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5230 In = extractSubVector(In, 0, DAG, DL,
5231 std::max(128, (int)VT.getSizeInBits() / Scale));
5234 return DAG.getNode(Opc, DL, VT, In);
5237 /// Returns a vector_shuffle node for an unpackl operation.
5238 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5239 SDValue V1, SDValue V2) {
5240 SmallVector<int, 8> Mask;
5241 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5242 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5245 /// Returns a vector_shuffle node for an unpackh operation.
5246 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5247 SDValue V1, SDValue V2) {
5248 SmallVector<int, 8> Mask;
5249 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5250 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5253 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5254 /// This produces a shuffle where the low element of V2 is swizzled into the
5255 /// zero/undef vector, landing at element Idx.
5256 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5257 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5259 const X86Subtarget &Subtarget,
5260 SelectionDAG &DAG) {
5261 MVT VT = V2.getSimpleValueType();
5263 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5264 int NumElems = VT.getVectorNumElements();
5265 SmallVector<int, 16> MaskVec(NumElems);
5266 for (int i = 0; i != NumElems; ++i)
5267 // If this is the insertion idx, put the low elt of V2 here.
5268 MaskVec[i] = (i == Idx) ? NumElems : i;
5269 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5272 static SDValue peekThroughBitcasts(SDValue V) {
5273 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5274 V = V.getOperand(0);
5278 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5279 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5280 V.getOperand(0).hasOneUse())
5281 V = V.getOperand(0);
5285 static const Constant *getTargetConstantFromNode(SDValue Op) {
5286 Op = peekThroughBitcasts(Op);
5288 auto *Load = dyn_cast<LoadSDNode>(Op);
5292 SDValue Ptr = Load->getBasePtr();
5293 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5294 Ptr->getOpcode() == X86ISD::WrapperRIP)
5295 Ptr = Ptr->getOperand(0);
5297 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5298 if (!CNode || CNode->isMachineConstantPoolEntry())
5301 return dyn_cast<Constant>(CNode->getConstVal());
5304 // Extract raw constant bits from constant pools.
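// For example (assuming AllowWholeUndefs), a v2i64 build vector
// <0x0000000100000002, undef> queried with EltSizeInBits = 32 is returned as
// the four 32-bit elements {2, 1, undef, undef}.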
5305 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5307 SmallVectorImpl<APInt> &EltBits,
5308 bool AllowWholeUndefs = true,
5309 bool AllowPartialUndefs = true) {
5310 assert(EltBits.empty() && "Expected an empty EltBits vector");
5312 Op = peekThroughBitcasts(Op);
5314 EVT VT = Op.getValueType();
5315 unsigned SizeInBits = VT.getSizeInBits();
5316 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5317 unsigned NumElts = SizeInBits / EltSizeInBits;
5319 // Bitcast a source array of element bits to the target size.
5320 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5321 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5322 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5323 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5324 "Constant bit sizes don't match");
5326 // Don't split if we don't allow undef bits.
5327 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5328 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5331 // If we're already the right size, don't bother bitcasting.
5332 if (NumSrcElts == NumElts) {
5333 UndefElts = UndefSrcElts;
5334 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5338 // Extract all the undef/constant element data and pack into single bitsets.
5339 APInt UndefBits(SizeInBits, 0);
5340 APInt MaskBits(SizeInBits, 0);
5342 for (unsigned i = 0; i != NumSrcElts; ++i) {
5343 unsigned BitOffset = i * SrcEltSizeInBits;
5344 if (UndefSrcElts[i])
5345 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5346 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5349 // Split the undef/constant single bitset data into the target elements.
5350 UndefElts = APInt(NumElts, 0);
5351 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5353 for (unsigned i = 0; i != NumElts; ++i) {
5354 unsigned BitOffset = i * EltSizeInBits;
5355 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5357 // Only treat an element as UNDEF if all bits are UNDEF.
5358 if (UndefEltBits.isAllOnesValue()) {
5359 if (!AllowWholeUndefs)
5361 UndefElts.setBit(i);
5365 // If only some bits are UNDEF then treat them as zero (or bail if not
5367 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5370 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5371 EltBits[i] = Bits.getZExtValue();
5376 // Collect constant bits and insert into mask/undef bit masks.
5377 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5378 unsigned UndefBitIndex) {
5381 if (isa<UndefValue>(Cst)) {
5382 Undefs.setBit(UndefBitIndex);
5385 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5386 Mask = CInt->getValue();
5389 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5390 Mask = CFP->getValueAPF().bitcastToAPInt();
5398 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5399 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5400 return CastBitData(UndefSrcElts, SrcEltBits);
5403 // Extract scalar constant bits.
5404 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5405 APInt UndefSrcElts = APInt::getNullValue(1);
5406 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5407 return CastBitData(UndefSrcElts, SrcEltBits);
5409 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5410 APInt UndefSrcElts = APInt::getNullValue(1);
5411 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5412 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5413 return CastBitData(UndefSrcElts, SrcEltBits);
5416 // Extract constant bits from build vector.
5417 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5418 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5419 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5421 APInt UndefSrcElts(NumSrcElts, 0);
5422 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5423 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5424 const SDValue &Src = Op.getOperand(i);
5425 if (Src.isUndef()) {
5426 UndefSrcElts.setBit(i);
5429 auto *Cst = cast<ConstantSDNode>(Src);
5430 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5432 return CastBitData(UndefSrcElts, SrcEltBits);
5435 // Extract constant bits from constant pool vector.
5436 if (auto *Cst = getTargetConstantFromNode(Op)) {
5437 Type *CstTy = Cst->getType();
5438 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5441 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5442 unsigned NumSrcElts = CstTy->getVectorNumElements();
5444 APInt UndefSrcElts(NumSrcElts, 0);
5445 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5446 for (unsigned i = 0; i != NumSrcElts; ++i)
5447 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5451 return CastBitData(UndefSrcElts, SrcEltBits);
5454 // Extract constant bits from a broadcasted constant pool scalar.
5455 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5456 EltSizeInBits <= VT.getScalarSizeInBits()) {
5457 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5458 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5459 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5461 APInt UndefSrcElts(NumSrcElts, 0);
5462 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5463 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5464 if (UndefSrcElts[0])
5465 UndefSrcElts.setBits(0, NumSrcElts);
5466 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5467 return CastBitData(UndefSrcElts, SrcEltBits);
5472 // Extract a rematerialized scalar constant insertion.
5473 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5474 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5475 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5476 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5477 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5479 APInt UndefSrcElts(NumSrcElts, 0);
5480 SmallVector<APInt, 64> SrcEltBits;
5481 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5482 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5483 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5484 return CastBitData(UndefSrcElts, SrcEltBits);
5490 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5491 unsigned MaskEltSizeInBits,
5492 SmallVectorImpl<uint64_t> &RawMask) {
5494 SmallVector<APInt, 64> EltBits;
5496 // Extract the raw target constant bits.
5497 // FIXME: We currently don't support UNDEF bits or mask entries.
5498 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5499 EltBits, /* AllowWholeUndefs */ false,
5500 /* AllowPartialUndefs */ false))
5503 // Insert the extracted elements into the mask.
5504 for (APInt Elt : EltBits)
5505 RawMask.push_back(Elt.getZExtValue());
5510 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5511 /// Note: This ignores saturation, so inputs must be checked first.
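/// For example, a binary v16i8 pack of two v8i16 inputs produces the mask
/// <0,2,4,...,14,16,18,...,30>, i.e. the low byte of every 16-bit element of
/// each input, repeated per 128-bit lane.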
5512 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5514 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5515 unsigned NumElts = VT.getVectorNumElements();
5516 unsigned NumLanes = VT.getSizeInBits() / 128;
5517 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5518 unsigned Offset = Unary ? 0 : NumElts;
5520 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5521 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5522 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5523 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5524 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5528 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5529 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5530 /// operands in \p Ops, and returns true.
5531 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5532 /// IsUnary for shuffles which use a single input multiple times, and in those
5533 /// cases it will adjust the mask to only have indices within that single input.
5534 /// It is an error to call this with non-empty Mask/Ops vectors.
5535 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5536 SmallVectorImpl<SDValue> &Ops,
5537 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5538 unsigned NumElems = VT.getVectorNumElements();
5541 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5542 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5545 bool IsFakeUnary = false;
5546 switch(N->getOpcode()) {
5547 case X86ISD::BLENDI:
5548 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5549 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5550 ImmN = N->getOperand(N->getNumOperands()-1);
5551 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5552 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5555 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5556 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5557 ImmN = N->getOperand(N->getNumOperands()-1);
5558 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5559 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5561 case X86ISD::INSERTPS:
5562 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5563 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5564 ImmN = N->getOperand(N->getNumOperands()-1);
5565 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5566 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5568 case X86ISD::EXTRQI:
5569 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5570 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5571 isa<ConstantSDNode>(N->getOperand(2))) {
5572 int BitLen = N->getConstantOperandVal(1);
5573 int BitIdx = N->getConstantOperandVal(2);
5574 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5578 case X86ISD::INSERTQI:
5579 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5580 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5581 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5582 isa<ConstantSDNode>(N->getOperand(3))) {
5583 int BitLen = N->getConstantOperandVal(2);
5584 int BitIdx = N->getConstantOperandVal(3);
5585 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5586 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5589 case X86ISD::UNPCKH:
5590 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5591 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5592 DecodeUNPCKHMask(VT, Mask);
5593 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5595 case X86ISD::UNPCKL:
5596 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5597 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5598 DecodeUNPCKLMask(VT, Mask);
5599 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5601 case X86ISD::MOVHLPS:
5602 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5603 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5604 DecodeMOVHLPSMask(NumElems, Mask);
5605 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5607 case X86ISD::MOVLHPS:
5608 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5609 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5610 DecodeMOVLHPSMask(NumElems, Mask);
5611 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5613 case X86ISD::PALIGNR:
5614 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5615 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5616 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5617 ImmN = N->getOperand(N->getNumOperands()-1);
5618 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5619 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5620 Ops.push_back(N->getOperand(1));
5621 Ops.push_back(N->getOperand(0));
5623 case X86ISD::VSHLDQ:
5624 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5625 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5626 ImmN = N->getOperand(N->getNumOperands() - 1);
5627 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5630 case X86ISD::VSRLDQ:
5631 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5632 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5633 ImmN = N->getOperand(N->getNumOperands() - 1);
5634 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5637 case X86ISD::PSHUFD:
5638 case X86ISD::VPERMILPI:
5639 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5640 ImmN = N->getOperand(N->getNumOperands()-1);
5641 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5644 case X86ISD::PSHUFHW:
5645 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5646 ImmN = N->getOperand(N->getNumOperands()-1);
5647 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5650 case X86ISD::PSHUFLW:
5651 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5652 ImmN = N->getOperand(N->getNumOperands()-1);
5653 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5656 case X86ISD::VZEXT_MOVL:
5657 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5658 DecodeZeroMoveLowMask(VT, Mask);
5661 case X86ISD::VBROADCAST: {
5662 SDValue N0 = N->getOperand(0);
5663 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5664 // add the pre-extracted value to the Ops vector.
5665 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5666 N0.getOperand(0).getValueType() == VT &&
5667 N0.getConstantOperandVal(1) == 0)
5668 Ops.push_back(N0.getOperand(0));
5670 // We only decode broadcasts of same-sized vectors, unless the broadcast
5671 // came from an extract from the original width. If we found one, we
5672 // pushed it onto the Ops vector above.
5673 if (N0.getValueType() == VT || !Ops.empty()) {
5674 DecodeVectorBroadcast(VT, Mask);
5680 case X86ISD::VPERMILPV: {
5681 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5683 SDValue MaskNode = N->getOperand(1);
5684 unsigned MaskEltSize = VT.getScalarSizeInBits();
5685 SmallVector<uint64_t, 32> RawMask;
5686 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5687 DecodeVPERMILPMask(VT, RawMask, Mask);
5690 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5691 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5696 case X86ISD::PSHUFB: {
5697 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5698 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5699 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5701 SDValue MaskNode = N->getOperand(1);
5702 SmallVector<uint64_t, 32> RawMask;
5703 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5704 DecodePSHUFBMask(RawMask, Mask);
5707 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5708 DecodePSHUFBMask(C, Mask);
5713 case X86ISD::VPERMI:
5714 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5715 ImmN = N->getOperand(N->getNumOperands()-1);
5716 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5721 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5723 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5725 case X86ISD::VPERM2X128:
5726 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5727 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5728 ImmN = N->getOperand(N->getNumOperands()-1);
5729 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5730 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5732 case X86ISD::MOVSLDUP:
5733 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5734 DecodeMOVSLDUPMask(VT, Mask);
5737 case X86ISD::MOVSHDUP:
5738 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5739 DecodeMOVSHDUPMask(VT, Mask);
5742 case X86ISD::MOVDDUP:
5743 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5744 DecodeMOVDDUPMask(VT, Mask);
5747 case X86ISD::MOVLPD:
5748 case X86ISD::MOVLPS:
5749 // Not yet implemented
5751 case X86ISD::VPERMIL2: {
5752 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5753 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5754 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5755 unsigned MaskEltSize = VT.getScalarSizeInBits();
5756 SDValue MaskNode = N->getOperand(2);
5757 SDValue CtrlNode = N->getOperand(3);
5758 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5759 unsigned CtrlImm = CtrlOp->getZExtValue();
5760 SmallVector<uint64_t, 32> RawMask;
5761 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5762 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5765 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5766 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5772 case X86ISD::VPPERM: {
5773 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5774 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5775 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5776 SDValue MaskNode = N->getOperand(2);
5777 SmallVector<uint64_t, 32> RawMask;
5778 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5779 DecodeVPPERMMask(RawMask, Mask);
5782 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5783 DecodeVPPERMMask(C, Mask);
5788 case X86ISD::VPERMV: {
5789 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5791 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5792 Ops.push_back(N->getOperand(1));
5793 SDValue MaskNode = N->getOperand(0);
5794 SmallVector<uint64_t, 32> RawMask;
5795 unsigned MaskEltSize = VT.getScalarSizeInBits();
5796 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5797 DecodeVPERMVMask(RawMask, Mask);
5800 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5801 DecodeVPERMVMask(C, MaskEltSize, Mask);
5806 case X86ISD::VPERMV3: {
5807 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5808 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5809 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5810 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5811 Ops.push_back(N->getOperand(0));
5812 Ops.push_back(N->getOperand(2));
5813 SDValue MaskNode = N->getOperand(1);
5814 unsigned MaskEltSize = VT.getScalarSizeInBits();
5815 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5816 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5821 case X86ISD::VPERMIV3: {
5822 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5823 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5824 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5825 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5826 Ops.push_back(N->getOperand(1));
5827 Ops.push_back(N->getOperand(2));
5828 SDValue MaskNode = N->getOperand(0);
5829 unsigned MaskEltSize = VT.getScalarSizeInBits();
5830 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5831 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5836 default: llvm_unreachable("unknown target shuffle node");
5839 // Empty mask indicates the decode failed.
5843 // Check if we're getting a shuffle mask with zero'd elements.
5844 if (!AllowSentinelZero)
5845 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5848 // If we have a fake unary shuffle, the shuffle mask is spread across two
5849 // inputs that are actually the same node. Re-map the mask to always point
5850 // into the first input.
5853 if (M >= (int)Mask.size())
5856 // If we didn't already add operands in the opcode-specific code, default to
5857 // adding 1 or 2 operands starting at 0.
5859 Ops.push_back(N->getOperand(0));
5860 if (!IsUnary || IsFakeUnary)
5861 Ops.push_back(N->getOperand(1));
5867 /// Check a target shuffle mask's inputs to see if we can set any values to
5868 /// SM_SentinelZero - this is for elements that are known to be zero
5869 /// (not just zeroable) from their inputs.
5870 /// Returns true if the target shuffle mask was decoded.
5871 static bool setTargetShuffleZeroElements(SDValue N,
5872 SmallVectorImpl<int> &Mask,
5873 SmallVectorImpl<SDValue> &Ops) {
5875 if (!isTargetShuffle(N.getOpcode()))
5878 MVT VT = N.getSimpleValueType();
5879 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5882 SDValue V1 = Ops[0];
5883 SDValue V2 = IsUnary ? V1 : Ops[1];
5885 V1 = peekThroughBitcasts(V1);
5886 V2 = peekThroughBitcasts(V2);
5888 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5889 "Illegal split of shuffle value type");
5890 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5892 // Extract known constant input data.
5893 APInt UndefSrcElts[2];
5894 SmallVector<APInt, 32> SrcEltBits[2];
5895 bool IsSrcConstant[2] = {
5896 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5897 SrcEltBits[0], true, false),
5898 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5899 SrcEltBits[1], true, false)};
5901 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5904 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5908 // Determine shuffle input and normalize the mask.
5909 unsigned SrcIdx = M / Size;
5910 SDValue V = M < Size ? V1 : V2;
5913 // We are referencing an UNDEF input.
5915 Mask[i] = SM_SentinelUndef;
5919 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5920 // TODO: We currently only set UNDEF for integer types - floats use the same
5921 // registers as vectors and many of the scalar folded loads rely on the
5922 // SCALAR_TO_VECTOR pattern.
5923 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5924 (Size % V.getValueType().getVectorNumElements()) == 0) {
5925 int Scale = Size / V.getValueType().getVectorNumElements();
5926 int Idx = M / Scale;
5927 if (Idx != 0 && !VT.isFloatingPoint())
5928 Mask[i] = SM_SentinelUndef;
5929 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5930 Mask[i] = SM_SentinelZero;
5934 // Attempt to extract from the source's constant bits.
5935 if (IsSrcConstant[SrcIdx]) {
5936 if (UndefSrcElts[SrcIdx][M])
5937 Mask[i] = SM_SentinelUndef;
5938 else if (SrcEltBits[SrcIdx][M] == 0)
5939 Mask[i] = SM_SentinelZero;
5943 assert(VT.getVectorNumElements() == Mask.size() &&
5944 "Different mask size from vector size!");
5948 // Attempt to decode ops that could be represented as a shuffle mask.
5949 // The decoded shuffle mask may contain a different number of elements to the
5950 // destination value type.
5951 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5952 SmallVectorImpl<SDValue> &Ops,
5953 SelectionDAG &DAG) {
5957 MVT VT = N.getSimpleValueType();
5958 unsigned NumElts = VT.getVectorNumElements();
5959 unsigned NumSizeInBits = VT.getSizeInBits();
5960 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5961 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5962 "Expected byte aligned value types");
5964 unsigned Opcode = N.getOpcode();
5966 case ISD::VECTOR_SHUFFLE: {
5967 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
5968 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
5969 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
5970 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
5971 Ops.push_back(N.getOperand(0));
5972 Ops.push_back(N.getOperand(1));
5978 case X86ISD::ANDNP: {
5979 // Attempt to decode as a per-byte mask.
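// Each constant mask byte must be 0x00 or 0xFF: bytes equal to ZeroMask become
// SM_SentinelZero in the decoded shuffle, and the remaining bytes pass through
// unchanged (identity index i).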
5981 SmallVector<APInt, 32> EltBits;
5982 SDValue N0 = N.getOperand(0);
5983 SDValue N1 = N.getOperand(1);
5984 bool IsAndN = (X86ISD::ANDNP == Opcode);
5985 uint64_t ZeroMask = IsAndN ? 255 : 0;
5986 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5988 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5990 Mask.push_back(SM_SentinelUndef);
5993 uint64_t ByteBits = EltBits[i].getZExtValue();
5994 if (ByteBits != 0 && ByteBits != 255)
5996 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5998 Ops.push_back(IsAndN ? N1 : N0);
6001 case ISD::SCALAR_TO_VECTOR: {
6002 // Match against a scalar_to_vector of an extract from a vector,
6003 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6004 SDValue N0 = N.getOperand(0);
6007 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6008 N0.getOperand(0).getValueType() == VT) ||
6009 (N0.getOpcode() == X86ISD::PEXTRW &&
6010 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6011 (N0.getOpcode() == X86ISD::PEXTRB &&
6012 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6016 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6019 SDValue SrcVec = SrcExtract.getOperand(0);
6020 EVT SrcVT = SrcVec.getValueType();
6021 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6022 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6024 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6025 if (NumSrcElts <= SrcIdx)
6028 Ops.push_back(SrcVec);
6029 Mask.push_back(SrcIdx);
6030 Mask.append(NumZeros, SM_SentinelZero);
6031 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6034 case X86ISD::PINSRB:
6035 case X86ISD::PINSRW: {
6036 SDValue InVec = N.getOperand(0);
6037 SDValue InScl = N.getOperand(1);
6038 SDValue InIndex = N.getOperand(2);
6039 if (!isa<ConstantSDNode>(InIndex) ||
6040 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6042 uint64_t InIdx = N.getConstantOperandVal(2);
6044 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
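// For example, PINSRW(V, 0, 2) on v8i16 becomes the mask <0,1,Z,3,4,5,6,7>,
// where Z is SM_SentinelZero.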
6045 if (X86::isZeroNode(InScl)) {
6046 Ops.push_back(InVec);
6047 for (unsigned i = 0; i != NumElts; ++i)
6048 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6052 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6053 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6055 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6056 if (InScl.getOpcode() != ExOp)
6059 SDValue ExVec = InScl.getOperand(0);
6060 SDValue ExIndex = InScl.getOperand(1);
6061 if (!isa<ConstantSDNode>(ExIndex) ||
6062 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6064 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6066 Ops.push_back(InVec);
6067 Ops.push_back(ExVec);
6068 for (unsigned i = 0; i != NumElts; ++i)
6069 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6072 case X86ISD::PACKSS:
6073 case X86ISD::PACKUS: {
6074 SDValue N0 = N.getOperand(0);
6075 SDValue N1 = N.getOperand(1);
6076 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6077 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6078 "Unexpected input value type");
6080 // If we know input saturation won't happen we can treat this
6081 // as a truncation shuffle.
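// Illustrative example (not from the original source): a v16i8 PACKUS whose
// two v8i16 operands are the same value, with the upper byte of every element
// known to be zero, decodes to the unary byte mask
// <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14>; in the binary case the second half
// is offset by 16 to address the second operand.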
6082 if (Opcode == X86ISD::PACKSS) {
6083 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6084 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6087 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6088 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6089 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6093 bool IsUnary = (N0 == N1);
6099 createPackShuffleMask(VT, Mask, IsUnary);
6103 case X86ISD::VSRLI: {
6104 uint64_t ShiftVal = N.getConstantOperandVal(1);
6105 // Out of range bit shifts are guaranteed to be zero.
6106 if (NumBitsPerElt <= ShiftVal) {
6107 Mask.append(NumElts, SM_SentinelZero);
6111 // We can only decode 'whole byte' bit shifts as shuffles.
6112 if ((ShiftVal % 8) != 0)
6115 uint64_t ByteShift = ShiftVal / 8;
6116 unsigned NumBytes = NumSizeInBits / 8;
6117 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6118 Ops.push_back(N.getOperand(0));
6120 // Clear mask to all zeros and insert the shifted byte indices.
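// Illustrative example (not from the original source): VSRLI v2i64, 16 shifts
// each 64-bit element right by two bytes, so each 8-byte element gets the
// byte mask <2,3,4,5,6,7,Z,Z> (little-endian byte order), while the matching
// VSHLI by 16 would produce <Z,Z,0,1,2,3,4,5>.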
6121 Mask.append(NumBytes, SM_SentinelZero);
6123 if (X86ISD::VSHLI == Opcode) {
6124 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6125 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6126 Mask[i + j] = i + j - ByteShift;
6128 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6129 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6130 Mask[i + j - ByteShift] = i + j;
6134 case ISD::ZERO_EXTEND_VECTOR_INREG:
6135 case X86ISD::VZEXT: {
6136 // TODO - add support for VPMOVZX with smaller input vector types.
6137 SDValue Src = N.getOperand(0);
6138 MVT SrcVT = Src.getSimpleValueType();
6139 if (NumSizeInBits != SrcVT.getSizeInBits())
6141 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6150 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
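/// For illustration (not from the original source): with Inputs = {A, B}, a
/// mask width of 4 and Mask = <4, 5, 6, 7>, input A is never referenced, so it
/// is dropped and the mask is rebased to <0, 1, 2, 3> over the remaining input B.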
6151 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6152 SmallVectorImpl<int> &Mask) {
6153 int MaskWidth = Mask.size();
6154 SmallVector<SDValue, 16> UsedInputs;
6155 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6156 int lo = UsedInputs.size() * MaskWidth;
6157 int hi = lo + MaskWidth;
6159 // Strip UNDEF input usage.
6160 if (Inputs[i].isUndef())
6162 if ((lo <= M) && (M < hi))
6163 M = SM_SentinelUndef;
6165 // Check for unused inputs.
6166 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6167 UsedInputs.push_back(Inputs[i]);
6174 Inputs = UsedInputs;
6177 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6178 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6179 /// remaining input indices in case we now have a unary shuffle and adjust the
6180 /// inputs accordingly.
6181 /// Returns true if the target shuffle mask was decoded.
6182 static bool resolveTargetShuffleInputs(SDValue Op,
6183 SmallVectorImpl<SDValue> &Inputs,
6184 SmallVectorImpl<int> &Mask,
6185 SelectionDAG &DAG) {
6186 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6187 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6190 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6194 /// Returns the scalar element that will make up the ith
6195 /// element of the result of the vector shuffle.
6196 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6199 return SDValue(); // Limit search depth.
6201 SDValue V = SDValue(N, 0);
6202 EVT VT = V.getValueType();
6203 unsigned Opcode = V.getOpcode();
6205 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6206 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6207 int Elt = SV->getMaskElt(Index);
6210 return DAG.getUNDEF(VT.getVectorElementType());
6212 unsigned NumElems = VT.getVectorNumElements();
6213 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6214 : SV->getOperand(1);
6215 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6218 // Recurse into target specific vector shuffles to find scalars.
6219 if (isTargetShuffle(Opcode)) {
6220 MVT ShufVT = V.getSimpleValueType();
6221 MVT ShufSVT = ShufVT.getVectorElementType();
6222 int NumElems = (int)ShufVT.getVectorNumElements();
6223 SmallVector<int, 16> ShuffleMask;
6224 SmallVector<SDValue, 16> ShuffleOps;
6227 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6230 int Elt = ShuffleMask[Index];
6231 if (Elt == SM_SentinelZero)
6232 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6233 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6234 if (Elt == SM_SentinelUndef)
6235 return DAG.getUNDEF(ShufSVT);
6237 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6238 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6239 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6243 // Actual nodes that may contain scalar elements
6244 if (Opcode == ISD::BITCAST) {
6245 V = V.getOperand(0);
6246 EVT SrcVT = V.getValueType();
6247 unsigned NumElems = VT.getVectorNumElements();
6249 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6253 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6254 return (Index == 0) ? V.getOperand(0)
6255 : DAG.getUNDEF(VT.getVectorElementType());
6257 if (V.getOpcode() == ISD::BUILD_VECTOR)
6258 return V.getOperand(Index);
6263 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6264 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6265 unsigned NumNonZero, unsigned NumZero,
6267 const X86Subtarget &Subtarget) {
6268 MVT VT = Op.getSimpleValueType();
6269 unsigned NumElts = VT.getVectorNumElements();
6270 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6271 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6272 "Illegal vector insertion");
6278 for (unsigned i = 0; i < NumElts; ++i) {
6279 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6283 // If the build vector contains zeros or our first insertion is not the
6284 // first index, then insert into a zero vector to break any register
6285 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6288 if (NumZero || 0 != i)
6289 V = getZeroVector(VT, Subtarget, DAG, dl);
6291 assert(0 == i && "Expected insertion into zero-index");
6292 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6293 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6294 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6295 V = DAG.getBitcast(VT, V);
6299 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6300 DAG.getIntPtrConstant(i, dl));
6306 /// Custom lower build_vector of v16i8.
6307 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6308 unsigned NumNonZero, unsigned NumZero,
6310 const X86Subtarget &Subtarget) {
6311 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6314 // SSE4.1 - use PINSRB to insert each byte directly.
6315 if (Subtarget.hasSSE41())
6316 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6323 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
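// Sketch of the idea (illustrative, not from the original source): bytes 2*i
// and 2*i+1 are zero-extended to i16 and merged as (byte[2*i+1] << 8) |
// byte[2*i], then inserted into a v8i16 at the pair index, so sixteen byte
// inserts become at most eight word inserts.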
6324 for (unsigned i = 0; i < 16; ++i) {
6325 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6326 if (ThisIsNonZero && First) {
6328 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6330 V = DAG.getUNDEF(MVT::v8i16);
6335 // FIXME: Investigate extending to i32 instead of just i16.
6336 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6337 SDValue ThisElt, LastElt;
6338 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6339 if (LastIsNonZero) {
6341 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6343 if (ThisIsNonZero) {
6344 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6345 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6346 DAG.getConstant(8, dl, MVT::i8));
6348 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6354 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6355 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6356 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6357 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6358 V = DAG.getBitcast(MVT::v8i16, V);
6360 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6361 DAG.getIntPtrConstant(i / 2, dl));
6367 return DAG.getBitcast(MVT::v16i8, V);
6370 /// Custom lower build_vector of v8i16.
6371 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6372 unsigned NumNonZero, unsigned NumZero,
6374 const X86Subtarget &Subtarget) {
6375 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6378 // Use PINSRW to insert each element directly.
6379 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6383 /// Custom lower build_vector of v4i32 or v4f32.
6384 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6385 const X86Subtarget &Subtarget) {
6386 // Find all zeroable elements.
6387 std::bitset<4> Zeroable;
6388 for (int i=0; i < 4; ++i) {
6389 SDValue Elt = Op->getOperand(i);
6390 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6392 assert(Zeroable.size() - Zeroable.count() > 1 &&
6393 "We expect at least two non-zero elements!");
6395 // We only know how to deal with build_vector nodes where elements are either
6396 // zeroable or extract_vector_elt with constant index.
6397 SDValue FirstNonZero;
6398 unsigned FirstNonZeroIdx;
6399 for (unsigned i=0; i < 4; ++i) {
6402 SDValue Elt = Op->getOperand(i);
6403 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6404 !isa<ConstantSDNode>(Elt.getOperand(1)))
6406 // Make sure that this node is extracting from a 128-bit vector.
6407 MVT VT = Elt.getOperand(0).getSimpleValueType();
6408 if (!VT.is128BitVector())
6410 if (!FirstNonZero.getNode()) {
6412 FirstNonZeroIdx = i;
6416 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6417 SDValue V1 = FirstNonZero.getOperand(0);
6418 MVT VT = V1.getSimpleValueType();
6420 // See if this build_vector can be lowered as a blend with zero.
6422 unsigned EltMaskIdx, EltIdx;
6424 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6425 if (Zeroable[EltIdx]) {
6426 // The zero vector will be on the right hand side.
6427 Mask[EltIdx] = EltIdx+4;
6431 Elt = Op->getOperand(EltIdx);
6432 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6433 EltMaskIdx = Elt.getConstantOperandVal(1);
6434 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6436 Mask[EltIdx] = EltIdx;
6440 // Let the shuffle legalizer deal with blend operations.
6441 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6442 if (V1.getSimpleValueType() != VT)
6443 V1 = DAG.getBitcast(VT, V1);
6444 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6447 // See if we can lower this build_vector to an INSERTPS.
6448 if (!Subtarget.hasSSE41())
6451 SDValue V2 = Elt.getOperand(0);
6452 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6455 bool CanFold = true;
6456 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6460 SDValue Current = Op->getOperand(i);
6461 SDValue SrcVector = Current->getOperand(0);
6464 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6470 assert(V1.getNode() && "Expected at least two non-zero elements!");
6471 if (V1.getSimpleValueType() != MVT::v4f32)
6472 V1 = DAG.getBitcast(MVT::v4f32, V1);
6473 if (V2.getSimpleValueType() != MVT::v4f32)
6474 V2 = DAG.getBitcast(MVT::v4f32, V2);
6476 // Ok, we can emit an INSERTPS instruction.
6477 unsigned ZMask = Zeroable.to_ulong();
6479 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
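// The INSERTPS immediate packs: bits [7:6] = source element of V2, bits [5:4]
// = destination element in V1, bits [3:0] = zero mask. Illustrative example
// (not from the original source): EltMaskIdx=2, EltIdx=1, ZMask=0b1000 gives
// 0x98, i.e. copy element 2 of V2 into element 1 of V1 and zero element 3.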
6480 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6482 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6483 DAG.getIntPtrConstant(InsertPSMask, DL));
6484 return DAG.getBitcast(VT, Result);
6487 /// Return a vector logical shift node.
6488 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6489 SelectionDAG &DAG, const TargetLowering &TLI,
6491 assert(VT.is128BitVector() && "Unknown type for VShift");
6492 MVT ShVT = MVT::v16i8;
6493 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6494 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6495 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6496 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6497 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6498 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6501 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6502 SelectionDAG &DAG) {
6504 // Check if the scalar load can be widened into a vector load. If the
6505 // address is "base + cst", also see if the cst can be "absorbed" into
6506 // the shuffle mask.
6507 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6508 SDValue Ptr = LD->getBasePtr();
6509 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6511 EVT PVT = LD->getValueType(0);
6512 if (PVT != MVT::i32 && PVT != MVT::f32)
6517 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6518 FI = FINode->getIndex();
6520 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6521 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6522 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6523 Offset = Ptr.getConstantOperandVal(1);
6524 Ptr = Ptr.getOperand(0);
6529 // FIXME: 256-bit vector instructions don't require strict alignment;
6530 // improve this code to support them better.
6531 unsigned RequiredAlign = VT.getSizeInBits()/8;
6532 SDValue Chain = LD->getChain();
6533 // Make sure the stack object alignment is at least 16 or 32.
6534 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6535 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6536 if (MFI.isFixedObjectIndex(FI)) {
6537 // Can't change the alignment. FIXME: It's possible to compute
6538 // the exact stack offset and reference FI + adjust offset instead.
6539 // If someone *really* cares about this, that's the way to implement it.
6542 MFI.setObjectAlignment(FI, RequiredAlign);
6546 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6547 // Ptr + (Offset & ~15).
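// Worked example (illustrative, not from the original source): for a v4f32
// splat of an f32 load at stack offset base+8, StartOffset = 8 & ~15 = 0, so
// a whole v4f32 is loaded from base+0 and EltNo = (8 - 0) >> 2 = 2, giving a
// <2,2,2,2> shuffle of that wide load.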
6550 if ((Offset % RequiredAlign) & 3)
6552 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6555 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6556 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6559 int EltNo = (Offset - StartOffset) >> 2;
6560 unsigned NumElems = VT.getVectorNumElements();
6562 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6563 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6564 LD->getPointerInfo().getWithOffset(StartOffset));
6566 SmallVector<int, 8> Mask(NumElems, EltNo);
6568 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6574 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6575 /// elements can be replaced by a single large load which has the same value as
6576 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6578 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6579 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6580 const SDLoc &DL, SelectionDAG &DAG,
6581 const X86Subtarget &Subtarget,
6582 bool isAfterLegalize) {
6583 unsigned NumElems = Elts.size();
6585 int LastLoadedElt = -1;
6586 SmallBitVector LoadMask(NumElems, false);
6587 SmallBitVector ZeroMask(NumElems, false);
6588 SmallBitVector UndefMask(NumElems, false);
6590 // For each element in the initializer, see if we've found a load, zero or an undef.
6592 for (unsigned i = 0; i < NumElems; ++i) {
6593 SDValue Elt = peekThroughBitcasts(Elts[i]);
6598 UndefMask[i] = true;
6599 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6601 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6604 // Each loaded element must be the correct fractional portion of the
6605 // requested vector load.
6606 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6611 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6612 "Incomplete element masks");
6614 // Handle Special Cases - all undef or undef/zero.
6615 if (UndefMask.count() == NumElems)
6616 return DAG.getUNDEF(VT);
6618 // FIXME: Should we return this as a BUILD_VECTOR instead?
6619 if ((ZeroMask | UndefMask).count() == NumElems)
6620 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6621 : DAG.getConstantFP(0.0, DL, VT);
6623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6624 int FirstLoadedElt = LoadMask.find_first();
6625 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6626 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6627 EVT LDBaseVT = EltBase.getValueType();
6629 // Consecutive loads can contain UNDEFs but not ZERO elements.
6630 // Consecutive loads with UNDEFs and ZERO elements require
6631 // an additional shuffle stage to clear the ZERO elements.
6632 bool IsConsecutiveLoad = true;
6633 bool IsConsecutiveLoadWithZeros = true;
6634 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6636 SDValue Elt = peekThroughBitcasts(Elts[i]);
6637 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6638 if (!DAG.areNonVolatileConsecutiveLoads(
6639 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6640 i - FirstLoadedElt)) {
6641 IsConsecutiveLoad = false;
6642 IsConsecutiveLoadWithZeros = false;
6645 } else if (ZeroMask[i]) {
6646 IsConsecutiveLoad = false;
6650 SmallVector<LoadSDNode *, 8> Loads;
6651 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6653 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6655 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6656 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6657 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6658 "Cannot merge volatile loads.");
6660 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6661 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6662 for (auto *LD : Loads)
6663 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6667 // LOAD - all consecutive load/undefs (must start/end with a load).
6668 // If we have found an entire vector of loads and undefs, then return a large
6669 // load of the entire vector width starting at the base pointer.
6670 // If the vector contains zeros, then attempt to shuffle those elements.
6671 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6672 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6673 assert(LDBase && "Did not find base load for merging consecutive loads");
6674 EVT EltVT = LDBase->getValueType(0);
6675 // Ensure that the input vector size for the merged loads matches the
6676 // cumulative size of the input elements.
6677 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6680 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6683 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6684 // will lower to regular temporal loads and use the cache.
6685 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6686 VT.is256BitVector() && !Subtarget.hasInt256())
6689 if (IsConsecutiveLoad)
6690 return CreateLoad(VT, LDBase);
6692 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6693 // vector and a zero vector to clear out the zero elements.
6694 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6695 SmallVector<int, 4> ClearMask(NumElems, -1);
6696 for (unsigned i = 0; i < NumElems; ++i) {
6698 ClearMask[i] = i + NumElems;
6699 else if (LoadMask[i])
6702 SDValue V = CreateLoad(VT, LDBase);
6703 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6704 : DAG.getConstantFP(0.0, DL, VT);
6705 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6710 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6712 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
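// Illustrative example (not from the original source): a v4i32 build_vector
// <load i32 *p, load i32 *(p+4), 0, 0> has LoadSize == 64, so it becomes a
// v2i64 X86ISD::VZEXT_LOAD of the 64 bits at p, bitcast back to v4i32.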
6713 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6714 (LoadSize == 32 || LoadSize == 64) &&
6715 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6716 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6717 : MVT::getIntegerVT(LoadSize);
6718 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6719 if (TLI.isTypeLegal(VecVT)) {
6720 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6721 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6723 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6724 LDBase->getPointerInfo(),
6725 LDBase->getAlignment(),
6726 MachineMemOperand::MOLoad);
6727 for (auto *LD : Loads)
6728 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6729 return DAG.getBitcast(VT, ResNode);
6736 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6737 unsigned SplatBitSize, LLVMContext &C) {
6738 unsigned ScalarSize = VT.getScalarSizeInBits();
6739 unsigned NumElm = SplatBitSize / ScalarSize;
6741 SmallVector<Constant *, 32> ConstantVec;
6742 for (unsigned i = 0; i < NumElm; i++) {
6743 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6745 if (VT.isFloatingPoint()) {
6746 if (ScalarSize == 32) {
6747 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6749 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6750 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6753 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6754 ConstantVec.push_back(Const);
6756 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6759 static bool isUseOfShuffle(SDNode *N) {
6760 for (auto *U : N->uses()) {
6761 if (isTargetShuffle(U->getOpcode()))
6763 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6764 return isUseOfShuffle(U);
6769 // Check if the current node of the build vector is a zero extended vector.
6770 // If so, return the value extended.
6771 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6772 // NumElt - return the number of zero extended identical values.
6773 // EltType - return the type of the value including the zero extend.
6774 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6775 unsigned &NumElt, MVT &EltType) {
6776 SDValue ExtValue = Op->getOperand(0);
6777 unsigned NumElts = Op->getNumOperands();
6778 unsigned Delta = NumElts;
6780 for (unsigned i = 1; i < NumElts; i++) {
6781 if (Op->getOperand(i) == ExtValue) {
6785 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6788 if (!isPowerOf2_32(Delta) || Delta == 1)
6791 for (unsigned i = Delta; i < NumElts; i++) {
6792 if (i % Delta == 0) {
6793 if (Op->getOperand(i) != ExtValue)
6795 } else if (!(isNullConstant(Op->getOperand(i)) ||
6796 Op->getOperand(i).isUndef()))
6799 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6800 unsigned ExtVTSize = EltSize * Delta;
6801 EltType = MVT::getIntegerVT(ExtVTSize);
6802 NumElt = NumElts / Delta;
6806 /// Attempt to use the vbroadcast instruction to generate a splat value
6807 /// from a splat BUILD_VECTOR which uses:
6808 /// a. A single scalar load, or a constant.
6809 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6811 /// The VBROADCAST node is returned when a pattern is found,
6812 /// or SDValue() otherwise.
6813 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6814 const X86Subtarget &Subtarget,
6815 SelectionDAG &DAG) {
6816 // VBROADCAST requires AVX.
6817 // TODO: Splats could be generated for non-AVX CPUs using SSE
6818 // instructions, but there's less potential gain for only 128-bit vectors.
6819 if (!Subtarget.hasAVX())
6822 MVT VT = BVOp->getSimpleValueType(0);
6825 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6826 "Unsupported vector type for broadcast.");
6828 BitVector UndefElements;
6829 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6831 // Attempt to use VBROADCASTM
6832 // From this pattern:
6833 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6834 // b. t1 = (build_vector t0 t0)
6836 // Create (VBROADCASTM v2i1 X)
6837 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6838 MVT EltType = VT.getScalarType();
6839 unsigned NumElts = VT.getVectorNumElements();
6841 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6842 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6843 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6844 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6846 BOperand = ZeroExtended.getOperand(0);
6848 BOperand = Ld.getOperand(0).getOperand(0);
6849 if (BOperand.getValueType().isVector() &&
6850 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6851 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6852 NumElts == 8)) || // for broadcastmb2q
6853 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6854 NumElts == 16))) { // for broadcastmw2d
6856 DAG.getNode(X86ISD::VBROADCASTM, dl,
6857 MVT::getVectorVT(EltType, NumElts), BOperand);
6858 return DAG.getBitcast(VT, Brdcst);
6864 // We need a splat of a single value to use broadcast, and it doesn't
6865 // make any sense if the value is only in one element of the vector.
6866 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6867 APInt SplatValue, Undef;
6868 unsigned SplatBitSize;
6870 // Check if this is a repeated constant pattern suitable for broadcasting.
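// Illustrative example (not from the original source): on a 64-bit AVX2
// target, a v8i32 build_vector <1,2,1,2,1,2,1,2> is a repeating 64-bit
// pattern (SplatBitSize == 64), so the combined i64 constant is loaded from
// the constant pool and broadcast (e.g. via VPBROADCASTQ) before being
// bitcast back to v8i32.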
6871 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6872 SplatBitSize > VT.getScalarSizeInBits() &&
6873 SplatBitSize < VT.getSizeInBits()) {
6874 // Avoid replacing with a broadcast when the value is used by a shuffle
6875 // instruction, to preserve the present custom lowering of shuffles.
6876 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6878 // replace BUILD_VECTOR with broadcast of the repeated constants.
6879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6880 LLVMContext *Ctx = DAG.getContext();
6881 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6882 if (Subtarget.hasAVX()) {
6883 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6884 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6885 // Splatted value can fit in one INTEGER constant in constant pool.
6886 // Load the constant and broadcast it.
6887 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6888 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6889 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6890 SDValue CP = DAG.getConstantPool(C, PVT);
6891 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6893 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6895 CVT, dl, DAG.getEntryNode(), CP,
6896 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6898 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6899 MVT::getVectorVT(CVT, Repeat), Ld);
6900 return DAG.getBitcast(VT, Brdcst);
6901 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6902 // Splatted value can fit in one FLOAT constant in constant pool.
6903 // Load the constant and broadcast it.
6904 // AVX has support for 32 and 64-bit broadcasts of floats only.
6905 // There is no 64-bit integer broadcast on 32-bit subtargets.
6906 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6907 // Lower the splat via APFloat directly, to avoid any conversion.
6910 ? ConstantFP::get(*Ctx,
6911 APFloat(APFloat::IEEEsingle(), SplatValue))
6912 : ConstantFP::get(*Ctx,
6913 APFloat(APFloat::IEEEdouble(), SplatValue));
6914 SDValue CP = DAG.getConstantPool(C, PVT);
6915 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6917 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6919 CVT, dl, DAG.getEntryNode(), CP,
6920 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6922 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6923 MVT::getVectorVT(CVT, Repeat), Ld);
6924 return DAG.getBitcast(VT, Brdcst);
6925 } else if (SplatBitSize > 64) {
6926 // Load the vector of constants and broadcast it.
6927 MVT CVT = VT.getScalarType();
6928 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6930 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6931 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6932 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6934 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6935 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6937 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6938 return DAG.getBitcast(VT, Brdcst);
6945 bool ConstSplatVal =
6946 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6948 // Make sure that all of the users of a non-constant load are from the
6949 // BUILD_VECTOR node.
6950 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6953 unsigned ScalarSize = Ld.getValueSizeInBits();
6954 bool IsGE256 = (VT.getSizeInBits() >= 256);
6956 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6957 // instruction to save 8 or more bytes of constant pool data.
6958 // TODO: If multiple splats are generated to load the same constant,
6959 // it may be detrimental to overall size. There needs to be a way to detect
6960 // that condition to know if this is truly a size win.
6961 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
6963 // Handle broadcasting a single constant scalar from the constant pool into a vector.
6965 // On Sandybridge (no AVX2), it is still better to load a constant vector
6966 // from the constant pool and not to broadcast it from a scalar.
6967 // But override that restriction when optimizing for size.
6968 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6969 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6970 EVT CVT = Ld.getValueType();
6971 assert(!CVT.isVector() && "Must not broadcast a vector type");
6973 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6974 // For size optimization, also splat v2f64 and v2i64, and for size opt
6975 // with AVX2, also splat i8 and i16.
6976 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6977 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6978 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6979 const Constant *C = nullptr;
6980 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6981 C = CI->getConstantIntValue();
6982 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6983 C = CF->getConstantFPValue();
6985 assert(C && "Invalid constant type");
6987 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6989 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6990 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6992 CVT, dl, DAG.getEntryNode(), CP,
6993 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6996 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7000 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7002 // Handle AVX2 in-register broadcasts.
7003 if (!IsLoad && Subtarget.hasInt256() &&
7004 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7005 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7007 // The scalar source must be a normal load.
7011 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7012 (Subtarget.hasVLX() && ScalarSize == 64))
7013 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7015 // The integer check is needed for the 64-bit into 128-bit case so it doesn't
7016 // match double, since there is no vbroadcastsd xmm.
7017 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7018 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7019 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7022 // Unsupported broadcast.
7026 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
7027 /// underlying vector and index.
7029 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7031 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7033 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7034 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7037 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already lowered
7039 // (extract_vector_elt (v8f32 %1), Constant<6>) into
7041 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7042 // (extract_subvector (v8f32 %0), Constant<4>),
7045 // In this case the vector is the extract_subvector expression and the index
7046 // is 2, as specified by the shuffle.
7047 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7048 SDValue ShuffleVec = SVOp->getOperand(0);
7049 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7050 assert(ShuffleVecVT.getVectorElementType() ==
7051 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7053 int ShuffleIdx = SVOp->getMaskElt(Idx);
7054 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7055 ExtractedFromVec = ShuffleVec;
7061 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7062 MVT VT = Op.getSimpleValueType();
7064 // Skip if insert_vec_elt is not supported.
7065 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7066 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7070 unsigned NumElems = Op.getNumOperands();
7074 SmallVector<unsigned, 4> InsertIndices;
7075 SmallVector<int, 8> Mask(NumElems, -1);
7077 for (unsigned i = 0; i != NumElems; ++i) {
7078 unsigned Opc = Op.getOperand(i).getOpcode();
7080 if (Opc == ISD::UNDEF)
7083 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7084 // Quit if more than one element needs inserting.
7085 if (InsertIndices.size() > 1)
7088 InsertIndices.push_back(i);
7092 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7093 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7095 // Quit if non-constant index.
7096 if (!isa<ConstantSDNode>(ExtIdx))
7098 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7100 // Quit if extracted from vector of different type.
7101 if (ExtractedFromVec.getValueType() != VT)
7104 if (!VecIn1.getNode())
7105 VecIn1 = ExtractedFromVec;
7106 else if (VecIn1 != ExtractedFromVec) {
7107 if (!VecIn2.getNode())
7108 VecIn2 = ExtractedFromVec;
7109 else if (VecIn2 != ExtractedFromVec)
7110 // Quit if more than 2 vectors to shuffle
7114 if (ExtractedFromVec == VecIn1)
7116 else if (ExtractedFromVec == VecIn2)
7117 Mask[i] = Idx + NumElems;
7120 if (!VecIn1.getNode())
7123 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7124 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7126 for (unsigned Idx : InsertIndices)
7127 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7128 DAG.getIntPtrConstant(Idx, DL));
7133 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7134 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7135 Op.getScalarValueSizeInBits() == 1 &&
7136 "Can not convert non-constant vector");
7137 uint64_t Immediate = 0;
7138 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7139 SDValue In = Op.getOperand(idx);
7141 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7144 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7145 return DAG.getConstant(Immediate, dl, VT);
7147 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7148 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7149 const X86Subtarget &Subtarget) {
7151 MVT VT = Op.getSimpleValueType();
7152 assert((VT.getVectorElementType() == MVT::i1) &&
7153 "Unexpected type in LowerBUILD_VECTORvXi1!");
7156 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7159 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7162 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7163 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7164 // Split the pieces.
7166 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7168 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7169 // We have to manually lower both halves so getNode doesn't try to
7170 // reassemble the build_vector.
7171 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7172 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7173 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7175 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7176 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7177 return DAG.getBitcast(VT, Imm);
7178 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7179 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7180 DAG.getIntPtrConstant(0, dl));
7183 // Vector has one or more non-const elements
7184 uint64_t Immediate = 0;
7185 SmallVector<unsigned, 16> NonConstIdx;
7186 bool IsSplat = true;
7187 bool HasConstElts = false;
7189 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7190 SDValue In = Op.getOperand(idx);
7193 if (!isa<ConstantSDNode>(In))
7194 NonConstIdx.push_back(idx);
7196 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7197 HasConstElts = true;
7201 else if (In != Op.getOperand(SplatIdx))
7205 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7207 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7208 DAG.getConstant(1, dl, VT),
7209 DAG.getConstant(0, dl, VT));
7211 // insert elements one by one
7215 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7216 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7218 else if (HasConstElts)
7219 Imm = DAG.getConstant(0, dl, VT);
7221 Imm = DAG.getUNDEF(VT);
7222 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7223 DstVec = DAG.getBitcast(VT, Imm);
7225 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7226 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7227 DAG.getIntPtrConstant(0, dl));
7230 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7231 unsigned InsertIdx = NonConstIdx[i];
7232 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7233 Op.getOperand(InsertIdx),
7234 DAG.getIntPtrConstant(InsertIdx, dl));
7239 /// \brief Return true if \p N implements a horizontal binop and return the
7240 /// operands for the horizontal binop into V0 and V1.
7242 /// This is a helper function of LowerToHorizontalOp().
7243 /// This function checks that the build_vector \p N in input implements a
7244 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7245 /// operation to match.
7246 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7247 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7248 /// is equal to ISD::SUB, then this function checks if this is a horizontal arithmetic sub.
7251 /// This function only analyzes elements of \p N whose indices are
7252 /// in range [BaseIdx, LastIdx).
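/// Illustrative example (not from the original source): for \p Opcode ==
/// ISD::ADD, a v4i32 build_vector
///   <(add (extract A,0),(extract A,1)), (add (extract A,2),(extract A,3)),
///    (add (extract B,0),(extract B,1)), (add (extract B,2),(extract B,3))>
/// matches with V0 = A and V1 = B, i.e. the element order of HADD(A, B).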
7253 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7255 unsigned BaseIdx, unsigned LastIdx,
7256 SDValue &V0, SDValue &V1) {
7257 EVT VT = N->getValueType(0);
7259 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7260 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7261 "Invalid Vector in input!");
7263 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7264 bool CanFold = true;
7265 unsigned ExpectedVExtractIdx = BaseIdx;
7266 unsigned NumElts = LastIdx - BaseIdx;
7267 V0 = DAG.getUNDEF(VT);
7268 V1 = DAG.getUNDEF(VT);
7270 // Check if N implements a horizontal binop.
7271 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7272 SDValue Op = N->getOperand(i + BaseIdx);
7275 if (Op->isUndef()) {
7276 // Update the expected vector extract index.
7277 if (i * 2 == NumElts)
7278 ExpectedVExtractIdx = BaseIdx;
7279 ExpectedVExtractIdx += 2;
7283 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7288 SDValue Op0 = Op.getOperand(0);
7289 SDValue Op1 = Op.getOperand(1);
7291 // Try to match the following pattern:
7292 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7293 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7294 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7295 Op0.getOperand(0) == Op1.getOperand(0) &&
7296 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7297 isa<ConstantSDNode>(Op1.getOperand(1)));
7301 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7302 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7304 if (i * 2 < NumElts) {
7306 V0 = Op0.getOperand(0);
7307 if (V0.getValueType() != VT)
7312 V1 = Op0.getOperand(0);
7313 if (V1.getValueType() != VT)
7316 if (i * 2 == NumElts)
7317 ExpectedVExtractIdx = BaseIdx;
7320 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7321 if (I0 == ExpectedVExtractIdx)
7322 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7323 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7324 // Try to match the following dag sequence:
7325 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7326 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7330 ExpectedVExtractIdx += 2;
7336 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7337 /// a concat_vector.
7339 /// This is a helper function of LowerToHorizontalOp().
7340 /// This function expects two 256-bit vectors called V0 and V1.
7341 /// At first, each vector is split into two separate 128-bit vectors.
7342 /// Then, the resulting 128-bit vectors are used to implement two
7343 /// horizontal binary operations.
7345 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7347 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7348 /// the two new horizontal binops.
7349 /// When Mode is set, the first horizontal binop dag node would take as input
7350 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7351 /// horizontal binop dag node would take as input the lower 128-bit of V1
7352 /// and the upper 128-bit of V1.
7354 /// HADD V0_LO, V0_HI
7355 /// HADD V1_LO, V1_HI
7357 /// Otherwise, the first horizontal binop dag node takes as input the lower
7358 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7359 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7361 /// HADD V0_LO, V1_LO
7362 /// HADD V0_HI, V1_HI
7364 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7365 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7366 /// the upper 128-bits of the result.
7367 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7368 const SDLoc &DL, SelectionDAG &DAG,
7369 unsigned X86Opcode, bool Mode,
7370 bool isUndefLO, bool isUndefHI) {
7371 MVT VT = V0.getSimpleValueType();
7372 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7373 "Invalid nodes in input!");
7375 unsigned NumElts = VT.getVectorNumElements();
7376 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7377 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7378 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7379 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7380 MVT NewVT = V0_LO.getSimpleValueType();
7382 SDValue LO = DAG.getUNDEF(NewVT);
7383 SDValue HI = DAG.getUNDEF(NewVT);
7386 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7387 if (!isUndefLO && !V0->isUndef())
7388 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7389 if (!isUndefHI && !V1->isUndef())
7390 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7392 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7393 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7394 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7396 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7397 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7400 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7403 /// Returns true iff \p BV builds a vector with the result equivalent to
7404 /// the result of an ADDSUB operation.
7405 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7406 /// are written to the parameters \p Opnd0 and \p Opnd1.
7407 static bool isAddSub(const BuildVectorSDNode *BV,
7408 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7409 SDValue &Opnd0, SDValue &Opnd1,
7410 unsigned &NumExtracts) {
7412 MVT VT = BV->getSimpleValueType(0);
7413 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7414 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7415 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7418 unsigned NumElts = VT.getVectorNumElements();
7419 SDValue InVec0 = DAG.getUNDEF(VT);
7420 SDValue InVec1 = DAG.getUNDEF(VT);
7424 // Odd-numbered elements in the input build vector are obtained from
7425 // adding two integer/float elements.
7426 // Even-numbered elements in the input build vector are obtained from
7427 // subtracting two integer/float elements.
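// Illustrative example (not from the original source): a v4f32 build_vector
//   <(fsub (extract A,0),(extract B,0)), (fadd (extract A,1),(extract B,1)),
//    (fsub (extract A,2),(extract B,2)), (fadd (extract A,3),(extract B,3))>
// is recognised with Opnd0 = A and Opnd1 = B, matching ADDSUBPS semantics.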
7428 unsigned ExpectedOpcode = ISD::FSUB;
7429 unsigned NextExpectedOpcode = ISD::FADD;
7430 bool AddFound = false;
7431 bool SubFound = false;
7433 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7434 SDValue Op = BV->getOperand(i);
7436 // Skip 'undef' values.
7437 unsigned Opcode = Op.getOpcode();
7438 if (Opcode == ISD::UNDEF) {
7439 std::swap(ExpectedOpcode, NextExpectedOpcode);
7443 // Early exit if we found an unexpected opcode.
7444 if (Opcode != ExpectedOpcode)
7447 SDValue Op0 = Op.getOperand(0);
7448 SDValue Op1 = Op.getOperand(1);
7450 // Try to match the following pattern:
7451 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7452 // Early exit if we cannot match that sequence.
7453 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7454 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7455 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7456 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7457 Op0.getOperand(1) != Op1.getOperand(1))
7460 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7464 // We found a valid add/sub node. Update the information accordingly.
7470 // Update InVec0 and InVec1.
7471 if (InVec0.isUndef()) {
7472 InVec0 = Op0.getOperand(0);
7473 if (InVec0.getSimpleValueType() != VT)
7476 if (InVec1.isUndef()) {
7477 InVec1 = Op1.getOperand(0);
7478 if (InVec1.getSimpleValueType() != VT)
7482 // Make sure that the operands of each add/sub node always
7483 // come from the same pair of vectors.
7484 if (InVec0 != Op0.getOperand(0)) {
7485 if (ExpectedOpcode == ISD::FSUB)
7488 // FADD is commutable. Try to commute the operands
7489 // and then test again.
7490 std::swap(Op0, Op1);
7491 if (InVec0 != Op0.getOperand(0))
7495 if (InVec1 != Op1.getOperand(0))
7498 // Update the pair of expected opcodes.
7499 std::swap(ExpectedOpcode, NextExpectedOpcode);
7501 // Increment the number of extractions done.
7505 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7506 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7514 /// Returns true if it is possible to fold MUL and an idiom that has already been
7515 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7516 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7517 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7519 /// Prior to calling this function it should be known that there is some
7520 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7521 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7522 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7523 /// of \p Opnd0 uses is expected to be equal to 2.
7524 /// For example, this function may be called for the following IR:
7525 /// %AB = fmul fast <2 x double> %A, %B
7526 /// %Sub = fsub fast <2 x double> %AB, %C
7527 /// %Add = fadd fast <2 x double> %AB, %C
7528 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7529 /// <2 x i32> <i32 0, i32 3>
7530 /// There is a def for %Addsub here, which potentially can be replaced by
7531 /// X86ISD::ADDSUB operation:
7532 /// %Addsub = X86ISD::ADDSUB %AB, %C
7533 /// and such ADDSUB can further be replaced with FMADDSUB:
7534 /// %Addsub = FMADDSUB %A, %B, %C.
7536 /// The main reason why this method is called before the replacement of the
7537 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7538 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
7540 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7542 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7543 unsigned ExpectedUses) {
7544 if (Opnd0.getOpcode() != ISD::FMUL ||
7545 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7548 // FIXME: These checks must match the similar ones in
7549 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7550 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7551 // or MUL + ADDSUB to FMADDSUB.
7552 const TargetOptions &Options = DAG.getTarget().Options;
7554 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7559 Opnd1 = Opnd0.getOperand(1);
7560 Opnd0 = Opnd0.getOperand(0);
7565 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7566 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7567 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7568 const X86Subtarget &Subtarget,
7569 SelectionDAG &DAG) {
7570 SDValue Opnd0, Opnd1;
7571 unsigned NumExtracts;
7572 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
7575 MVT VT = BV->getSimpleValueType(0);
7578 // Try to generate X86ISD::FMADDSUB node here.
7580 // TODO: According to coverage reports, the FMADDSUB transform is not
7581 // triggered by any tests.
7582 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
7583 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7585 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7586 // the ADDSUB idiom has been successfully recognized. There are no known
7587 // X86 targets with 512-bit ADDSUB instructions!
7588 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom recognition.
7590 if (VT.is512BitVector())
7593 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7596 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7597 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7598 const X86Subtarget &Subtarget,
7599 SelectionDAG &DAG) {
7600 MVT VT = BV->getSimpleValueType(0);
7601 unsigned NumElts = VT.getVectorNumElements();
7602 unsigned NumUndefsLO = 0;
7603 unsigned NumUndefsHI = 0;
7604 unsigned Half = NumElts/2;
7606 // Count the number of UNDEF operands in the input build_vector.
7607 for (unsigned i = 0, e = Half; i != e; ++i)
7608 if (BV->getOperand(i)->isUndef())
7611 for (unsigned i = Half, e = NumElts; i != e; ++i)
7612 if (BV->getOperand(i)->isUndef())
7615 // Early exit if this is either a build_vector of all UNDEFs or if all the
7616 // operands but one are UNDEF.
7617 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7621 SDValue InVec0, InVec1;
7622 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7623 // Try to match an SSE3 float HADD/HSUB.
7624 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7625 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7627 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7628 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7629 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7630 // Try to match an SSSE3 integer HADD/HSUB.
7631 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7632 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7634 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7635 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7638 if (!Subtarget.hasAVX())
7641 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7642 // Try to match an AVX horizontal add/sub of packed single/double
7643 // precision floating point values from 256-bit vectors.
7644 SDValue InVec2, InVec3;
7645 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7646 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7647 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7648 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7649 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7651 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7652 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7653 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7654 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7655 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7656 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7657 // Try to match an AVX2 horizontal add/sub of signed integers.
7658 SDValue InVec2, InVec3;
7660 bool CanFold = true;
7662 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7663 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7664 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7665 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7666 X86Opcode = X86ISD::HADD;
7667 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7668 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7669 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7670 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7671 X86Opcode = X86ISD::HSUB;
7676 // Fold this build_vector into a single horizontal add/sub.
7677 // Do this only if the target has AVX2.
7678 if (Subtarget.hasAVX2())
7679 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7681 // Do not try to expand this build_vector into a pair of horizontal
7682 // add/sub if we can emit a pair of scalar add/sub.
7683 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7686 // Convert this build_vector into a pair of horizontal binops followed by a concat vector.
7688 bool isUndefLO = NumUndefsLO == Half;
7689 bool isUndefHI = NumUndefsHI == Half;
7690 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7691 isUndefLO, isUndefHI);
7695 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7696 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7698 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7699 X86Opcode = X86ISD::HADD;
7700 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7701 X86Opcode = X86ISD::HSUB;
7702 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7703 X86Opcode = X86ISD::FHADD;
7704 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7705 X86Opcode = X86ISD::FHSUB;
7709 // Don't try to expand this build_vector into a pair of horizontal add/sub
7710 // if we can simply emit a pair of scalar add/sub.
7711 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7714 // Convert this build_vector into two horizontal add/sub followed by a concat vector.
7716 bool isUndefLO = NumUndefsLO == Half;
7717 bool isUndefHI = NumUndefsHI == Half;
7718 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7719 isUndefLO, isUndefHI);
7725 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7726 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
7727 /// just apply the bit operation to the vectors.
7728 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7729 /// from this, but enough scalar bit operations are created from the later
7730 /// legalization + scalarization stages to need basic support.
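/// Illustrative example (not from the original source): a v4i32 build_vector
///   <(xor a, 1), (xor b, 2), (xor c, 4), (xor d, 8)>
/// becomes (xor (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)), so only
/// the variable operands still need a real build_vector lowering.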
7731 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7732 SelectionDAG &DAG) {
7734 MVT VT = Op->getSimpleValueType(0);
7735 unsigned NumElems = VT.getVectorNumElements();
7736 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7738 // Check that all elements have the same opcode.
7739 // TODO: Should we allow UNDEFS and if so how many?
7740 unsigned Opcode = Op->getOperand(0).getOpcode();
7741 for (unsigned i = 1; i < NumElems; ++i)
7742 if (Opcode != Op->getOperand(i).getOpcode())
7745 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7752 // Don't do this if the buildvector is a splat - we'd replace one
7753 // constant with an entire vector.
7754 if (Op->getSplatValue())
7756 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7761 SmallVector<SDValue, 4> LHSElts, RHSElts;
7762 for (SDValue Elt : Op->ops()) {
7763 SDValue LHS = Elt.getOperand(0);
7764 SDValue RHS = Elt.getOperand(1);
7766 // We expect the canonicalized RHS operand to be the constant.
7767 if (!isa<ConstantSDNode>(RHS))
7769 LHSElts.push_back(LHS);
7770 RHSElts.push_back(RHS);
7773 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7774 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7775 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7778 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7779 /// functionality to do this, so it's all zeros, all ones, or some derivation
7780 /// that is cheap to calculate.
7781 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7782 const X86Subtarget &Subtarget) {
7784 MVT VT = Op.getSimpleValueType();
7786 // Vectors containing all zeros can be matched by pxor and xorps.
7787 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7788 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7789 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7790 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7793 return getZeroVector(VT, Subtarget, DAG, DL);
7796 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7797 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7798 // vpcmpeqd on 256-bit vectors.
7799 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7800 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7801 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7804 return getOnesVector(VT, DAG, DL);
7810 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7811 // reasoned to be a permutation of a vector by indices in a non-constant vector.
7812 // (build_vector (extract_elt V, (extract_elt I, 0)),
7813 // (extract_elt V, (extract_elt I, 1)),
7818 // TODO: Handle undefs
7819 // TODO: Utilize pshufb and zero mask blending to support more efficient
7820 // construction of vectors with constant-0 elements.
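// For example (informal), a v4f32 build_vector of this form with AVX can be
// lowered to a single VPERMILPV of V using I as the variable control vector.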
7822 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7823 const X86Subtarget &Subtarget) {
7824 // Look for VPERMV/VPERMILPV/PSHUFB opportunities.
7825 auto LegalPermuteOpcode = [&Subtarget](MVT DstVT, MVT &ShuffleVT) {
7826 unsigned Opcode = 0;
7827 switch (DstVT.SimpleTy) {
7831 if (Subtarget.hasSSE3())
7832 Opcode = X86ISD::PSHUFB;
7835 if (Subtarget.hasVLX() && Subtarget.hasBWI())
7836 Opcode = X86ISD::VPERMV;
7837 else if (Subtarget.hasSSE3()) {
7838 Opcode = X86ISD::PSHUFB;
7839 ShuffleVT = MVT::v16i8;
7844 if (Subtarget.hasAVX()) {
7845 Opcode = X86ISD::VPERMILPV;
7846 ShuffleVT = MVT::v4f32;
7847 } else if (Subtarget.hasSSE3()) {
7848 Opcode = X86ISD::PSHUFB;
7849 ShuffleVT = MVT::v16i8;
7854 if (Subtarget.hasAVX()) {
7855 Opcode = X86ISD::VPERMILPV;
7856 ShuffleVT = MVT::v2f64;
7861 if (Subtarget.hasAVX2())
7862 Opcode = X86ISD::VPERMV;
7866 if (Subtarget.hasVLX())
7867 Opcode = X86ISD::VPERMV;
7868 else if (Subtarget.hasAVX2()) {
7869 Opcode = X86ISD::VPERMV;
7870 ShuffleVT = MVT::v8f32;
7877 if (Subtarget.hasAVX512())
7878 Opcode = X86ISD::VPERMV;
7881 if (Subtarget.hasBWI())
7882 Opcode = X86ISD::VPERMV;
7885 if (Subtarget.hasVLX() && Subtarget.hasBWI())
7886 Opcode = X86ISD::VPERMV;
7889 if (Subtarget.hasVBMI())
7890 Opcode = X86ISD::VPERMV;
7893 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
7894 Opcode = X86ISD::VPERMV;
7900 SDValue SrcVec, IndicesVec;
7901 // Check for a match of the permute source vector and permute index elements.
7902 // This is done by checking that the i-th build_vector operand is of the form:
7903 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7904 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7905 SDValue Op = V.getOperand(Idx);
7906 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7909 // If this is the first extract encountered in V, set the source vector,
7910 // otherwise verify the extract is from the previously defined source vector.
7913 SrcVec = Op.getOperand(0);
7914 else if (SrcVec != Op.getOperand(0))
7916 SDValue ExtractedIndex = Op->getOperand(1);
7917 // Peek through extends.
7918 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7919 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7920 ExtractedIndex = ExtractedIndex.getOperand(0);
7921 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7924 // If this is the first extract from the index vector candidate, set the
7925 // indices vector, otherwise verify the extract is from the previously
7926 // defined indices vector.
7928 IndicesVec = ExtractedIndex.getOperand(0);
7929 else if (IndicesVec != ExtractedIndex.getOperand(0))
7932 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7933 if (!PermIdx || PermIdx->getZExtValue() != Idx)
7937 MVT VT = V.getSimpleValueType();
7939 unsigned Opcode = LegalPermuteOpcode(VT, ShuffleVT);
7942 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
7943 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
7944 "Illegal variable permute shuffle type");
7946 unsigned NumElts = VT.getVectorNumElements();
7947 if (IndicesVec.getValueType().getVectorNumElements() < NumElts)
7949 else if (IndicesVec.getValueType().getVectorNumElements() > NumElts) {
7950 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
7951 NumElts * VT.getScalarSizeInBits());
7954 MVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger().getSimpleVT();
7955 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7957 if (SrcVec.getValueSizeInBits() > VT.getSizeInBits())
7959 else if (SrcVec.getValueSizeInBits() < VT.getSizeInBits()) {
7961 DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(SrcVec), VT, DAG.getUNDEF(VT),
7962 SrcVec, DAG.getIntPtrConstant(0, SDLoc(SrcVec)));
7965 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
7967 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
7968 unsigned ShuffleBits = ShuffleVT.getScalarSizeInBits();
7969 uint64_t IndexScale = 0;
7970 uint64_t IndexOffset = 0;
7972 // If we're scaling a smaller permute op, then we need to repeat the indices,
7973 // scaling and offsetting them as well.
7974 // e.g. v4i32 -> v16i8 (Scale = 4)
7975 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
7976 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
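// e.g. each original v4i32 index j then becomes the four byte indices
// 4*j + 0 .. 4*j + 3 once the MUL/ADD below are applied and the result is
// bitcast to v16i8 (informal sketch).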
7977 for (uint64_t i = 0; i != Scale; ++i) {
7978 IndexScale |= Scale << (i * ShuffleBits);
7979 IndexOffset |= i << (i * ShuffleBits);
7982 SDLoc DL(IndicesVec);
7983 IndicesVec = DAG.getNode(ISD::MUL, DL, IndicesVT, IndicesVec,
7984 DAG.getConstant(IndexScale, DL, IndicesVT));
7985 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec,
7986 DAG.getConstant(IndexOffset, DL, IndicesVT));
7989 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
7990 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
7992 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
7994 Opcode == X86ISD::VPERMV
7995 ? DAG.getNode(Opcode, SDLoc(V), ShuffleVT, IndicesVec, SrcVec)
7996 : DAG.getNode(Opcode, SDLoc(V), ShuffleVT, SrcVec, IndicesVec);
7997 return DAG.getBitcast(VT, Res);
8001 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8004 MVT VT = Op.getSimpleValueType();
8005 MVT EltVT = VT.getVectorElementType();
8006 unsigned NumElems = Op.getNumOperands();
8008 // Lower BUILD_VECTORs of i1 predicate vectors via the dedicated vXi1 path.
8009 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8010 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8012 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8013 return VectorConstant;
8015 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8016 // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
8018 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8020 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8021 return HorizontalOp;
8022 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8024 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8027 unsigned EVTBits = EltVT.getSizeInBits();
8029 unsigned NumZero = 0;
8030 unsigned NumNonZero = 0;
8031 uint64_t NonZeros = 0;
8032 bool IsAllConstants = true;
8033 SmallSet<SDValue, 8> Values;
8034 unsigned NumConstants = NumElems;
8035 for (unsigned i = 0; i < NumElems; ++i) {
8036 SDValue Elt = Op.getOperand(i);
8040 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8041 IsAllConstants = false;
8044 if (X86::isZeroNode(Elt))
8047 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8048 NonZeros |= ((uint64_t)1 << i);
8053 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8054 if (NumNonZero == 0)
8055 return DAG.getUNDEF(VT);
8057 // If we are inserting one variable into a vector of non-zero constants, try
8058 // to avoid loading each constant element as a scalar. Load the constants as a
8059 // vector and then insert the variable scalar element. If insertion is not
8060 // supported, we assume that we will fall back to a shuffle to get the scalar
8061 // blended with the constants. Insertion into a zero vector is handled as a
8062 // special-case somewhere below here.
8063 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8064 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8065 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8066 // Create an all-constant vector. The variable element in the old
8067 // build vector is replaced by undef in the constant vector. Save the
8068 // variable scalar element and its index for use in the insertelement.
8069 LLVMContext &Context = *DAG.getContext();
8070 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8071 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8074 for (unsigned i = 0; i != NumElems; ++i) {
8075 SDValue Elt = Op.getOperand(i);
8076 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8077 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8078 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8079 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8080 else if (!Elt.isUndef()) {
8081 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8082 "Expected one variable element in this vector");
8084 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8087 Constant *CV = ConstantVector::get(ConstVecOps);
8088 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8090 // The constants we just created may not be legal (e.g., floating point). We
8091 // must lower the vector right here because we cannot guarantee that we'll
8092 // legalize it before loading it. This is also why we could not just create
8093 // a new build vector here. If the build vector contains illegal constants,
8094 // it could get split back up into a series of insert elements.
8095 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8096 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8097 MachineFunction &MF = DAG.getMachineFunction();
8098 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8099 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8100 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8103 // Special case for single non-zero, non-undef, element.
8104 if (NumNonZero == 1) {
8105 unsigned Idx = countTrailingZeros(NonZeros);
8106 SDValue Item = Op.getOperand(Idx);
8108 // If we have a constant or non-constant insertion into the low element of
8109 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8110 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8111 // depending on what the source datatype is.
8114 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8116 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8117 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8118 assert((VT.is128BitVector() || VT.is256BitVector() ||
8119 VT.is512BitVector()) &&
8120 "Expected an SSE value type!");
8121 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8122 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8123 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8126 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
8128 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8129 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8130 if (VT.getSizeInBits() >= 256) {
8131 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8132 if (Subtarget.hasAVX()) {
8133 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8134 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8136 // Without AVX, we need to extend to a 128-bit vector and then
8137 // insert into the 256-bit vector.
8138 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8139 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8140 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8143 assert(VT.is128BitVector() && "Expected an SSE value type!");
8144 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8145 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8147 return DAG.getBitcast(VT, Item);
8151 // Is it a vector logical left shift?
8152 if (NumElems == 2 && Idx == 1 &&
8153 X86::isZeroNode(Op.getOperand(0)) &&
8154 !X86::isZeroNode(Op.getOperand(1))) {
8155 unsigned NumBits = VT.getSizeInBits();
8156 return getVShift(true, VT,
8157 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8158 VT, Op.getOperand(1)),
8159 NumBits/2, DAG, *this, dl);
8162 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8165 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8166 // is a non-constant being inserted into an element other than the low one,
8167 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8168 // movd/movss) to move this into the low element, then shuffle it into place.
8170 if (EVTBits == 32) {
8171 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8172 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8176 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8177 if (Values.size() == 1) {
8178 if (EVTBits == 32) {
8179 // Instead of a shuffle like this:
8180 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8181 // Check if it's possible to issue this instead.
8182 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8183 unsigned Idx = countTrailingZeros(NonZeros);
8184 SDValue Item = Op.getOperand(Idx);
8185 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8186 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8191 // A vector full of immediates; various special cases are already
8192 // handled, so this is best done with a single constant-pool load.
8196 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8199 // See if we can use a vector load to get all of the elements.
8201 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8203 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8207 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8208 // build_vector and broadcast it.
8209 // TODO: We could probably generalize this more.
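// e.g. the v8i32 pattern <a, b, a, b, a, b, a, b> is built as the v4i32 vector
// <a, b, undef, undef>, bitcast to v2i64 and then broadcast (informal).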
8210 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8211 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8212 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8213 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8214 // Make sure all the even/odd operands match.
8215 for (unsigned i = 2; i != NumElems; ++i)
8216 if (Ops[i % 2] != Op.getOperand(i))
8220 if (CanSplat(Op, NumElems, Ops)) {
8221 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8222 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8223 // Create a new build vector and cast to v2i64/v2f64.
8224 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8225 DAG.getBuildVector(NarrowVT, dl, Ops));
8226 // Broadcast from v2i64/v2f64 and cast to final VT.
8227 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
8228 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8233 // For AVX-length vectors, build the individual 128-bit pieces and use
8234 // shuffles to put them in place.
8235 if (VT.getSizeInBits() > 128) {
8236 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
8238 // Build both the lower and upper subvector.
8240 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8241 SDValue Upper = DAG.getBuildVector(
8242 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8244 // Recreate the wider vector with the lower and upper part.
8245 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
8246 VT.getSizeInBits() / 2);
8249 // Let legalizer expand 2-wide build_vectors.
8250 if (EVTBits == 64) {
8251 if (NumNonZero == 1) {
8252 // One half is zero or undef.
8253 unsigned Idx = countTrailingZeros(NonZeros);
8254 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8255 Op.getOperand(Idx));
8256 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8261 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8262 if (EVTBits == 8 && NumElems == 16)
8263 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8267 if (EVTBits == 16 && NumElems == 8)
8268 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8272 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8273 if (EVTBits == 32 && NumElems == 4)
8274 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8277 // If element VT is == 32 bits, turn it into a number of shuffles.
8278 if (NumElems == 4 && NumZero > 0) {
8279 SmallVector<SDValue, 8> Ops(NumElems);
8280 for (unsigned i = 0; i < 4; ++i) {
8281 bool isZero = !(NonZeros & (1ULL << i));
8283 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8285 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8288 for (unsigned i = 0; i < 2; ++i) {
8289 switch ((NonZeros >> (i*2)) & 0x3) {
8290 default: llvm_unreachable("Unexpected NonZero count");
8292 Ops[i] = Ops[i*2]; // Must be a zero vector.
8295 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8298 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8301 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8306 bool Reverse1 = (NonZeros & 0x3) == 2;
8307 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8311 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8312 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8314 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8317 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8319 // Check for a build vector formed mostly from a shuffle plus a few inserted elements.
8320 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8323 // For SSE 4.1, use insertps to put the high elements into the low element.
8324 if (Subtarget.hasSSE41()) {
8326 if (!Op.getOperand(0).isUndef())
8327 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8329 Result = DAG.getUNDEF(VT);
8331 for (unsigned i = 1; i < NumElems; ++i) {
8332 if (Op.getOperand(i).isUndef()) continue;
8333 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8334 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8339 // Otherwise, expand into a number of unpckl*, start by extending each of
8340 // our (non-undef) elements to the full vector width with the element in the
8341 // bottom slot of the vector (which generates no code for SSE).
8342 SmallVector<SDValue, 8> Ops(NumElems);
8343 for (unsigned i = 0; i < NumElems; ++i) {
8344 if (!Op.getOperand(i).isUndef())
8345 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8347 Ops[i] = DAG.getUNDEF(VT);
8350 // Next, we iteratively mix elements, e.g. for v4f32:
8351 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8352 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8353 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8354 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8355 // Generate scaled UNPCKL shuffle mask.
8356 SmallVector<int, 16> Mask;
8357 for(unsigned i = 0; i != Scale; ++i)
8359 for (unsigned i = 0; i != Scale; ++i)
8360 Mask.push_back(NumElems+i);
8361 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8363 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8364 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8369 // 256-bit AVX can use the vinsertf128 instruction
8370 // to create 256-bit vectors from two other 128-bit ones.
8371 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8373 MVT ResVT = Op.getSimpleValueType();
8375 assert((ResVT.is256BitVector() ||
8376 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8378 SDValue V1 = Op.getOperand(0);
8379 SDValue V2 = Op.getOperand(1);
8380 unsigned NumElems = ResVT.getVectorNumElements();
8381 if (ResVT.is256BitVector())
8382 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8384 if (Op.getNumOperands() == 4) {
8385 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8386 ResVT.getVectorNumElements()/2);
8387 SDValue V3 = Op.getOperand(2);
8388 SDValue V4 = Op.getOperand(3);
8389 return concat256BitVectors(
8390 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8391 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8394 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8397 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8398 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8399 static bool isExpandWithZeros(const SDValue &Op) {
8400 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8401 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8403 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8404 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8410 // If the given node is a type promotion (by concatenating i1 zeros) of the result
8411 // of a node that already zeros all upper bits of the k-register, returns that node.
8413 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8414 unsigned Opc = Op.getOpcode();
8416 assert(Opc == ISD::CONCAT_VECTORS &&
8417 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8418 "Unexpected node to check for type promotion!");
8420 // As long as we are concatenating zeros to the upper part of a previous node
8421 // result, climb up the tree until a node with a different opcode is encountered.
8423 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8424 if (Opc == ISD::INSERT_SUBVECTOR) {
8425 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8426 Op.getConstantOperandVal(2) == 0)
8427 Op = Op.getOperand(1);
8430 } else { // Opc == ISD::CONCAT_VECTORS
8431 if (isExpandWithZeros(Op))
8432 Op = Op.getOperand(0);
8436 Opc = Op.getOpcode();
8439 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8440 // of a node that zeros the upper bits (its masked version).
8441 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8442 (Op.getOpcode() == ISD::AND &&
8443 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8444 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8451 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8452 const X86Subtarget &Subtarget,
8453 SelectionDAG & DAG) {
8455 MVT ResVT = Op.getSimpleValueType();
8456 unsigned NumOperands = Op.getNumOperands();
8458 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8459 "Unexpected number of operands in CONCAT_VECTORS");
8461 // If this node promotes - by concatenating zeroes - the type of the result
8462 // of a node whose instruction already zeroes all upper (irrelevant) bits of the
8463 // output register, mark it as legal and catch the pattern in instruction
8464 // selection to avoid emitting extra instructions (for zeroing upper bits).
8465 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8466 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8467 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8468 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8472 unsigned NumZero = 0;
8473 unsigned NumNonZero = 0;
8474 uint64_t NonZeros = 0;
8475 for (unsigned i = 0; i != NumOperands; ++i) {
8476 SDValue SubVec = Op.getOperand(i);
8477 if (SubVec.isUndef())
8479 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8482 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8483 NonZeros |= (uint64_t)1 << i;
8489 // If there are zero or one non-zeros we can handle this very simply.
8490 if (NumNonZero <= 1) {
8491 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8492 : DAG.getUNDEF(ResVT);
8495 unsigned Idx = countTrailingZeros(NonZeros);
8496 SDValue SubVec = Op.getOperand(Idx);
8497 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8498 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8499 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8502 if (NumOperands > 2) {
8503 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8504 ResVT.getVectorNumElements()/2);
8505 ArrayRef<SDUse> Ops = Op->ops();
8506 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8507 Ops.slice(0, NumOperands/2));
8508 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8509 Ops.slice(NumOperands/2));
8510 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8513 assert(NumNonZero == 2 && "Simple cases not handled?");
8515 if (ResVT.getVectorNumElements() >= 16)
8516 return Op; // The operation is legal with KUNPCK
8518 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8519 DAG.getUNDEF(ResVT), Op.getOperand(0),
8520 DAG.getIntPtrConstant(0, dl));
8521 unsigned NumElems = ResVT.getVectorNumElements();
8522 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8523 DAG.getIntPtrConstant(NumElems/2, dl));
8526 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8527 const X86Subtarget &Subtarget,
8528 SelectionDAG &DAG) {
8529 MVT VT = Op.getSimpleValueType();
8530 if (VT.getVectorElementType() == MVT::i1)
8531 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8533 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8534 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8535 Op.getNumOperands() == 4)));
8537 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8538 // from two other 128-bit ones.
8540 // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8541 return LowerAVXCONCAT_VECTORS(Op, DAG);
8544 //===----------------------------------------------------------------------===//
8545 // Vector shuffle lowering
8547 // This is an experimental code path for lowering vector shuffles on x86. It is
8548 // designed to handle arbitrary vector shuffles and blends, gracefully
8549 // degrading performance as necessary. It works hard to recognize idiomatic
8550 // shuffles and lower them to optimal instruction patterns without leaving
8551 // a framework that allows reasonably efficient handling of all vector shuffle operations.
8553 //===----------------------------------------------------------------------===//
8555 /// \brief Tiny helper function to identify a no-op mask.
8557 /// This is a somewhat boring predicate function. It checks whether the mask
8558 /// array input, which is assumed to be a single-input shuffle mask of the kind
8559 /// used by the X86 shuffle instructions (not a fully general
8560 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
8561 /// in-place shuffle are 'no-op's.
8562 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8563 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8564 assert(Mask[i] >= -1 && "Out of bound mask element!");
8565 if (Mask[i] >= 0 && Mask[i] != i)
8571 /// \brief Test whether there are elements crossing 128-bit lanes in this shuffle mask.
8574 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8575 /// and we routinely test for these.
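/// For example (informal), the v8f32 mask <0, 1, 2, 3, 4, 5, 6, 7> stays
/// within its 128-bit lanes, while <4, 5, 6, 7, 0, 1, 2, 3> crosses them.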
8576 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8577 int LaneSize = 128 / VT.getScalarSizeInBits();
8578 int Size = Mask.size();
8579 for (int i = 0; i < Size; ++i)
8580 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8585 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8587 /// This checks a shuffle mask to see if it is performing the same
8588 /// lane-relative shuffle in each sub-lane. This trivially implies
8589 /// that it is also not lane-crossing. It may however involve a blend from the
8590 /// same lane of a second vector.
8592 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8593 /// non-trivial to compute in the face of undef lanes. The representation is
8594 /// suitable for use with existing 128-bit shuffles as entries from the second
8595 /// vector have been remapped to [LaneSize, 2*LaneSize).
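/// For example (informal), the v8f32 mask <0, 0, 2, 2, 4, 4, 6, 6> repeats the
/// per-lane mask <0, 0, 2, 2> and is accepted here.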
8596 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8598 SmallVectorImpl<int> &RepeatedMask) {
8599 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8600 RepeatedMask.assign(LaneSize, -1);
8601 int Size = Mask.size();
8602 for (int i = 0; i < Size; ++i) {
8603 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8606 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8607 // This entry crosses lanes, so there is no way to model this shuffle.
8610 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8611 // Adjust second vector indices to start at LaneSize instead of Size.
8612 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8613 : Mask[i] % LaneSize + LaneSize;
8614 if (RepeatedMask[i % LaneSize] < 0)
8615 // This is the first non-undef entry in this slot of a 128-bit lane.
8616 RepeatedMask[i % LaneSize] = LocalM;
8617 else if (RepeatedMask[i % LaneSize] != LocalM)
8618 // Found a mismatch with the repeated mask.
8624 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8626 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8627 SmallVectorImpl<int> &RepeatedMask) {
8628 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8631 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8633 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8634 SmallVectorImpl<int> &RepeatedMask) {
8635 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8638 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8639 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8640 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8642 SmallVectorImpl<int> &RepeatedMask) {
8643 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8644 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8645 int Size = Mask.size();
8646 for (int i = 0; i < Size; ++i) {
8647 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8648 if (Mask[i] == SM_SentinelUndef)
8650 if (Mask[i] == SM_SentinelZero) {
8651 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8653 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8656 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8657 // This entry crosses lanes, so there is no way to model this shuffle.
8660 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8661 // Adjust second vector indices to start at LaneSize instead of Size.
8663 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8664 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8665 // This is the first non-undef entry in this slot of a 128-bit lane.
8666 RepeatedMask[i % LaneSize] = LocalM;
8667 else if (RepeatedMask[i % LaneSize] != LocalM)
8668 // Found a mismatch with the repeated mask.
8674 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of arguments.
8677 /// This is a fast way to test a shuffle mask against a fixed pattern:
8679 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8681 /// It returns true if the mask is exactly as wide as the argument list, and
8682 /// each element of the mask is either -1 (signifying undef) or the value given
8683 /// in the argument.
8684 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8685 ArrayRef<int> ExpectedMask) {
8686 if (Mask.size() != ExpectedMask.size())
8689 int Size = Mask.size();
8691 // If the values are build vectors, we can look through them to find
8692 // equivalent inputs that make the shuffles equivalent.
8693 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8694 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8696 for (int i = 0; i < Size; ++i) {
8697 assert(Mask[i] >= -1 && "Out of bound mask element!");
8698 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8699 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8700 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8701 if (!MaskBV || !ExpectedBV ||
8702 MaskBV->getOperand(Mask[i] % Size) !=
8703 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8711 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8713 /// The masks must be exactly the same width.
8715 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8716 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8718 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8719 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8720 ArrayRef<int> ExpectedMask) {
8721 int Size = Mask.size();
8722 if (Size != (int)ExpectedMask.size())
8725 for (int i = 0; i < Size; ++i)
8726 if (Mask[i] == SM_SentinelUndef)
8728 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8730 else if (Mask[i] != ExpectedMask[i])
8736 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle mask.
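// e.g. Mask <0, 5, -1, 3> with element 1 zeroable becomes
// <0, SM_SentinelZero, SM_SentinelUndef, 3> (informal sketch).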
8738 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8739 const APInt &Zeroable) {
8740 int NumElts = Mask.size();
8741 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8743 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8744 for (int i = 0; i != NumElts; ++i) {
8746 if (M == SM_SentinelUndef)
8748 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8749 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8754 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd instructions.
8756 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8757 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8760 SmallVector<int, 8> Unpcklwd;
8761 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8762 /* Unary = */ false);
8763 SmallVector<int, 8> Unpckhwd;
8764 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8765 /* Unary = */ false);
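// For v8i16 these are the masks {0, 8, 1, 9, 2, 10, 3, 11} and
// {4, 12, 5, 13, 6, 14, 7, 15} respectively (informal).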
8766 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8767 isTargetShuffleEquivalent(Mask, Unpckhwd));
8768 return IsUnpackwdMask;
8771 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8773 /// This helper function produces an 8-bit shuffle immediate corresponding to
8774 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8775 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for example.
8778 /// NB: We rely heavily on "undef" masks preserving the input lane.
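/// For example (informal), the identity mask <0, 1, 2, 3> encodes as
/// 0b11100100 (0xE4) and <3, 2, 1, 0> encodes as 0b00011011 (0x1B).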
8779 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8780 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8781 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8782 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8783 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8784 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8787 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8788 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8789 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8790 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8794 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8795 SelectionDAG &DAG) {
8796 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8799 /// \brief Compute whether each element of a shuffle is zeroable.
8801 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8802 /// Either it is an undef element in the shuffle mask, the element of the input
8803 /// referenced is undef, or the element of the input referenced is known to be
8804 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8805 /// as many lanes with this technique as possible to simplify the remaining shuffle.
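/// For example (informal), if V2 is an all-zeros build_vector then every mask
/// element referencing V2 (index >= Size) is reported as zeroable.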
8807 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8808 SDValue V1, SDValue V2) {
8809 APInt Zeroable(Mask.size(), 0);
8810 V1 = peekThroughBitcasts(V1);
8811 V2 = peekThroughBitcasts(V2);
8813 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8814 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8816 int VectorSizeInBits = V1.getValueSizeInBits();
8817 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8818 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8820 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8822 // Handle the easy cases.
8823 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8828 // Determine shuffle input and normalize the mask.
8829 SDValue V = M < Size ? V1 : V2;
8832 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8833 if (V.getOpcode() != ISD::BUILD_VECTOR)
8836 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8837 // portion of the (larger) source element must be UNDEF/ZERO.
8838 if ((Size % V.getNumOperands()) == 0) {
8839 int Scale = Size / V->getNumOperands();
8840 SDValue Op = V.getOperand(M / Scale);
8841 if (Op.isUndef() || X86::isZeroNode(Op))
8843 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8844 APInt Val = Cst->getAPIntValue();
8845 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8846 Val = Val.getLoBits(ScalarSizeInBits);
8849 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8850 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8851 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8852 Val = Val.getLoBits(ScalarSizeInBits);
8859 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
8860 // source elements must be UNDEF or ZERO.
8861 if ((V.getNumOperands() % Size) == 0) {
8862 int Scale = V->getNumOperands() / Size;
8863 bool AllZeroable = true;
8864 for (int j = 0; j < Scale; ++j) {
8865 SDValue Op = V.getOperand((M * Scale) + j);
8866 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8877 // The shuffle result is as follows:
8878 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
8879 // Each element of Zeroable corresponds to a particular element of Mask, as
8880 // described in the computeZeroableShuffleElements function.
8882 // The function looks for a sub-mask whose non-zero elements are in
8883 // increasing order. If such a sub-mask exists, the function returns true.
8884 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8885 ArrayRef<int> Mask, const EVT &VectorType,
8886 bool &IsZeroSideLeft) {
8887 int NextElement = -1;
8888 // Check if the Mask's nonzero elements are in increasing order.
8889 for (int i = 0, e = Mask.size(); i < e; i++) {
8890 // Check that the mask's zeroable elements are built from only zeros.
8891 assert(Mask[i] >= -1 && "Out of bound mask element!");
8896 // Find the lowest non-zero element.
8897 if (NextElement < 0) {
8898 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8899 IsZeroSideLeft = NextElement != 0;
8901 // Exit if the mask's non-zero elements are not in increasing order.
8902 if (NextElement != Mask[i])
8909 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
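/// Informally: a shuffle element selecting lane M expands to the byte indices
/// M*NumEltBytes .. M*NumEltBytes+NumEltBytes-1 in the PSHUFB control vector,
/// and a control byte with the sign bit set (0x80) yields a zero byte.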
8910 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8911 ArrayRef<int> Mask, SDValue V1,
8913 const APInt &Zeroable,
8914 const X86Subtarget &Subtarget,
8915 SelectionDAG &DAG) {
8916 int Size = Mask.size();
8917 int LaneSize = 128 / VT.getScalarSizeInBits();
8918 const int NumBytes = VT.getSizeInBits() / 8;
8919 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8921 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8922 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8923 (Subtarget.hasBWI() && VT.is512BitVector()));
8925 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8926 // Sign bit set in i8 mask means zero element.
8927 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8930 for (int i = 0; i < NumBytes; ++i) {
8931 int M = Mask[i / NumEltBytes];
8933 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8936 if (Zeroable[i / NumEltBytes]) {
8937 PSHUFBMask[i] = ZeroMask;
8941 // We can only use a single input of V1 or V2.
8942 SDValue SrcV = (M >= Size ? V2 : V1);
8948 // PSHUFB can't cross lanes, ensure this doesn't happen.
8949 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8953 M = M * NumEltBytes + (i % NumEltBytes);
8954 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8956 assert(V && "Failed to find a source input");
8958 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8959 return DAG.getBitcast(
8960 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8961 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8964 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8965 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8968 // X86 has a dedicated shuffle that can be lowered to VEXPAND
8969 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8970 const APInt &Zeroable,
8971 ArrayRef<int> Mask, SDValue &V1,
8972 SDValue &V2, SelectionDAG &DAG,
8973 const X86Subtarget &Subtarget) {
8974 bool IsLeftZeroSide = true;
8975 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8978 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8980 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8981 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8982 unsigned NumElts = VT.getVectorNumElements();
8983 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8984 "Unexpected number of vector elements");
8985 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8986 Subtarget, DAG, DL);
8987 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8988 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8989 return DAG.getSelect(DL, VT, VMask,
8990 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8994 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8995 unsigned &UnpackOpcode, bool IsUnary,
8996 ArrayRef<int> TargetMask,
8997 const SDLoc &DL, SelectionDAG &DAG,
8998 const X86Subtarget &Subtarget) {
8999 int NumElts = VT.getVectorNumElements();
9001 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9002 for (int i = 0; i != NumElts; i += 2) {
9003 int M1 = TargetMask[i + 0];
9004 int M2 = TargetMask[i + 1];
9005 Undef1 &= (SM_SentinelUndef == M1);
9006 Undef2 &= (SM_SentinelUndef == M2);
9007 Zero1 &= isUndefOrZero(M1);
9008 Zero2 &= isUndefOrZero(M2);
9010 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9011 "Zeroable shuffle detected");
9013 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9014 SmallVector<int, 64> Unpckl, Unpckh;
9015 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9016 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9017 UnpackOpcode = X86ISD::UNPCKL;
9018 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9019 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9023 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9024 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9025 UnpackOpcode = X86ISD::UNPCKH;
9026 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9027 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9031 // If this is a unary shuffle, attempt to match it as an unpack lo/hi with zero.
9032 if (IsUnary && (Zero1 || Zero2)) {
9033 // Don't bother if we can blend instead.
9034 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9035 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9038 bool MatchLo = true, MatchHi = true;
9039 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9040 int M = TargetMask[i];
9042 // Ignore if the input is known to be zero or the index is undef.
9043 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9044 (M == SM_SentinelUndef))
9047 MatchLo &= (M == Unpckl[i]);
9048 MatchHi &= (M == Unpckh[i]);
9051 if (MatchLo || MatchHi) {
9052 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9053 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9054 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9059 // If a binary shuffle, commute and try again.
9061 ShuffleVectorSDNode::commuteMask(Unpckl);
9062 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9063 UnpackOpcode = X86ISD::UNPCKL;
9068 ShuffleVectorSDNode::commuteMask(Unpckh);
9069 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9070 UnpackOpcode = X86ISD::UNPCKH;
9079 // X86 has dedicated unpack instructions that can handle specific blend
9080 // operations: UNPCKH and UNPCKL.
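// e.g. for v4i32 the UNPCKL pattern is the mask {0, 4, 1, 5} and the UNPCKH
// pattern is {2, 6, 3, 7}.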
9081 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9082 ArrayRef<int> Mask, SDValue V1,
9083 SDValue V2, SelectionDAG &DAG) {
9084 SmallVector<int, 8> Unpckl;
9085 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9086 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9087 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9089 SmallVector<int, 8> Unpckh;
9090 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9091 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9092 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9094 // Commute and try again.
9095 ShuffleVectorSDNode::commuteMask(Unpckl);
9096 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9097 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9099 ShuffleVectorSDNode::commuteMask(Unpckh);
9100 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9101 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9106 // X86 has dedicated pack instructions that can handle specific truncation
9107 // operations: PACKSS and PACKUS.
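// e.g. a v8i16 shuffle with mask {0, 2, 4, 6, 8, 10, 12, 14} over two v4i32
// inputs whose unsigned values already fit in 16 bits can be emitted as a
// single PACKUSDW (informal sketch).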
9108 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9109 SDValue &V2, unsigned &PackOpcode,
9110 ArrayRef<int> TargetMask,
9112 const X86Subtarget &Subtarget) {
9113 unsigned NumElts = VT.getVectorNumElements();
9114 unsigned BitSize = VT.getScalarSizeInBits();
9115 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9116 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9118 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9119 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9120 SDValue VV2 = DAG.getBitcast(PackVT, N2);
9121 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9122 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9126 PackOpcode = X86ISD::PACKSS;
9130 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9131 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9132 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9133 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9137 PackOpcode = X86ISD::PACKUS;
9145 // Try binary shuffle.
9146 SmallVector<int, 32> BinaryMask;
9147 createPackShuffleMask(VT, BinaryMask, false);
9148 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9149 if (MatchPACK(V1, V2))
9152 // Try unary shuffle.
9153 SmallVector<int, 32> UnaryMask;
9154 createPackShuffleMask(VT, UnaryMask, true);
9155 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9156 if (MatchPACK(V1, V1))
9162 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9163 ArrayRef<int> Mask, SDValue V1,
9164 SDValue V2, SelectionDAG &DAG,
9165 const X86Subtarget &Subtarget) {
9167 unsigned PackOpcode;
9168 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9170 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9171 DAG.getBitcast(PackVT, V2));
9176 /// \brief Try to emit a bitmask instruction for a shuffle.
9178 /// This handles cases where we can model a blend exactly as a bitmask due to
9179 /// one of the inputs being zeroable.
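/// For example (informal), a v4i32 shuffle taking lanes 0 and 2 from V1 with
/// lanes 1 and 3 zeroable becomes 'and V1, <-1, 0, -1, 0>'.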
9180 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9181 SDValue V2, ArrayRef<int> Mask,
9182 const APInt &Zeroable,
9183 SelectionDAG &DAG) {
9184 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9185 MVT EltVT = VT.getVectorElementType();
9186 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9187 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9188 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9190 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9193 if (Mask[i] % Size != i)
9194 return SDValue(); // Not a blend.
9196 V = Mask[i] < Size ? V1 : V2;
9197 else if (V != (Mask[i] < Size ? V1 : V2))
9198 return SDValue(); // Can only let one input through the mask.
9200 VMaskOps[i] = AllOnes;
9203 return SDValue(); // No non-zeroable elements!
9205 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9206 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9209 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9211 /// This is used as a fallback approach when first class blend instructions are
9212 /// unavailable. Currently it is only suitable for integer vectors, but could
9213 /// be generalized for floating point vectors if desirable.
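/// Conceptually this emits (V1 & M) | (V2 & ~M), where M selects the lanes
/// taken from V1 (a sketch of the pattern constructed below).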
9214 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9215 SDValue V2, ArrayRef<int> Mask,
9216 SelectionDAG &DAG) {
9217 assert(VT.isInteger() && "Only supports integer vector types!");
9218 MVT EltVT = VT.getVectorElementType();
9219 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9220 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9221 SmallVector<SDValue, 16> MaskOps;
9222 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9223 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9224 return SDValue(); // Shuffled input!
9225 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9228 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9229 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9230 // We have to cast V2 around.
9231 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9232 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9233 DAG.getBitcast(MaskVT, V1Mask),
9234 DAG.getBitcast(MaskVT, V2)));
9235 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9238 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9239 SDValue PreservedSrc,
9240 const X86Subtarget &Subtarget,
9243 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9244 MutableArrayRef<int> TargetMask,
9245 bool &ForceV1Zero, bool &ForceV2Zero,
9246 uint64_t &BlendMask) {
9247 bool V1IsZeroOrUndef =
9248 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9249 bool V2IsZeroOrUndef =
9250 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9253 ForceV1Zero = false, ForceV2Zero = false;
9254 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9256 // Attempt to generate the binary blend mask. If an input is zero then
9257 // we can use any lane.
9258 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9259 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9260 int M = TargetMask[i];
9261 if (M == SM_SentinelUndef)
9265 if (M == i + Size) {
9266 BlendMask |= 1ull << i;
9269 if (M == SM_SentinelZero) {
9270 if (V1IsZeroOrUndef) {
9275 if (V2IsZeroOrUndef) {
9277 BlendMask |= 1ull << i;
9278 TargetMask[i] = i + Size;
9287 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9289 uint64_t ScaledMask = 0;
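// e.g. BlendMask 0b0101 with Size = 4 and Scale = 2 widens to 0b00110011
// (informal sketch).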
9290 for (int i = 0; i != Size; ++i)
9291 if (BlendMask & (1ull << i))
9292 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9296 /// \brief Try to emit a blend instruction for a shuffle.
9298 /// This doesn't do any checks for the availability of instructions for blending
9299 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9300 /// be matched in the backend with the type given. What it does check for is
9301 /// that the shuffle mask is a blend, or convertible into a blend with zero.
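/// For example (informal), the v4i32 mask <0, 5, 2, 7> is a blend taking lanes
/// 0 and 2 from V1 and lanes 1 and 3 from V2, giving BlendMask = 0b1010.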
9302 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9303 SDValue V2, ArrayRef<int> Original,
9304 const APInt &Zeroable,
9305 const X86Subtarget &Subtarget,
9306 SelectionDAG &DAG) {
9307 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9309 uint64_t BlendMask = 0;
9310 bool ForceV1Zero = false, ForceV2Zero = false;
9311 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9315 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9317 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9319 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9321 switch (VT.SimpleTy) {
9326 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9327 DAG.getConstant(BlendMask, DL, MVT::i8));
9331 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9335 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9336 // that instruction.
9337 if (Subtarget.hasAVX2()) {
9338 // Scale the blend by the number of 32-bit dwords per element.
9339 int Scale = VT.getScalarSizeInBits() / 32;
9340 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9341 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9342 V1 = DAG.getBitcast(BlendVT, V1);
9343 V2 = DAG.getBitcast(BlendVT, V2);
9344 return DAG.getBitcast(
9345 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9346 DAG.getConstant(BlendMask, DL, MVT::i8)));
9350 // For integer shuffles we need to expand the mask and cast the inputs to
9351 // v8i16s prior to blending.
9352 int Scale = 8 / VT.getVectorNumElements();
9353 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9354 V1 = DAG.getBitcast(MVT::v8i16, V1);
9355 V2 = DAG.getBitcast(MVT::v8i16, V2);
9356 return DAG.getBitcast(VT,
9357 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9358 DAG.getConstant(BlendMask, DL, MVT::i8)));
9362 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9363 SmallVector<int, 8> RepeatedMask;
9364 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9365 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9366 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9368 for (int i = 0; i < 8; ++i)
9369 if (RepeatedMask[i] >= 8)
9370 BlendMask |= 1ull << i;
9371 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9372 DAG.getConstant(BlendMask, DL, MVT::i8));
9378 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9379 "256-bit byte-blends require AVX2 support!");
9381 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9383 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9384 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9385 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9388 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9389 if (SDValue Masked =
9390 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9393 // Scale the blend by the number of bytes per element.
9394 int Scale = VT.getScalarSizeInBits() / 8;
9396 // This form of blend is always done on bytes. Compute the byte vector type.
9398 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9400 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9401 // mix of LLVM's code generator and the x86 backend. We tell the code
9402 // generator that boolean values in the elements of an x86 vector register
9403 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9404 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9405 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9406 // of the element (the remaining are ignored) and 0 in that high bit would
9407 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9408 // the LLVM model for boolean values in vector elements gets the relevant
9409 // bit set, it is set backwards and over-constrained relative to x86's
9410 // actual model.
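// For example, a v16i8 mask that takes bytes 0-7 from V1 and bytes 8-15 from
// V2 (i.e. [0..7, 24..31]) builds a VSELECT mask of eight all-ones bytes
// followed by eight zero bytes.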
9411 SmallVector<SDValue, 32> VSELECTMask;
9412 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9413 for (int j = 0; j < Scale; ++j)
9414 VSELECTMask.push_back(
9415 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9416 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9419 V1 = DAG.getBitcast(BlendVT, V1);
9420 V2 = DAG.getBitcast(BlendVT, V2);
9421 return DAG.getBitcast(
9423 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9432 MVT IntegerType =
9433 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9434 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9435 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9438 llvm_unreachable("Not a supported integer vector type!");
9442 /// \brief Try to lower as a blend of elements from two inputs followed by
9443 /// a single-input permutation.
9445 /// This matches the pattern where we can blend elements from two inputs and
9446 /// then reduce the shuffle to a single-input permutation.
9447 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9448 SDValue V1, SDValue V2,
9450 SelectionDAG &DAG) {
9451 // We build up the blend mask while checking whether a blend is a viable way
9452 // to reduce the shuffle.
9453 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9454 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9456 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9460 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9462 if (BlendMask[Mask[i] % Size] < 0)
9463 BlendMask[Mask[i] % Size] = Mask[i];
9464 else if (BlendMask[Mask[i] % Size] != Mask[i])
9465 return SDValue(); // Can't blend in the needed input!
9467 PermuteMask[i] = Mask[i] % Size;
9470 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9471 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
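// For example, the v4i32 mask [1, 4, 3, 6] decomposes into the element-wise
// blend [4, 1, 6, 3] (keeping V2[0], V1[1], V2[2], V1[3] in place) followed
// by the single-input permute [1, 0, 3, 2].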
9474 /// \brief Generic routine to decompose a shuffle and blend into independent
9475 /// blends and permutes.
9477 /// This matches the extremely common pattern for handling combined
9478 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9479 /// operations. It will try to pick the best arrangement of shuffles and
9480 /// blends.
9481 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9485 SelectionDAG &DAG) {
9486 // Shuffle the input elements into the desired positions in V1 and V2 and
9487 // blend them together.
9488 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9489 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9490 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9491 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9492 if (Mask[i] >= 0 && Mask[i] < Size) {
9493 V1Mask[i] = Mask[i];
9495 } else if (Mask[i] >= Size) {
9496 V2Mask[i] = Mask[i] - Size;
9497 BlendMask[i] = i + Size;
9500 // Try to lower with the simpler initial blend strategy unless one of the
9501 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9502 // shuffle may be able to fold with a load or other benefit. However, when
9503 // we'll have to do 2x as many shuffles in order to achieve this, blending
9504 // first is a better strategy.
9505 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9506 if (SDValue BlendPerm =
9507 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9510 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9511 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9512 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9515 /// \brief Try to lower a vector shuffle as a rotation.
9517 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9518 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9519 ArrayRef<int> Mask) {
9520 int NumElts = Mask.size();
9522 // We need to detect various ways of spelling a rotation:
9523 // [11, 12, 13, 14, 15, 0, 1, 2]
9524 // [-1, 12, 13, 14, -1, -1, 1, -1]
9525 // [-1, -1, -1, -1, -1, -1, 1, 2]
9526 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9527 // [-1, 4, 5, 6, -1, -1, 9, -1]
9528 // [-1, 4, 5, 6, -1, -1, -1, -1]
9531 for (int i = 0; i < NumElts; ++i) {
9533 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9534 "Unexpected mask index.");
9538 // Determine where a rotated vector would have started.
9539 int StartIdx = i - (M % NumElts);
9541 // The identity rotation isn't interesting, stop.
9544 // If we found the tail of a vector the rotation must be the missing
9545 // front. If we found the head of a vector, it must be how much of the
9546 // head is missing.
9547 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9550 Rotation = CandidateRotation;
9551 else if (Rotation != CandidateRotation)
9552 // The rotations don't match, so we can't match this mask.
9555 // Compute which value this mask is pointing at.
9556 SDValue MaskV = M < NumElts ? V1 : V2;
9558 // Compute which of the two target values this index should be assigned
9559 // to. This reflects whether the high elements are remaining or the low
9560 // elements are remaining.
9561 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9563 // Either set up this value if we've not encountered it before, or check
9564 // that it remains consistent.
9567 else if (TargetV != MaskV)
9568 // This may be a rotation, but it pulls from the inputs in some
9569 // unsupported interleaving.
9573 // Check that we successfully analyzed the mask, and normalize the results.
9574 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9575 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9587 /// \brief Try to lower a vector shuffle as a byte rotation.
9589 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9590 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9591 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9592 /// try to generically lower a vector shuffle through such a pattern. It
9593 /// does not check for the profitability of lowering either as PALIGNR or
9594 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9595 /// This matches shuffle vectors that look like:
9597 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9599 /// Essentially it concatenates V1 and V2, shifts right by some number of
9600 /// elements, and takes the low elements as the result. Note that while this is
9601 /// specified as a *right shift* because x86 is little-endian, it is a *left
9602 /// rotate* of the vector lanes.
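/// For example, the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2] above is a
/// rotation by 3 elements, so PALIGNR is emitted with a byte immediate of
/// 3 * 2 = 6.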
9603 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9604 ArrayRef<int> Mask) {
9605 // Don't accept any shuffles with zero elements.
9606 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9609 // PALIGNR works on 128-bit lanes.
9610 SmallVector<int, 16> RepeatedMask;
9611 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9614 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9618 // PALIGNR rotates bytes, so we need to scale the
9619 // rotation based on how many bytes are in the vector lane.
9620 int NumElts = RepeatedMask.size();
9621 int Scale = 16 / NumElts;
9622 return Rotation * Scale;
9625 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9626 SDValue V1, SDValue V2,
9628 const X86Subtarget &Subtarget,
9629 SelectionDAG &DAG) {
9630 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9632 SDValue Lo = V1, Hi = V2;
9633 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
9634 if (ByteRotation <= 0)
9637 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9639 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9640 Lo = DAG.getBitcast(ByteVT, Lo);
9641 Hi = DAG.getBitcast(ByteVT, Hi);
9643 // SSSE3 targets can use the palignr instruction.
9644 if (Subtarget.hasSSSE3()) {
9645 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9646 "512-bit PALIGNR requires BWI instructions");
9647 return DAG.getBitcast(
9648 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9649 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9652 assert(VT.is128BitVector() &&
9653 "Rotate-based lowering only supports 128-bit lowering!");
9654 assert(Mask.size() <= 16 &&
9655 "Can shuffle at most 16 bytes in a 128-bit vector!");
9656 assert(ByteVT == MVT::v16i8 &&
9657 "SSE2 rotate lowering only needed for v16i8!");
9659 // Default SSE2 implementation
9660 int LoByteShift = 16 - ByteRotation;
9661 int HiByteShift = ByteRotation;
9663 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9664 DAG.getConstant(LoByteShift, DL, MVT::i8));
9665 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9666 DAG.getConstant(HiByteShift, DL, MVT::i8));
9667 return DAG.getBitcast(VT,
9668 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
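// E.g. with ByteRotation = 6 this emits (Hi >> 6 bytes) | (Lo << 10 bytes),
// placing bytes 6-15 of Hi in the low positions and bytes 0-5 of Lo in the
// high positions of the result.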
9671 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9673 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
9674 /// rotation of the concatenation of two vectors; this routine will
9675 /// try to generically lower a vector shuffle through such a pattern.
9677 /// Essentially it concatenates V1 and V2, shifts right by some number of
9678 /// elements, and takes the low elements as the result. Note that while this is
9679 /// specified as a *right shift* because x86 is little-endian, it is a *left
9680 /// rotate* of the vector lanes.
9681 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9682 SDValue V1, SDValue V2,
9684 const X86Subtarget &Subtarget,
9685 SelectionDAG &DAG) {
9686 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9687 "Only 32-bit and 64-bit elements are supported!");
9689 // 128/256-bit vectors are only supported with VLX.
9690 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9691 && "VLX required for 128/256-bit vectors");
9693 SDValue Lo = V1, Hi = V2;
9694 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
9698 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9699 DAG.getConstant(Rotation, DL, MVT::i8));
9702 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9704 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9705 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9706 /// matches elements from one of the input vectors shuffled to the left or
9707 /// right with zeroable elements 'shifted in'. It handles both the strictly
9708 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9709 /// quad word lane.
9711 /// PSLL : (little-endian) left bit shift.
9712 /// [ zz, 0, zz, 2 ]
9713 /// [ -1, 4, zz, -1 ]
9714 /// PSRL : (little-endian) right bit shift.
9715 /// [ 1, zz, 3, zz]
9716 /// [ -1, -1, 7, zz]
9717 /// PSLLDQ : (little-endian) left byte shift
9718 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9719 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9720 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9721 /// PSRLDQ : (little-endian) right byte shift
9722 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9723 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9724 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
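/// As a concrete match, the v8i16 mask [ 1, zz, 3, zz, 5, zz, 7, zz] is a
/// 16-bit logical right shift within 32-bit elements, i.e. VSRLI on v4i32
/// with a shift amount of 16.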
9725 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9726 unsigned ScalarSizeInBits,
9727 ArrayRef<int> Mask, int MaskOffset,
9728 const APInt &Zeroable,
9729 const X86Subtarget &Subtarget) {
9730 int Size = Mask.size();
9731 unsigned SizeInBits = Size * ScalarSizeInBits;
9733 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
9734 for (int i = 0; i < Size; i += Scale)
9735 for (int j = 0; j < Shift; ++j)
9736 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
9742 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9743 for (int i = 0; i != Size; i += Scale) {
9744 unsigned Pos = Left ? i + Shift : i;
9745 unsigned Low = Left ? i : i + Shift;
9746 unsigned Len = Scale - Shift;
9747 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
9751 int ShiftEltBits = ScalarSizeInBits * Scale;
9752 bool ByteShift = ShiftEltBits > 64;
9753 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9754 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9755 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
9757 // Normalize the scale for byte shifts to still produce an i64 element
9758 // type.
9759 Scale = ByteShift ? Scale / 2 : Scale;
9761 // We need to round trip through the appropriate type for the shift.
9762 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9763 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9764 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9765 return (int)ShiftAmt;
9768 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9769 // keep doubling the size of the integer elements up to that. We can
9770 // then shift the elements of the integer vector by whole multiples of
9771 // their width within the elements of the larger integer vector. Test each
9772 // multiple to see if we can find a match with the moved element indices
9773 // and that the shifted in elements are all zeroable.
9774 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9775 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9776 for (int Shift = 1; Shift != Scale; ++Shift)
9777 for (bool Left : {true, false})
9778 if (CheckZeros(Shift, Scale, Left)) {
9779 int ShiftAmt = MatchShift(Shift, Scale, Left);
9788 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9789 SDValue V2, ArrayRef<int> Mask,
9790 const APInt &Zeroable,
9791 const X86Subtarget &Subtarget,
9792 SelectionDAG &DAG) {
9793 int Size = Mask.size();
9794 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9800 // Try to match shuffle against V1 shift.
9801 int ShiftAmt = matchVectorShuffleAsShift(
9802 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
9804 // If V1 failed, try to match shuffle against V2 shift.
9807 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
9808 Mask, Size, Zeroable, Subtarget);
9815 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9816 "Illegal integer vector type");
9817 V = DAG.getBitcast(ShiftVT, V);
9818 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9819 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9820 return DAG.getBitcast(VT, V);
9823 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9824 // Remainder of lower half result is zero and upper half is all undef.
9825 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9826 ArrayRef<int> Mask, uint64_t &BitLen,
9827 uint64_t &BitIdx, const APInt &Zeroable) {
9828 int Size = Mask.size();
9829 int HalfSize = Size / 2;
9830 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9831 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9833 // Upper half must be undefined.
9834 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9837 // Determine the extraction length from the part of the
9838 // lower half that isn't zeroable.
9839 int Len = HalfSize;
9840 for (; Len > 0; --Len)
9841 if (!Zeroable[Len - 1])
9842 break;
9843 assert(Len > 0 && "Zeroable shuffle mask");
9845 // Attempt to match first Len sequential elements from the lower half.
9848 for (int i = 0; i != Len; ++i) {
9850 if (M == SM_SentinelUndef)
9852 SDValue &V = (M < Size ? V1 : V2);
9855 // The extracted elements must start at a valid index and all mask
9856 // elements must be in the lower half.
9857 if (i > M || M >= HalfSize)
9860 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9868 if (!Src || Idx < 0)
9871 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9872 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9873 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
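// E.g. matching 3 sequential v8i16 elements starting at element 2 yields
// BitLen = 3 * 16 = 48 and BitIdx = 2 * 16 = 32 for the EXTRQI immediates.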
9878 // INSERTQ: Extract lowest Len elements from lower half of second source and
9879 // insert over first source, starting at Idx.
9880 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9881 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9882 ArrayRef<int> Mask, uint64_t &BitLen,
9884 int Size = Mask.size();
9885 int HalfSize = Size / 2;
9886 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9888 // Upper half must be undefined.
9889 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9892 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9895 // Attempt to match first source from mask before insertion point.
9896 if (isUndefInRange(Mask, 0, Idx)) {
9898 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9900 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9906 // Extend the extraction length looking to match both the insertion of
9907 // the second source and the remaining elements of the first.
9908 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9913 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9915 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9921 // Match the remaining elements of the lower half.
9922 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9924 } else if ((!Base || (Base == V1)) &&
9925 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9927 } else if ((!Base || (Base == V2)) &&
9928 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9935 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9936 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9946 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9947 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9948 SDValue V2, ArrayRef<int> Mask,
9949 const APInt &Zeroable,
9950 SelectionDAG &DAG) {
9951 uint64_t BitLen, BitIdx;
9952 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9953 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9954 DAG.getConstant(BitLen, DL, MVT::i8),
9955 DAG.getConstant(BitIdx, DL, MVT::i8));
9957 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9958 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9959 V2 ? V2 : DAG.getUNDEF(VT),
9960 DAG.getConstant(BitLen, DL, MVT::i8),
9961 DAG.getConstant(BitIdx, DL, MVT::i8));
9966 /// \brief Lower a vector shuffle as a zero or any extension.
9968 /// Given a specific number of elements, element bit width, and extension
9969 /// stride, produce either a zero or any extension based on the available
9970 /// features of the subtarget. The extended elements are consecutive and
9971 /// begin at an (optionally offset) element index in the input; to
9972 /// avoid excess shuffling the offset must either be in the bottom lane
9973 /// or at the start of a higher lane. All extended elements must be from
9974 /// the same input.
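/// For example, the v16i8 mask [0, zz, zz, zz, 1, zz, zz, zz, 2, zz, zz, zz,
/// 3, zz, zz, zz] is a zero extension of the low four bytes to i32 elements
/// (Scale = 4, Offset = 0) and lowers to PMOVZXBD when SSE4.1 is available.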
9975 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9976 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9977 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9978 assert(Scale > 1 && "Need a scale to extend.");
9979 int EltBits = VT.getScalarSizeInBits();
9980 int NumElements = VT.getVectorNumElements();
9981 int NumEltsPerLane = 128 / EltBits;
9982 int OffsetLane = Offset / NumEltsPerLane;
9983 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9984 "Only 8, 16, and 32 bit elements can be extended.");
9985 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9986 assert(0 <= Offset && "Extension offset must be positive.");
9987 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9988 "Extension offset must be in the first lane or start an upper lane.");
9990 // Check that an index is in same lane as the base offset.
9991 auto SafeOffset = [&](int Idx) {
9992 return OffsetLane == (Idx / NumEltsPerLane);
9995 // Shift along an input so that the offset base moves to the first element.
9996 auto ShuffleOffset = [&](SDValue V) {
10000 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10001 for (int i = 0; i * Scale < NumElements; ++i) {
10002 int SrcIdx = i + Offset;
10003 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10005 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10008 // Found a valid zext mask! Try various lowering strategies based on the
10009 // input type and available ISA extensions.
10010 if (Subtarget.hasSSE41()) {
10011 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10012 // PUNPCK will catch this in a later shuffle match.
10013 if (Offset && Scale == 2 && VT.is128BitVector())
10015 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10016 NumElements / Scale);
10017 InputV = ShuffleOffset(InputV);
10018 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10019 return DAG.getBitcast(VT, InputV);
10022 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10024 // For any extends we can cheat for larger element sizes and use shuffle
10025 // instructions that can fold with a load and/or copy.
10026 if (AnyExt && EltBits == 32) {
10027 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10029 return DAG.getBitcast(
10030 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10031 DAG.getBitcast(MVT::v4i32, InputV),
10032 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10034 if (AnyExt && EltBits == 16 && Scale > 2) {
10035 int PSHUFDMask[4] = {Offset / 2, -1,
10036 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10037 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10038 DAG.getBitcast(MVT::v4i32, InputV),
10039 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10040 int PSHUFWMask[4] = {1, -1, -1, -1};
10041 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10042 return DAG.getBitcast(
10043 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10044 DAG.getBitcast(MVT::v8i16, InputV),
10045 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10048 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
10049 // to 64-bit integers.
10050 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10051 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10052 assert(VT.is128BitVector() && "Unexpected vector width!");
10054 int LoIdx = Offset * EltBits;
10055 SDValue Lo = DAG.getBitcast(
10056 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10057 DAG.getConstant(EltBits, DL, MVT::i8),
10058 DAG.getConstant(LoIdx, DL, MVT::i8)));
10060 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10061 !SafeOffset(Offset + 1))
10062 return DAG.getBitcast(VT, Lo);
10064 int HiIdx = (Offset + 1) * EltBits;
10065 SDValue Hi = DAG.getBitcast(
10066 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10067 DAG.getConstant(EltBits, DL, MVT::i8),
10068 DAG.getConstant(HiIdx, DL, MVT::i8)));
10069 return DAG.getBitcast(VT,
10070 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10073 // If this would require more than 2 unpack instructions to expand, use
10074 // pshufb when available. We can only use more than 2 unpack instructions
10075 // when zero extending i8 elements which also makes it easier to use pshufb.
10076 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10077 assert(NumElements == 16 && "Unexpected byte vector width!");
10078 SDValue PSHUFBMask[16];
10079 for (int i = 0; i < 16; ++i) {
10080 int Idx = Offset + (i / Scale);
10081 PSHUFBMask[i] = DAG.getConstant(
10082 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10084 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10085 return DAG.getBitcast(
10086 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10087 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10090 // If we are extending from an offset, ensure we start on a boundary that
10091 // we can unpack from.
10092 int AlignToUnpack = Offset % (NumElements / Scale);
10093 if (AlignToUnpack) {
10094 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10095 for (int i = AlignToUnpack; i < NumElements; ++i)
10096 ShMask[i - AlignToUnpack] = i;
10097 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10098 Offset -= AlignToUnpack;
10101 // Otherwise emit a sequence of unpacks.
10103 unsigned UnpackLoHi = X86ISD::UNPCKL;
10104 if (Offset >= (NumElements / 2)) {
10105 UnpackLoHi = X86ISD::UNPCKH;
10106 Offset -= (NumElements / 2);
10109 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10110 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10111 : getZeroVector(InputVT, Subtarget, DAG, DL);
10112 InputV = DAG.getBitcast(InputVT, InputV);
10113 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
10117 } while (Scale > 1);
10118 return DAG.getBitcast(VT, InputV);
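// For instance, zero-extending the low four i16 elements of a v8i16 input to
// i32 without SSE4.1 takes a single iteration of this loop: one PUNPCKLWD of
// the input against an all-zero vector.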
10121 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
10123 /// This routine will try to do everything in its power to cleverly lower
10124 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10125 /// check for the profitability of this lowering, it tries to aggressively
10126 /// match this pattern. It will use all of the micro-architectural details it
10127 /// can to emit an efficient lowering. It handles both blends with all-zero
10128 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10129 /// masking out later).
10131 /// The reason we have dedicated lowering for zext-style shuffles is that they
10132 /// are both incredibly common and often quite performance sensitive.
10133 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10134 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10135 const APInt &Zeroable, const X86Subtarget &Subtarget,
10136 SelectionDAG &DAG) {
10137 int Bits = VT.getSizeInBits();
10138 int NumLanes = Bits / 128;
10139 int NumElements = VT.getVectorNumElements();
10140 int NumEltsPerLane = NumElements / NumLanes;
10141 assert(VT.getScalarSizeInBits() <= 32 &&
10142 "Exceeds 32-bit integer zero extension limit");
10143 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10145 // Define a helper function to check a particular ext-scale and lower to it if
10147 auto Lower = [&](int Scale) -> SDValue {
10148 SDValue InputV;
10149 bool AnyExt = true;
10150 int Offset = 0;
10151 int Matches = 0;
10152 for (int i = 0; i < NumElements; ++i) {
10155 continue; // Valid anywhere but doesn't tell us anything.
10156 if (i % Scale != 0) {
10157 // Each of the extended elements need to be zeroable.
10161 // We no longer are in the anyext case.
10166 // Each of the base elements needs to be consecutive indices into the
10167 // same input vector.
10168 SDValue V = M < NumElements ? V1 : V2;
10169 M = M % NumElements;
10172 Offset = M - (i / Scale);
10173 } else if (InputV != V)
10174 return SDValue(); // Flip-flopping inputs.
10176 // Offset must start in the lowest 128-bit lane or at the start of an
10177 // upper lane.
10178 // FIXME: Is it ever worth allowing a negative base offset?
10179 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10180 (Offset % NumEltsPerLane) == 0))
10183 // If we are offsetting, all referenced entries must come from the same
10184 // 128-bit lane.
10185 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10188 if ((M % NumElements) != (Offset + (i / Scale)))
10189 return SDValue(); // Non-consecutive strided elements.
10193 // If we fail to find an input, we have a zero-shuffle which should always
10194 // have already been handled.
10195 // FIXME: Maybe handle this here in case during blending we end up with one?
10199 // If we are offsetting, don't extend if we only match a single input, we
10200 // can always do better by using a basic PSHUF or PUNPCK.
10201 if (Offset != 0 && Matches < 2)
10204 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10205 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10208 // The widest scale possible for extending is to a 64-bit integer.
10209 assert(Bits % 64 == 0 &&
10210 "The number of bits in a vector must be divisible by 64 on x86!");
10211 int NumExtElements = Bits / 64;
10213 // Each iteration, try extending the elements half as much, but into twice as
10214 // many elements.
10215 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10216 assert(NumElements % NumExtElements == 0 &&
10217 "The input vector size must be divisible by the extended size.");
10218 if (SDValue V = Lower(NumElements / NumExtElements))
10222 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10226 // Returns one of the source operands if the shuffle can be reduced to a
10227 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10228 auto CanZExtLowHalf = [&]() {
10229 for (int i = NumElements / 2; i != NumElements; ++i)
10232 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10234 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10239 if (SDValue V = CanZExtLowHalf()) {
10240 V = DAG.getBitcast(MVT::v2i64, V);
10241 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10242 return DAG.getBitcast(VT, V);
10245 // No viable ext lowering found.
10249 /// \brief Try to get a scalar value for a specific element of a vector.
10251 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10252 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10253 SelectionDAG &DAG) {
10254 MVT VT = V.getSimpleValueType();
10255 MVT EltVT = VT.getVectorElementType();
10256 V = peekThroughBitcasts(V);
10258 // If the bitcasts shift the element size, we can't extract an equivalent
10259 // element from it.
10260 MVT NewVT = V.getSimpleValueType();
10261 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10264 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10265 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10266 // Ensure the scalar operand is the same size as the destination.
10267 // FIXME: Add support for scalar truncation where possible.
10268 SDValue S = V.getOperand(Idx);
10269 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10270 return DAG.getBitcast(EltVT, S);
10276 /// \brief Helper to test for a load that can be folded with x86 shuffles.
10278 /// This is particularly important because the set of instructions varies
10279 /// significantly based on whether the operand is a load or not.
10280 static bool isShuffleFoldableLoad(SDValue V) {
10281 V = peekThroughBitcasts(V);
10282 return ISD::isNON_EXTLoad(V.getNode());
10285 /// \brief Try to lower insertion of a single element into a zero vector.
10287 /// This is a common pattern for which we have especially efficient lowerings
10288 /// across all subtarget feature sets.
10289 static SDValue lowerVectorShuffleAsElementInsertion(
10290 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10291 const APInt &Zeroable, const X86Subtarget &Subtarget,
10292 SelectionDAG &DAG) {
10294 MVT EltVT = VT.getVectorElementType();
10297 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10299 bool IsV1Zeroable = true;
10300 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10301 if (i != V2Index && !Zeroable[i]) {
10302 IsV1Zeroable = false;
10306 // Check for a single input from a SCALAR_TO_VECTOR node.
10307 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10308 // all the smarts here sunk into that routine. However, the current
10309 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10310 // vector shuffle lowering is dead.
10311 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10313 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10314 // We need to zext the scalar if it is smaller than an i32.
10315 V2S = DAG.getBitcast(EltVT, V2S);
10316 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10317 // Using zext to expand a narrow element won't work for non-zero
10322 // Zero-extend directly to i32.
10323 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10324 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10326 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10327 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10328 EltVT == MVT::i16) {
10329 // Either not inserting from the low element of the input or the input
10330 // element size is too small to use VZEXT_MOVL to clear the high bits.
10334 if (!IsV1Zeroable) {
10335 // If V1 can't be treated as a zero vector we have fewer options to lower
10336 // this. We can't support integer vectors or non-zero targets cheaply, and
10337 // the V1 elements can't be permuted in any way.
10338 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10339 if (!VT.isFloatingPoint() || V2Index != 0)
10341 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10342 V1Mask[V2Index] = -1;
10343 if (!isNoopShuffleMask(V1Mask))
10345 if (!VT.is128BitVector())
10348 // Otherwise, use MOVSD or MOVSS.
10349 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10350 "Only two types of floating point element types to handle!");
10351 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10355 // This lowering only works for the low element with floating point vectors.
10356 if (VT.isFloatingPoint() && V2Index != 0)
10359 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10361 V2 = DAG.getBitcast(VT, V2);
10363 if (V2Index != 0) {
10364 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10365 // the desired position. Otherwise it is more efficient to do a vector
10366 // shift left. We know that we can do a vector shift left because all
10367 // the inputs are zero.
10368 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10369 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10370 V2Shuffle[V2Index] = 0;
10371 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10373 V2 = DAG.getBitcast(MVT::v16i8, V2);
10375 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10376 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
10377 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
10378 DAG.getDataLayout(), VT)));
10379 V2 = DAG.getBitcast(VT, V2);
10385 /// Try to lower a broadcast of a single (truncated) integer element,
10386 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10388 /// This assumes we have AVX2.
10389 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10390 SDValue V0, int BroadcastIdx,
10391 const X86Subtarget &Subtarget,
10392 SelectionDAG &DAG) {
10393 assert(Subtarget.hasAVX2() &&
10394 "We can only lower integer broadcasts with AVX2!");
10396 EVT EltVT = VT.getVectorElementType();
10397 EVT V0VT = V0.getValueType();
10399 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10400 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10402 EVT V0EltVT = V0VT.getVectorElementType();
10403 if (!V0EltVT.isInteger())
10406 const unsigned EltSize = EltVT.getSizeInBits();
10407 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10409 // This is only a truncation if the original element type is larger.
10410 if (V0EltSize <= EltSize)
10413 assert(((V0EltSize % EltSize) == 0) &&
10414 "Scalar type sizes must all be powers of 2 on x86!");
10416 const unsigned V0Opc = V0.getOpcode();
10417 const unsigned Scale = V0EltSize / EltSize;
10418 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10420 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10421 V0Opc != ISD::BUILD_VECTOR)
10424 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10426 // If we're extracting non-least-significant bits, shift so we can truncate.
10427 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10428 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10429 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10430 if (const int OffsetIdx = BroadcastIdx % Scale)
10431 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10432 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
10434 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10435 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
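// For example, broadcasting i16 element 1 of a v4i32 build_vector (Scale = 2,
// OffsetIdx = 1) shifts the 32-bit scalar right by 16 before truncating to
// i16 and broadcasting.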
10438 /// \brief Try to lower broadcast of a single element.
10440 /// For convenience, this code also bundles all of the subtarget feature set
10441 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10442 /// a convenient way to factor it out.
10443 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10444 SDValue V1, SDValue V2,
10445 ArrayRef<int> Mask,
10446 const X86Subtarget &Subtarget,
10447 SelectionDAG &DAG) {
10448 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10449 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10450 (Subtarget.hasAVX2() && VT.isInteger())))
10453 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10454 // we can only broadcast from a register with AVX2.
10455 unsigned NumElts = Mask.size();
10456 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10458 : X86ISD::VBROADCAST;
10459 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10461 // Check that the mask is a broadcast.
10462 int BroadcastIdx = -1;
10463 for (int i = 0; i != (int)NumElts; ++i) {
10464 SmallVector<int, 8> BroadcastMask(NumElts, i);
10465 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10471 if (BroadcastIdx < 0)
10473 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10474 "a sorted mask where the broadcast "
10477 // Go up the chain of (vector) values to find a scalar load that we can
10478 // combine with the broadcast.
10481 switch (V.getOpcode()) {
10482 case ISD::BITCAST: {
10483 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10484 SDValue VSrc = V.getOperand(0);
10485 unsigned NumEltBits = V.getScalarValueSizeInBits();
10486 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10487 if ((NumEltBits % NumSrcBits) == 0)
10488 BroadcastIdx *= (NumEltBits / NumSrcBits);
10489 else if ((NumSrcBits % NumEltBits) == 0 &&
10490 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10491 BroadcastIdx /= (NumSrcBits / NumEltBits);
10497 case ISD::CONCAT_VECTORS: {
10498 int OperandSize = Mask.size() / V.getNumOperands();
10499 V = V.getOperand(BroadcastIdx / OperandSize);
10500 BroadcastIdx %= OperandSize;
10503 case ISD::INSERT_SUBVECTOR: {
10504 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10505 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10509 int BeginIdx = (int)ConstantIdx->getZExtValue();
10511 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10512 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10513 BroadcastIdx -= BeginIdx;
10524 // Ensure the source vector and BroadcastIdx are for a suitable type.
10525 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10526 unsigned NumEltBits = VT.getScalarSizeInBits();
10527 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10528 if ((NumSrcBits % NumEltBits) == 0)
10529 BroadcastIdx *= (NumSrcBits / NumEltBits);
10530 else if ((NumEltBits % NumSrcBits) == 0 &&
10531 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10532 BroadcastIdx /= (NumEltBits / NumSrcBits);
10536 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10537 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10538 V = DAG.getBitcast(SrcVT, V);
10541 // Check if this is a broadcast of a scalar. We special case lowering
10542 // for scalars so that we can more effectively fold with loads.
10543 // First, look through bitcast: if the original value has a larger element
10544 // type than the shuffle, the broadcast element is in essence truncated.
10545 // Make that explicit to ease folding.
10546 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10547 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10548 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10549 return TruncBroadcast;
10551 MVT BroadcastVT = VT;
10553 // Peek through any bitcast (only useful for loads).
10554 SDValue BC = peekThroughBitcasts(V);
10556 // Also check the simpler case, where we can directly reuse the scalar.
10557 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10558 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10559 V = V.getOperand(BroadcastIdx);
10561 // If we can't broadcast from a register, check that the input is a load.
10562 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10564 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10565 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10566 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10567 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10568 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10573 // If we are broadcasting a load that is only used by the shuffle
10574 // then we can reduce the vector load to the broadcasted scalar load.
10575 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10576 SDValue BaseAddr = Ld->getOperand(1);
10577 EVT SVT = BroadcastVT.getScalarType();
10578 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10579 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10580 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10581 DAG.getMachineFunction().getMachineMemOperand(
10582 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10583 DAG.makeEquivalentMemoryOrdering(Ld, V);
10584 } else if (!BroadcastFromReg) {
10585 // We can't broadcast from a vector register.
10587 } else if (BroadcastIdx != 0) {
10588 // We can only broadcast from the zero-element of a vector register,
10589 // but it can be advantageous to broadcast from the zero-element of a
10590 // subvector.
10591 if (!VT.is256BitVector() && !VT.is512BitVector())
10594 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10595 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10598 // Only broadcast the zero-element of a 128-bit subvector.
10599 unsigned EltSize = VT.getScalarSizeInBits();
10600 if (((BroadcastIdx * EltSize) % 128) != 0)
10603 // The shuffle input might have been a bitcast we looked through; look at
10604 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10605 // later bitcast it to BroadcastVT.
10606 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10607 "Unexpected vector element size");
10608 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
10609 "Unexpected vector size");
10610 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
10613 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10614 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10615 DAG.getBitcast(MVT::f64, V));
10617 // Bitcast back to the same scalar type as BroadcastVT.
10618 MVT SrcVT = V.getSimpleValueType();
10619 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10620 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10621 "Unexpected vector element size");
10622 if (SrcVT.isVector()) {
10623 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10624 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10626 SrcVT = BroadcastVT.getScalarType();
10628 V = DAG.getBitcast(SrcVT, V);
10631 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10632 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10633 V = DAG.getBitcast(MVT::f64, V);
10634 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10635 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10638 // We only support broadcasting from 128-bit vectors to minimize the
10639 // number of patterns we need to deal with in isel. So extract down to
10640 // 128-bits, removing as many bitcasts as possible.
10641 if (SrcVT.getSizeInBits() > 128) {
10642 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
10643 128 / SrcVT.getScalarSizeInBits());
10644 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
10645 V = DAG.getBitcast(ExtVT, V);
10648 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10651 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10652 // INSERTPS when the V1 elements are already in the correct locations
10653 // because otherwise we can just always use two SHUFPS instructions which
10654 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10655 // perform INSERTPS if a single V1 element is out of place and all V2
10656 // elements are zeroable.
10657 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10658 unsigned &InsertPSMask,
10659 const APInt &Zeroable,
10660 ArrayRef<int> Mask,
10661 SelectionDAG &DAG) {
10662 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10663 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10664 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10666 // Attempt to match INSERTPS with one element from VA or VB being
10667 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10668 // are updated with the chosen values.
10669 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10670 ArrayRef<int> CandidateMask) {
10671 unsigned ZMask = 0;
10672 int VADstIndex = -1;
10673 int VBDstIndex = -1;
10674 bool VAUsedInPlace = false;
10676 for (int i = 0; i < 4; ++i) {
10677 // Synthesize a zero mask from the zeroable elements (includes undefs).
10683 // Flag if we use any VA inputs in place.
10684 if (i == CandidateMask[i]) {
10685 VAUsedInPlace = true;
10689 // We can only insert a single non-zeroable element.
10690 if (VADstIndex >= 0 || VBDstIndex >= 0)
10693 if (CandidateMask[i] < 4) {
10694 // VA input out of place for insertion.
10697 // VB input for insertion.
10702 // Don't bother if we have no (non-zeroable) element for insertion.
10703 if (VADstIndex < 0 && VBDstIndex < 0)
10706 // Determine element insertion src/dst indices. The src index is from the
10707 // start of the inserted vector, not the start of the concatenated vector.
10708 unsigned VBSrcIndex = 0;
10709 if (VADstIndex >= 0) {
10710 // If we have a VA input out of place, we use VA as the V2 element
10711 // insertion and don't use the original V2 at all.
10712 VBSrcIndex = CandidateMask[VADstIndex];
10713 VBDstIndex = VADstIndex;
10716 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10719 // If no V1 inputs are used in place, then the result is created only from
10720 // the zero mask and the V2 insertion - so remove V1 dependency.
10721 if (!VAUsedInPlace)
10722 VA = DAG.getUNDEF(MVT::v4f32);
10724 // Update V1, V2 and InsertPSMask accordingly.
10728 // Insert the V2 element into the desired position.
10729 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10730 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
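// E.g. inserting VB element 2 into lane 1 while zeroing lane 3 encodes as
// (2 << 6) | (1 << 4) | 0b1000 = 0x98.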
10734 if (matchAsInsertPS(V1, V2, Mask))
10737 // Commute and try again.
10738 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10739 ShuffleVectorSDNode::commuteMask(CommutedMask);
10740 if (matchAsInsertPS(V2, V1, CommutedMask))
10746 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10747 SDValue V2, ArrayRef<int> Mask,
10748 const APInt &Zeroable,
10749 SelectionDAG &DAG) {
10750 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10751 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10753 // Attempt to match the insertps pattern.
10754 unsigned InsertPSMask;
10755 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10758 // Insert the V2 element into the desired position.
10759 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10760 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10763 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10764 /// UNPCK instruction.
10766 /// This specifically targets cases where we end up with alternating between
10767 /// the two inputs, and so can permute them into something that feeds a single
10768 /// UNPCK instruction. Note that this routine only targets integer vectors
10769 /// because for floating point vectors we have a generalized SHUFPS lowering
10770 /// strategy that handles everything that doesn't *exactly* match an unpack,
10771 /// making this clever lowering unnecessary.
10772 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10773 SDValue V1, SDValue V2,
10774 ArrayRef<int> Mask,
10775 SelectionDAG &DAG) {
10776 assert(!VT.isFloatingPoint() &&
10777 "This routine only supports integer vectors.");
10778 assert(VT.is128BitVector() &&
10779 "This routine only works on 128-bit vectors.");
10780 assert(!V2.isUndef() &&
10781 "This routine should only be used when blending two inputs.");
10782 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10784 int Size = Mask.size();
10787 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10789 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10791 bool UnpackLo = NumLoInputs >= NumHiInputs;
10793 auto TryUnpack = [&](int ScalarSize, int Scale) {
10794 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10795 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10797 for (int i = 0; i < Size; ++i) {
10801 // Each element of the unpack contains Scale elements from this mask.
10802 int UnpackIdx = i / Scale;
10804 // We only handle the case where V1 feeds the first slots of the unpack.
10805 // We rely on canonicalization to ensure this is the case.
10806 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10809 // Setup the mask for this input. The indexing is tricky as we have to
10810 // handle the unpack stride.
10811 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10812 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10816 // If we will have to shuffle both inputs to use the unpack, check whether
10817 // we can just unpack first and shuffle the result. If so, skip this unpack.
10818 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10819 !isNoopShuffleMask(V2Mask))
10822 // Shuffle the inputs into place.
10823 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10824 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10826 // Cast the inputs to the type we will use to unpack them.
10827 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10828 V1 = DAG.getBitcast(UnpackVT, V1);
10829 V2 = DAG.getBitcast(UnpackVT, V2);
10831 // Unpack the inputs and cast the result back to the desired type.
10832 return DAG.getBitcast(
10833 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10834 UnpackVT, V1, V2));
10837 // We try each unpack from the largest to the smallest to try and find one
10838 // that fits this mask.
10839 int OrigScalarSize = VT.getScalarSizeInBits();
10840 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10841 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10844 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10846 if (NumLoInputs == 0 || NumHiInputs == 0) {
10847 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10848 "We have to have *some* inputs!");
10849 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10851 // FIXME: We could consider the total complexity of the permute of each
10852 // possible unpacking. Or at the least we should consider how many
10853 // half-crossings are created.
10854 // FIXME: We could consider commuting the unpacks.
10856 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10857 for (int i = 0; i < Size; ++i) {
10861 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10864 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10866 return DAG.getVectorShuffle(
10867 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10869 DAG.getUNDEF(VT), PermMask);
10875 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10877 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10878 /// support for floating point shuffles but not integer shuffles. These
10879 /// instructions will incur a domain crossing penalty on some chips though so
10880 /// it is better to avoid lowering through this for integer vectors where
10881 /// possible.
10882 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10883 const APInt &Zeroable,
10884 SDValue V1, SDValue V2,
10885 const X86Subtarget &Subtarget,
10886 SelectionDAG &DAG) {
10887 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10888 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10889 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10891 if (V2.isUndef()) {
10892 // Check for being able to broadcast a single element.
10893 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10894 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10897 // Straight shuffle of a single input vector. Simulate this by using the
10898 // single input as both of the "inputs" to this instruction.
10899 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
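// E.g. the mask [1, 1] encodes as SHUFPDMask = 3, duplicating the high
// element of V1 into both lanes.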
10901 if (Subtarget.hasAVX()) {
10902 // If we have AVX, we can use VPERMILPD, which will allow folding a load
10903 // into the shuffle.
10904 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10905 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10908 return DAG.getNode(
10909 X86ISD::SHUFP, DL, MVT::v2f64,
10910 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10911 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10912 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10914 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10915 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10917 // If we have a single input, insert that into V1 if we can do so cheaply.
10918 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10919 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10920 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10922 // Try inverting the insertion since for v2 masks it is easy to do and we
10923 // can't reliably sort the mask one way or the other.
10924 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10925 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10926 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10927 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10931 // Try to use one of the special instruction patterns to handle two common
10932 // blend patterns if a zero-blend above didn't work.
10933 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10934 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10935 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10936 // We can either use a special instruction to load over the low double or
10937 // to move just the low double.
10938 return DAG.getNode(
10939 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10940 DL, MVT::v2f64, V2,
10941 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10943 if (Subtarget.hasSSE41())
10944 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10945 Zeroable, Subtarget, DAG))
10948 // Use dedicated unpack instructions for masks that match their pattern.
10950 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10953 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10954 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10955 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10958 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10960 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10961 /// the integer unit to minimize domain crossing penalties. However, for blends
10962 /// it falls back to the floating point shuffle operation with appropriate bit
10963 /// casting.
10964 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10965 const APInt &Zeroable,
10966 SDValue V1, SDValue V2,
10967 const X86Subtarget &Subtarget,
10968 SelectionDAG &DAG) {
10969 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10970 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10971 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10973 if (V2.isUndef()) {
10974 // Check for being able to broadcast a single element.
10975 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10976 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10979 // Straight shuffle of a single input vector. For everything from SSE2
10980 // onward this has a single fast instruction with no scary immediates.
10981 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10982 V1 = DAG.getBitcast(MVT::v4i32, V1);
10983 int WidenedMask[4] = {
10984 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10985 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10986 return DAG.getBitcast(
10987 MVT::v2i64,
10988 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10989 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10990 }
10991 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10992 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10993 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10994 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10996 // Try to use shift instructions.
10997 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10998 Zeroable, Subtarget, DAG))
10999 return Shift;
11001 // When loading a scalar and then shuffling it into a vector we can often do
11002 // the insertion cheaply.
11003 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11004 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11005 return Insertion;
11006 // Try inverting the insertion since for v2 masks it is easy to do and we
11007 // can't reliably sort the mask one way or the other.
11008 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
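// Added example: a mask of {2, 1} becomes {0, 3} here, i.e. the same blend
// re-expressed with V2 as the base vector and V1 supplying the inserted
// element.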
11009 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11010 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11011 return Insertion;
11013 // We have different paths for blend lowering, but they all must use the
11014 // *exact* same predicate.
11015 bool IsBlendSupported = Subtarget.hasSSE41();
11016 if (IsBlendSupported)
11017 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11018 Zeroable, Subtarget, DAG))
11019 return Blend;
11021 // Use dedicated unpack instructions for masks that match their pattern.
11022 if (SDValue V =
11023 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11024 return V;
11026 // Try to use byte rotation instructions.
11027 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
11028 if (Subtarget.hasSSSE3()) {
11029 if (Subtarget.hasVLX())
11030 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11031 Mask, Subtarget, DAG))
11032 return Rotate;
11034 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11035 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11036 return Rotate;
11037 }
11039 // If we have direct support for blends, we should lower by decomposing into
11040 // a permute. That will be faster than the domain cross.
11041 if (IsBlendSupported)
11042 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11043 Mask, DAG);
11045 // We implement this with SHUFPD which is pretty lame because it will likely
11046 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11047 // However, all the alternatives are still more cycles and newer chips don't
11048 // have this problem. It would be really nice if x86 had better shuffles here.
11049 V1 = DAG.getBitcast(MVT::v2f64, V1);
11050 V2 = DAG.getBitcast(MVT::v2f64, V2);
11051 return DAG.getBitcast(MVT::v2i64,
11052 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11053 }
11055 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
11057 /// This is used to disable more specialized lowerings when the shufps lowering
11058 /// will happen to be efficient.
11059 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11060 // This routine only handles 128-bit shufps.
11061 assert(Mask.size() == 4 && "Unsupported mask size!");
11062 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11063 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11064 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11065 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11067 // To lower with a single SHUFPS we need to have the low half and high half
11068 // each requiring a single input.
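// For example (added note): {0, 1, 4, 5} and {2, 3, 6, 7} satisfy this, while
// {0, 4, 1, 5} does not because its low half mixes elements of both inputs.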
11069 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11070 return false;
11071 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11072 return false;
11074 return true;
11075 }
11077 /// \brief Lower a vector shuffle using the SHUFPS instruction.
11079 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11080 /// It makes no assumptions about whether this is the *best* lowering, it simply
11081 /// uses it.
11082 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11083 ArrayRef<int> Mask, SDValue V1,
11084 SDValue V2, SelectionDAG &DAG) {
11085 SDValue LowV = V1, HighV = V2;
11086 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11088 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11090 if (NumV2Elements == 1) {
11091 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11093 // Compute the index adjacent to V2Index and in the same half by toggling
11094 // the low bit.
11095 int V2AdjIndex = V2Index ^ 1;
11097 if (Mask[V2AdjIndex] < 0) {
11098 // Handles all the cases where we have a single V2 element and an undef.
11099 // This will only ever happen in the high lanes because we commute the
11100 // vector otherwise.
11101 if (V2Index < 2)
11102 std::swap(LowV, HighV);
11103 NewMask[V2Index] -= 4;
11104 } else {
11105 // Handle the case where the V2 element ends up adjacent to a V1 element.
11106 // To make this work, blend them together as the first step.
11107 int V1Index = V2AdjIndex;
11108 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11109 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11110 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11112 // Now proceed to reconstruct the final blend as we have the necessary
11113 // high or low half formed.
11114 if (V2Index < 2) {
11115 LowV = V2;
11116 HighV = V1;
11117 } else {
11118 HighV = V2;
11119 }
11120 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11121 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11123 } else if (NumV2Elements == 2) {
11124 if (Mask[0] < 4 && Mask[1] < 4) {
11125 // Handle the easy case where we have V1 in the low lanes and V2 in the
11126 // high lanes.
11127 NewMask[2] -= 4;
11128 NewMask[3] -= 4;
11129 } else if (Mask[2] < 4 && Mask[3] < 4) {
11130 // We also handle the reversed case because this utility may get called
11131 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11132 // arrange things in the right direction.
11133 NewMask[0] -= 4;
11134 NewMask[1] -= 4;
11135 HighV = V1;
11136 LowV = V2;
11137 } else {
11138 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11139 // trying to place elements directly, just blend them and set up the final
11140 // shuffle to place them.
11142 // The first two blend mask elements are for V1, the second two are for
11143 // V2.
11144 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11145 Mask[2] < 4 ? Mask[2] : Mask[3],
11146 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11147 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
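// Worked example (added for illustration): for Mask = {0, 4, 1, 5} this
// BlendMask is {0, 1, 0, 1}, the SHUFP below yields {V1[0], V1[1], V2[0],
// V2[1]}, and the final NewMask {0, 2, 1, 3} interleaves that into
// {V1[0], V2[0], V1[1], V2[1]} as requested.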
11148 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11149 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11151 // Now we do a normal shuffle of V1 by giving V1 as both operands to
11152 // a blend.
11153 LowV = HighV = V1;
11154 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11155 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11156 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11157 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11160 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11161 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11162 }
11164 /// \brief Lower 4-lane 32-bit floating point shuffles.
11166 /// Uses instructions exclusively from the floating point unit to minimize
11167 /// domain crossing penalties, as these are sufficient to implement all v4f32
11168 /// shuffles.
11169 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11170 const APInt &Zeroable,
11171 SDValue V1, SDValue V2,
11172 const X86Subtarget &Subtarget,
11173 SelectionDAG &DAG) {
11174 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11175 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11176 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11178 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11180 if (NumV2Elements == 0) {
11181 // Check for being able to broadcast a single element.
11182 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11183 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11184 return Broadcast;
11186 // Use even/odd duplicate instructions for masks that match their pattern.
11187 if (Subtarget.hasSSE3()) {
11188 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11189 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11190 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11191 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11194 if (Subtarget.hasAVX()) {
11195 // If we have AVX, we can use VPERMILPS which will allow folding a load
11196 // into the shuffle.
11197 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11198 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11201 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11202 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11203 if (!Subtarget.hasSSE2()) {
11204 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11205 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11206 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11207 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11210 // Otherwise, use a straight shuffle of a single input vector. We pass the
11211 // input vector to both operands to simulate this with a SHUFPS.
11212 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11213 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11216 // There are special ways we can lower some single-element blends. However, we
11217 // have custom ways we can lower more complex single-element blends below that
11218 // we defer to if both this and BLENDPS fail to match, so restrict this to
11219 // when the V2 input is targeting element 0 of the mask -- that is the fast
11220 // case here.
11221 if (NumV2Elements == 1 && Mask[0] >= 4)
11222 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11223 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11224 return V;
11226 if (Subtarget.hasSSE41()) {
11227 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11228 Zeroable, Subtarget, DAG))
11229 return Blend;
11231 // Use INSERTPS if we can complete the shuffle efficiently.
11232 if (SDValue V =
11233 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11234 return V;
11236 if (!isSingleSHUFPSMask(Mask))
11237 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11238 DL, MVT::v4f32, V1, V2, Mask, DAG))
11239 return BlendPerm;
11240 }
11242 // Use low/high mov instructions. These are only valid in SSE1 because
11243 // otherwise they are widened to v2f64 and never get here.
11244 if (!Subtarget.hasSSE2()) {
11245 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11246 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11247 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11248 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11249 }
11251 // Use dedicated unpack instructions for masks that match their pattern.
11252 if (SDValue V =
11253 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11254 return V;
11256 // Otherwise fall back to a SHUFPS lowering strategy.
11257 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11258 }
11260 /// \brief Lower 4-lane i32 vector shuffles.
11262 /// We try to handle these with integer-domain shuffles where we can, but for
11263 /// blends we use the floating point domain blend instructions.
11264 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11265 const APInt &Zeroable,
11266 SDValue V1, SDValue V2,
11267 const X86Subtarget &Subtarget,
11268 SelectionDAG &DAG) {
11269 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11270 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11271 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11273 // Whenever we can lower this as a zext, that instruction is strictly faster
11274 // than any alternative. It also allows us to fold memory operands into the
11275 // shuffle in many cases.
11276 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11277 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11278 return ZExt;
11280 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11282 if (NumV2Elements == 0) {
11283 // Check for being able to broadcast a single element.
11284 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11285 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11286 return Broadcast;
11288 // Straight shuffle of a single input vector. For everything from SSE2
11289 // onward this has a single fast instruction with no scary immediates.
11290 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11291 // but we aren't actually going to use the UNPCK instruction because doing
11292 // so prevents folding a load into this instruction or making a copy.
11293 const int UnpackLoMask[] = {0, 0, 1, 1};
11294 const int UnpackHiMask[] = {2, 2, 3, 3};
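// Added note: {0, 0, 1, 1} corresponds to PSHUFD immediate 0x50, which
// duplicates dwords 0 and 1 exactly like UNPCKLDQ of a register with itself,
// while still allowing the source operand to be folded from memory.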
11295 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11296 Mask = UnpackLoMask;
11297 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11298 Mask = UnpackHiMask;
11300 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11301 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11304 // Try to use shift instructions.
11305 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11306 Zeroable, Subtarget, DAG))
11307 return Shift;
11309 // There are special ways we can lower some single-element blends.
11310 if (NumV2Elements == 1)
11311 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11312 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11313 return V;
11315 // We have different paths for blend lowering, but they all must use the
11316 // *exact* same predicate.
11317 bool IsBlendSupported = Subtarget.hasSSE41();
11318 if (IsBlendSupported)
11319 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11320 Zeroable, Subtarget, DAG))
11321 return Blend;
11323 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11324 Zeroable, DAG))
11325 return Masked;
11327 // Use dedicated unpack instructions for masks that match their pattern.
11328 if (SDValue V =
11329 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11330 return V;
11332 // Try to use byte rotation instructions.
11333 // Its more profitable for pre-SSSE3 to use shuffles/unpacks.
11334 if (Subtarget.hasSSSE3()) {
11335 if (Subtarget.hasVLX())
11336 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11337 Mask, Subtarget, DAG))
11338 return Rotate;
11340 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11341 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11342 return Rotate;
11343 }
11345 // Assume that a single SHUFPS is faster than an alternative sequence of
11346 // multiple instructions (even if the CPU has a domain penalty).
11347 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11348 if (!isSingleSHUFPSMask(Mask)) {
11349 // If we have direct support for blends, we should lower by decomposing into
11350 // a permute. That will be faster than the domain cross.
11351 if (IsBlendSupported)
11352 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11353 Mask, DAG);
11355 // Try to lower by permuting the inputs into an unpack instruction.
11356 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11357 DL, MVT::v4i32, V1, V2, Mask, DAG))
11358 return Unpack;
11359 }
11361 // We implement this with SHUFPS because it can blend from two vectors.
11362 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11363 // up the inputs, bypassing domain shift penalties that we would incur if we
11364 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11365 // relevant.
11366 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11367 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11368 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11369 return DAG.getBitcast(MVT::v4i32, ShufPS);
11370 }
11372 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11373 /// shuffle lowering, and the most complex part.
11375 /// The lowering strategy is to try to form pairs of input lanes which are
11376 /// targeted at the same half of the final vector, and then use a dword shuffle
11377 /// to place them onto the right half, and finally unpack the paired lanes into
11378 /// their final position.
11380 /// The exact breakdown of how to form these dword pairs and align them on the
11381 /// correct sides is really tricky. See the comments within the function for
11382 /// more of the details.
11384 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11385 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11386 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11387 /// vector, form the analogous 128-bit 8-element Mask.
11388 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11389 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11390 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11391 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11392 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11394 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11395 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11396 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11398 // Attempt to directly match PSHUFLW or PSHUFHW.
11399 if (isUndefOrInRange(LoMask, 0, 4) &&
11400 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11401 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11402 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11404 if (isUndefOrInRange(HiMask, 4, 8) &&
11405 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11406 for (int i = 0; i != 4; ++i)
11407 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11408 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11409 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11412 SmallVector<int, 4> LoInputs;
11413 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11414 std::sort(LoInputs.begin(), LoInputs.end());
11415 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11416 SmallVector<int, 4> HiInputs;
11417 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11418 std::sort(HiInputs.begin(), HiInputs.end());
11419 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11420 int NumLToL =
11421 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11422 int NumHToL = LoInputs.size() - NumLToL;
11423 int NumLToH =
11424 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11425 int NumHToH = HiInputs.size() - NumLToH;
11426 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11427 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11428 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11429 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11431 // If we are shuffling values from one half - check how many different DWORD
11432 // pairs we need to create. If only 1 or 2 then we can perform this as a
11433 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11434 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11435 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11436 V = DAG.getNode(ShufWOp, DL, VT, V,
11437 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11438 V = DAG.getBitcast(PSHUFDVT, V);
11439 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11440 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11441 return DAG.getBitcast(VT, V);
11442 };
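// Added illustration: for a purely low-half mask such as {0, 1, 0, 1, 2, 3,
// 2, 3}, the loop below collects the dword pairs (0,1) and (2,3), and
// ShuffleDWordPairs then emits an identity PSHUFLW followed by a PSHUFD with
// {0, 0, 1, 1} to duplicate those pairs into place.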
11444 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11445 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11446 SmallVector<std::pair<int, int>, 4> DWordPairs;
11447 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11449 // Collect the different DWORD pairs.
11450 for (int DWord = 0; DWord != 4; ++DWord) {
11451 int M0 = Mask[2 * DWord + 0];
11452 int M1 = Mask[2 * DWord + 1];
11453 M0 = (M0 >= 0 ? M0 % 4 : M0);
11454 M1 = (M1 >= 0 ? M1 % 4 : M1);
11455 if (M0 < 0 && M1 < 0)
11456 continue;
11458 bool Match = false;
11459 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11460 auto &DWordPair = DWordPairs[j];
11461 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11462 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11463 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11464 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11465 PSHUFDMask[DWord] = DOffset + j;
11466 Match = true;
11467 break;
11468 }
11469 }
11470 if (!Match) {
11471 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11472 DWordPairs.push_back(std::make_pair(M0, M1));
11473 }
11474 }
11476 if (DWordPairs.size() <= 2) {
11477 DWordPairs.resize(2, std::make_pair(-1, -1));
11478 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11479 DWordPairs[1].first, DWordPairs[1].second};
11480 if ((NumHToL + NumHToH) == 0)
11481 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11482 if ((NumLToL + NumLToH) == 0)
11483 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11484 }
11485 }
11487 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11488 // such inputs we can swap two of the dwords across the half mark and end up
11489 // with <=2 inputs to each half in each half. Once there, we can fall through
11490 // to the generic code below. For example:
11492 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11493 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11495 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11496 // and an existing 2-into-2 on the other half. In this case we may have to
11497 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11498 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11499 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11500 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11501 // half than the one we target for fixing) will be fixed when we re-enter this
11502 // path. We will also combine away any sequence of PSHUFD instructions that
11503 // result into a single instruction. Here is an example of the tricky case:
11505 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11506 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11508 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11510 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11511 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11513 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11514 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11516 // The result is fine to be handled by the generic logic.
11517 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11518 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11519 int AOffset, int BOffset) {
11520 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11521 "Must call this with A having 3 or 1 inputs from the A half.");
11522 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11523 "Must call this with B having 1 or 3 inputs from the B half.");
11524 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11525 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11527 bool ThreeAInputs = AToAInputs.size() == 3;
11529 // Compute the index of dword with only one word among the three inputs in
11530 // a half by taking the sum of the half with three inputs and subtracting
11531 // the sum of the actual three inputs. The difference is the remaining
11532 // slot.
11533 int ADWord, BDWord;
11534 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11535 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11536 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11537 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11538 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11539 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11540 int TripleNonInputIdx =
11541 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11542 TripleDWord = TripleNonInputIdx / 2;
11544 // We use xor with one to compute the adjacent DWord to whichever one the
11545 // OneInput is in.
11546 OneInputDWord = (OneInput / 2) ^ 1;
11548 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11549 // and BToA inputs. If there is also such a problem with the BToB and AToB
11550 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11551 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11552 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11553 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11554 // Compute how many inputs will be flipped by swapping these DWords. We need
11556 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
11558 int NumFlippedAToBInputs =
11559 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11560 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11561 int NumFlippedBToBInputs =
11562 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11563 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11564 if ((NumFlippedAToBInputs == 1 &&
11565 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11566 (NumFlippedBToBInputs == 1 &&
11567 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11568 // We choose whether to fix the A half or B half based on whether that
11569 // half has zero flipped inputs. At zero, we may not be able to fix it
11570 // with that half. We also bias towards fixing the B half because that
11571 // will more commonly be the high half, and we have to bias one way.
11572 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11573 ArrayRef<int> Inputs) {
11574 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11575 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11576 // Determine whether the free index is in the flipped dword or the
11577 // unflipped dword based on where the pinned index is. We use this bit
11578 // in an xor to conditionally select the adjacent dword.
11579 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11580 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11581 if (IsFixIdxInput == IsFixFreeIdxInput)
11582 FixFreeIdx += 1;
11583 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11584 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11585 "We need to be changing the number of flipped inputs!");
11586 int PSHUFHalfMask[] = {0, 1, 2, 3};
11587 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11588 V = DAG.getNode(
11589 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11590 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11591 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11593 for (int &M : Mask)
11594 if (M >= 0 && M == FixIdx)
11595 M = FixFreeIdx;
11596 else if (M >= 0 && M == FixFreeIdx)
11597 M = FixIdx;
11598 };
11599 if (NumFlippedBToBInputs != 0) {
11600 int BPinnedIdx =
11601 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11602 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11603 } else {
11604 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11605 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11606 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11607 }
11608 }
11609 }
11611 int PSHUFDMask[] = {0, 1, 2, 3};
11612 PSHUFDMask[ADWord] = BDWord;
11613 PSHUFDMask[BDWord] = ADWord;
11614 V = DAG.getBitcast(
11615 VT,
11616 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11617 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11619 // Adjust the mask to match the new locations of A and B.
11620 for (int &M : Mask)
11621 if (M >= 0 && M/2 == ADWord)
11622 M = 2 * BDWord + M % 2;
11623 else if (M >= 0 && M/2 == BDWord)
11624 M = 2 * ADWord + M % 2;
11626 // Recurse back into this routine to re-compute state now that this isn't
11627 // a 3 and 1 problem.
11628 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11629 DAG);
11630 };
11631 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11632 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11633 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11634 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11636 // At this point there are at most two inputs to the low and high halves from
11637 // each half. That means the inputs can always be grouped into dwords and
11638 // those dwords can then be moved to the correct half with a dword shuffle.
11639 // We use at most one low and one high word shuffle to collect these paired
11640 // inputs into dwords, and finally a dword shuffle to place them.
11641 int PSHUFLMask[4] = {-1, -1, -1, -1};
11642 int PSHUFHMask[4] = {-1, -1, -1, -1};
11643 int PSHUFDMask[4] = {-1, -1, -1, -1};
11645 // First fix the masks for all the inputs that are staying in their
11646 // original halves. This will then dictate the targets of the cross-half
11647 // shuffles.
11648 auto fixInPlaceInputs =
11649 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11650 MutableArrayRef<int> SourceHalfMask,
11651 MutableArrayRef<int> HalfMask, int HalfOffset) {
11652 if (InPlaceInputs.empty())
11653 return;
11654 if (InPlaceInputs.size() == 1) {
11655 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11656 InPlaceInputs[0] - HalfOffset;
11657 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11658 return;
11659 }
11660 if (IncomingInputs.empty()) {
11661 // Just fix all of the in place inputs.
11662 for (int Input : InPlaceInputs) {
11663 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11664 PSHUFDMask[Input / 2] = Input / 2;
11665 }
11666 return;
11667 }
11669 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11670 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11671 InPlaceInputs[0] - HalfOffset;
11672 // Put the second input next to the first so that they are packed into
11673 // a dword. We find the adjacent index by toggling the low bit.
11674 int AdjIndex = InPlaceInputs[0] ^ 1;
11675 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11676 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11677 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11678 };
11679 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11680 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11682 // Now gather the cross-half inputs and place them into a free dword of
11683 // their target half.
11684 // FIXME: This operation could almost certainly be simplified dramatically to
11685 // look more like the 3-1 fixing operation.
11686 auto moveInputsToRightHalf = [&PSHUFDMask](
11687 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11688 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11689 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11690 int DestOffset) {
11691 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11692 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11693 };
11694 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11695 int Word) {
11696 int LowWord = Word & ~1;
11697 int HighWord = Word | 1;
11698 return isWordClobbered(SourceHalfMask, LowWord) ||
11699 isWordClobbered(SourceHalfMask, HighWord);
11700 };
11702 if (IncomingInputs.empty())
11703 return;
11705 if (ExistingInputs.empty()) {
11706 // Map any dwords with inputs from them into the right half.
11707 for (int Input : IncomingInputs) {
11708 // If the source half mask maps over the inputs, turn those into
11709 // swaps and use the swapped lane.
11710 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11711 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11712 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11713 Input - SourceOffset;
11714 // We have to swap the uses in our half mask in one sweep.
11715 for (int &M : HalfMask)
11716 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11717 M = Input;
11718 else if (M == Input)
11719 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11720 } else {
11721 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11722 Input - SourceOffset &&
11723 "Previous placement doesn't match!");
11725 // Note that this correctly re-maps both when we do a swap and when
11726 // we observe the other side of the swap above. We rely on that to
11727 // avoid swapping the members of the input list directly.
11728 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11731 // Map the input's dword into the correct half.
11732 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11733 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11734 else
11735 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11736 Input / 2 &&
11737 "Previous placement doesn't match!");
11738 }
11740 // And just directly shift any other-half mask elements to be same-half
11741 // as we will have mirrored the dword containing the element into the
11742 // same position within that half.
11743 for (int &M : HalfMask)
11744 if (M >= SourceOffset && M < SourceOffset + 4) {
11745 M = M - SourceOffset + DestOffset;
11746 assert(M >= 0 && "This should never wrap below zero!");
11747 }
11748 return;
11749 }
11751 // Ensure we have the input in a viable dword of its current half. This
11752 // is particularly tricky because the original position may be clobbered
11753 // by inputs being moved and *staying* in that half.
11754 if (IncomingInputs.size() == 1) {
11755 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11756 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11757 SourceOffset;
11758 SourceHalfMask[InputFixed - SourceOffset] =
11759 IncomingInputs[0] - SourceOffset;
11760 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11761 InputFixed);
11762 IncomingInputs[0] = InputFixed;
11763 }
11764 } else if (IncomingInputs.size() == 2) {
11765 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11766 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11767 // We have two non-adjacent or clobbered inputs we need to extract from
11768 // the source half. To do this, we need to map them into some adjacent
11769 // dword slot in the source mask.
11770 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11771 IncomingInputs[1] - SourceOffset};
11773 // If there is a free slot in the source half mask adjacent to one of
11774 // the inputs, place the other input in it. We use (Index XOR 1) to
11775 // compute an adjacent index.
11776 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11777 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11778 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11779 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11780 InputsFixed[1] = InputsFixed[0] ^ 1;
11781 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11782 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11783 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11784 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11785 InputsFixed[0] = InputsFixed[1] ^ 1;
11786 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11787 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11788 // The two inputs are in the same DWord but it is clobbered and the
11789 // adjacent DWord isn't used at all. Move both inputs to the free
11790 // slot.
11791 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11792 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11793 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11794 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11795 } else {
11796 // The only way we hit this point is if there is no clobbering
11797 // (because there are no off-half inputs to this half) and there is no
11798 // free slot adjacent to one of the inputs. In this case, we have to
11799 // swap an input with a non-input.
11800 for (int i = 0; i < 4; ++i)
11801 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11802 "We can't handle any clobbers here!");
11803 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11804 "Cannot have adjacent inputs here!");
11806 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11807 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11809 // We also have to update the final source mask in this case because
11810 // it may need to undo the above swap.
11811 for (int &M : FinalSourceHalfMask)
11812 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11813 M = InputsFixed[1] + SourceOffset;
11814 else if (M == InputsFixed[1] + SourceOffset)
11815 M = (InputsFixed[0] ^ 1) + SourceOffset;
11817 InputsFixed[1] = InputsFixed[0] ^ 1;
11820 // Point everything at the fixed inputs.
11821 for (int &M : HalfMask)
11822 if (M == IncomingInputs[0])
11823 M = InputsFixed[0] + SourceOffset;
11824 else if (M == IncomingInputs[1])
11825 M = InputsFixed[1] + SourceOffset;
11827 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11828 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11829 }
11830 } else {
11831 llvm_unreachable("Unhandled input size!");
11832 }
11834 // Now hoist the DWord down to the right half.
11835 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11836 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11837 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11838 for (int &M : HalfMask)
11839 for (int Input : IncomingInputs)
11840 if (M == Input)
11841 M = FreeDWord * 2 + Input % 2;
11842 };
11843 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11844 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11845 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11846 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11848 // Now enact all the shuffles we've computed to move the inputs into their
11849 // target half.
11850 if (!isNoopShuffleMask(PSHUFLMask))
11851 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11852 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11853 if (!isNoopShuffleMask(PSHUFHMask))
11854 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11855 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11856 if (!isNoopShuffleMask(PSHUFDMask))
11857 V = DAG.getBitcast(
11858 VT,
11859 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11860 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11862 // At this point, each half should contain all its inputs, and we can then
11863 // just shuffle them into their final position.
11864 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11865 "Failed to lift all the high half inputs to the low mask!");
11866 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11867 "Failed to lift all the low half inputs to the high mask!");
11869 // Do a half shuffle for the low mask.
11870 if (!isNoopShuffleMask(LoMask))
11871 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11872 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11874 // Do a half shuffle with the high mask after shifting its values down.
11875 for (int &M : HiMask)
11876 if (M >= 0)
11877 M -= 4;
11878 if (!isNoopShuffleMask(HiMask))
11879 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11880 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11882 return V;
11883 }
11885 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11886 /// blend if only one input is used.
11887 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11888 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11889 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11890 bool &V2InUse) {
11891 SDValue V1Mask[16];
11892 SDValue V2Mask[16];
11893 V1InUse = false;
11894 V2InUse = false;
11896 int Size = Mask.size();
11897 int Scale = 16 / Size;
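// Added example: for a v8i16 shuffle, Size == 8 and Scale == 2, so mask
// element 9 (element 1 of V2) expands to V1 byte selectors of 0x80 (zero)
// and V2 byte selectors 2 and 3 in the PSHUFB control vectors built below.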
11898 for (int i = 0; i < 16; ++i) {
11899 if (Mask[i / Scale] < 0) {
11900 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11901 } else {
11902 const int ZeroMask = 0x80;
11903 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11904 : ZeroMask;
11905 int V2Idx = Mask[i / Scale] < Size
11906 ? ZeroMask
11907 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11908 if (Zeroable[i / Scale])
11909 V1Idx = V2Idx = ZeroMask;
11910 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11911 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11912 V1InUse |= (ZeroMask != V1Idx);
11913 V2InUse |= (ZeroMask != V2Idx);
11914 }
11915 }
11917 if (V1InUse)
11918 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11919 DAG.getBitcast(MVT::v16i8, V1),
11920 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11921 if (V2InUse)
11922 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11923 DAG.getBitcast(MVT::v16i8, V2),
11924 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11926 // If we need shuffled inputs from both, blend the two.
11927 SDValue V;
11928 if (V1InUse && V2InUse)
11929 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11930 else
11931 V = V1InUse ? V1 : V2;
11933 // Cast the result back to the correct type.
11934 return DAG.getBitcast(VT, V);
11935 }
11937 /// \brief Generic lowering of 8-lane i16 shuffles.
11939 /// This handles both single-input shuffles and combined shuffle/blends with
11940 /// two inputs. The single input shuffles are immediately delegated to
11941 /// a dedicated lowering routine.
11943 /// The blends are lowered in one of three fundamental ways. If there are few
11944 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11945 /// of the input is significantly cheaper when lowered as an interleaving of
11946 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11947 /// halves of the inputs separately (making them have relatively few inputs)
11948 /// and then concatenate them.
11949 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11950 const APInt &Zeroable,
11951 SDValue V1, SDValue V2,
11952 const X86Subtarget &Subtarget,
11953 SelectionDAG &DAG) {
11954 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11955 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11956 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11958 // Whenever we can lower this as a zext, that instruction is strictly faster
11959 // than any alternative.
11960 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11961 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11962 return ZExt;
11964 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11966 if (NumV2Inputs == 0) {
11967 // Check for being able to broadcast a single element.
11968 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11969 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11970 return Broadcast;
11972 // Try to use shift instructions.
11973 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11974 Zeroable, Subtarget, DAG))
11975 return Shift;
11977 // Use dedicated unpack instructions for masks that match their pattern.
11978 if (SDValue V =
11979 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11980 return V;
11982 // Use dedicated pack instructions for masks that match their pattern.
11983 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
11984 DAG, Subtarget))
11985 return V;
11987 // Try to use byte rotation instructions.
11988 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11989 Mask, Subtarget, DAG))
11990 return Rotate;
11992 // Make a copy of the mask so it can be modified.
11993 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11994 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11995 MutableMask, Subtarget,
11996 DAG);
11997 }
11999 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12000 "All single-input shuffles should be canonicalized to be V1-input "
12003 // Try to use shift instructions.
12004 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12005 Zeroable, Subtarget, DAG))
12006 return Shift;
12008 // See if we can use SSE4A Extraction / Insertion.
12009 if (Subtarget.hasSSE4A())
12010 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12011 Zeroable, DAG))
12012 return V;
12014 // There are special ways we can lower some single-element blends.
12015 if (NumV2Inputs == 1)
12016 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12017 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12018 return V;
12020 // We have different paths for blend lowering, but they all must use the
12021 // *exact* same predicate.
12022 bool IsBlendSupported = Subtarget.hasSSE41();
12023 if (IsBlendSupported)
12024 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12025 Zeroable, Subtarget, DAG))
12026 return Blend;
12028 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12029 Zeroable, DAG))
12030 return Masked;
12032 // Use dedicated unpack instructions for masks that match their pattern.
12033 if (SDValue V =
12034 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12035 return V;
12037 // Use dedicated pack instructions for masks that match their pattern.
12038 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12039 Subtarget))
12040 return V;
12042 // Try to use byte rotation instructions.
12043 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12044 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12045 return Rotate;
12047 if (SDValue BitBlend =
12048 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12049 return BitBlend;
12051 // Try to lower by permuting the inputs into an unpack instruction.
12052 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12053 V2, Mask, DAG))
12054 return Unpack;
12056 // If we can't directly blend but can use PSHUFB, that will be better as it
12057 // can both shuffle and set up the inefficient blend.
12058 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12059 bool V1InUse, V2InUse;
12060 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12061 Zeroable, DAG, V1InUse, V2InUse);
12062 }
12064 // We can always bit-blend if we have to so the fallback strategy is to
12065 // decompose into single-input permutes and blends.
12066 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12067 Mask, DAG);
12068 }
12070 /// \brief Check whether a compaction lowering can be done by dropping even
12071 /// elements and compute how many times even elements must be dropped.
12073 /// This handles shuffles which take every Nth element where N is a power of
12074 /// two. Example shuffle masks:
12076 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12077 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12078 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12079 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12080 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12081 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12083 /// Any of these lanes can of course be undef.
12085 /// This routine only supports N <= 3.
12086 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12087 /// for anything but N <= 3.
12089 /// \returns N above, or the number of times even elements must be dropped if
12090 /// there is such a number. Otherwise returns zero.
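/// For instance (illustrative note, not original documentation): with a
/// two-input v16i8 mask the modulus is 32, so for N = 1 every defined element
/// must satisfy Mask[i] == (2 * i) % 32, which matches the second N = 1
/// example above.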
12091 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12092 bool IsSingleInput) {
12093 // The modulus for the shuffle vector entries is based on whether this is
12094 // a single input or not.
12095 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12096 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12097 "We should only be called with masks with a power-of-2 size!");
12099 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12101 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12102 // and 2^3 simultaneously. This is because we may have ambiguity with
12103 // partially undef inputs.
12104 bool ViableForN[3] = {true, true, true};
12106 for (int i = 0, e = Mask.size(); i < e; ++i) {
12107 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12108 // want.
12109 if (Mask[i] < 0)
12110 continue;
12112 bool IsAnyViable = false;
12113 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12114 if (ViableForN[j]) {
12115 uint64_t N = j + 1;
12117 // The shuffle mask must be equal to (i * 2^N) % M.
12118 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12119 IsAnyViable = true;
12120 else
12121 ViableForN[j] = false;
12122 }
12123 // Early exit if we exhaust the possible powers of two.
12124 if (!IsAnyViable)
12125 break;
12126 }
12128 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12129 if (ViableForN[j])
12130 return j + 1;
12132 // Return 0 as there is no viable power of two.
12133 return 0;
12134 }
12136 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12137 ArrayRef<int> Mask, SDValue V1,
12138 SDValue V2, SelectionDAG &DAG) {
12139 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12140 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12142 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12143 if (V2.isUndef())
12144 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12146 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12147 }
12149 /// \brief Generic lowering of v16i8 shuffles.
12151 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12152 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12153 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12154 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12155 /// back together.
12156 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12157 const APInt &Zeroable,
12158 SDValue V1, SDValue V2,
12159 const X86Subtarget &Subtarget,
12160 SelectionDAG &DAG) {
12161 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12162 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12163 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12165 // Try to use shift instructions.
12166 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12167 Zeroable, Subtarget, DAG))
12168 return Shift;
12170 // Try to use byte rotation instructions.
12171 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12172 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12173 return Rotate;
12175 // Use dedicated pack instructions for masks that match their pattern.
12176 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12177 Subtarget))
12178 return V;
12180 // Try to use a zext lowering.
12181 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12182 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12183 return ZExt;
12185 // See if we can use SSE4A Extraction / Insertion.
12186 if (Subtarget.hasSSE4A())
12187 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12188 Zeroable, DAG))
12189 return V;
12191 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12193 // For single-input shuffles, there are some nicer lowering tricks we can use.
12194 if (NumV2Elements == 0) {
12195 // Check for being able to broadcast a single element.
12196 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12197 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12198 return Broadcast;
12200 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12201 // Notably, this handles splat and partial-splat shuffles more efficiently.
12202 // However, it only makes sense if the pre-duplication shuffle simplifies
12203 // things significantly. Currently, this means we need to be able to
12204 // express the pre-duplication shuffle as an i16 shuffle.
12206 // FIXME: We should check for other patterns which can be widened into an
12207 // i16 shuffle as well.
12208 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12209 for (int i = 0; i < 16; i += 2)
12210 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12211 return false;
12213 return true;
12214 };
12215 auto tryToWidenViaDuplication = [&]() -> SDValue {
12216 if (!canWidenViaDuplication(Mask))
12217 return SDValue();
12218 SmallVector<int, 4> LoInputs;
12219 copy_if(Mask, std::back_inserter(LoInputs),
12220 [](int M) { return M >= 0 && M < 8; });
12221 std::sort(LoInputs.begin(), LoInputs.end());
12222 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12223 LoInputs.end());
12224 SmallVector<int, 4> HiInputs;
12225 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12226 std::sort(HiInputs.begin(), HiInputs.end());
12227 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12228 HiInputs.end());
12230 bool TargetLo = LoInputs.size() >= HiInputs.size();
12231 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12232 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12234 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12235 SmallDenseMap<int, int, 8> LaneMap;
12236 for (int I : InPlaceInputs) {
12237 PreDupI16Shuffle[I/2] = I/2;
12238 LaneMap[I] = I;
12239 }
12240 int j = TargetLo ? 0 : 4, je = j + 4;
12241 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12242 // Check if j is already a shuffle of this input. This happens when
12243 // there are two adjacent bytes after we move the low one.
12244 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12245 // If we haven't yet mapped the input, search for a slot into which
12246 // we can map it.
12247 while (j < je && PreDupI16Shuffle[j] >= 0)
12248 ++j;
12250 if (j == je)
12251 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12252 return SDValue();
12254 // Map this input with the i16 shuffle.
12255 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12258 // Update the lane map based on the mapping we ended up with.
12259 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12260 }
12261 V1 = DAG.getBitcast(
12262 MVT::v16i8,
12263 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12264 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12266 // Unpack the bytes to form the i16s that will be shuffled into place.
12267 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12268 MVT::v16i8, V1, V1);
12270 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12271 for (int i = 0; i < 16; ++i)
12272 if (Mask[i] >= 0) {
12273 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12274 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12275 if (PostDupI16Shuffle[i / 2] < 0)
12276 PostDupI16Shuffle[i / 2] = MappedMask;
12277 else
12278 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12279 "Conflicting entries in the original shuffle!");
12281 return DAG.getBitcast(
12283 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12284 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12285 };
12286 if (SDValue V = tryToWidenViaDuplication())
12287 return V;
12288 }
12290 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12291 Zeroable, DAG))
12292 return Masked;
12294 // Use dedicated unpack instructions for masks that match their pattern.
12295 if (SDValue V =
12296 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12297 return V;
12299 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12300 // with PSHUFB. It is important to do this before we attempt to generate any
12301 // blends but after all of the single-input lowerings. If the single input
12302 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12303 // want to preserve that and we can DAG combine any longer sequences into
12304 // a PSHUFB in the end. But once we start blending from multiple inputs,
12305 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12306 // and there are *very* few patterns that would actually be faster than the
12307 // PSHUFB approach because of its ability to zero lanes.
12309 // FIXME: The only exceptions to the above are blends which are exact
12310 // interleavings with direct instructions supporting them. We currently don't
12311 // handle those well here.
12312 if (Subtarget.hasSSSE3()) {
12313 bool V1InUse = false;
12314 bool V2InUse = false;
12316 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12317 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12319 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12320 // do so. This avoids using them to handle blends-with-zero which is
12321 // important as a single pshufb is significantly faster for that.
12322 if (V1InUse && V2InUse) {
12323 if (Subtarget.hasSSE41())
12324 if (SDValue Blend = lowerVectorShuffleAsBlend(
12325 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12326 return Blend;
12328 // We can use an unpack to do the blending rather than an or in some
12329 // cases. Even though the or may be (very minorly) more efficient, we
12330 // preference this lowering because there are common cases where part of
12331 // the complexity of the shuffles goes away when we do the final blend as
12333 // FIXME: It might be worth trying to detect if the unpack-feeding
12334 // shuffles will both be pshufb, in which case we shouldn't bother with
12335 // this.
12336 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12337 DL, MVT::v16i8, V1, V2, Mask, DAG))
12338 return Unpack;
12340 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12341 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12342 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12343 }
12345 return PSHUFB;
12346 }
12348 // There are special ways we can lower some single-element blends.
12349 if (NumV2Elements == 1)
12350 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12351 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12352 return V;
12354 if (SDValue BitBlend =
12355 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12356 return BitBlend;
12358 // Check whether a compaction lowering can be done. This handles shuffles
12359 // which take every Nth element for some even N. See the helper function for
12360 // details.
12362 // We special case these as they can be particularly efficiently handled with
12363 // the PACKUSB instruction on x86 and they show up in common patterns of
12364 // rearranging bytes to truncate wide elements.
12365 bool IsSingleInput = V2.isUndef();
12366 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12367 // NumEvenDrops is the power of two stride of the elements. Another way of
12368 // thinking about it is that we need to drop the even elements this many
12369 // times to get the original input.
12371 // First we need to zero all the dropped bytes.
12372 assert(NumEvenDrops <= 3 &&
12373 "No support for dropping even elements more than 3 times.");
12374 // We use the mask type to pick which bytes are preserved based on how many
12375 // elements are dropped.
12376 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12377 SDValue ByteClearMask = DAG.getBitcast(
12378 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12379 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12380 if (!IsSingleInput)
12381 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12383 // Now pack things back together.
12384 V1 = DAG.getBitcast(MVT::v8i16, V1);
12385 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12386 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12387 for (int i = 1; i < NumEvenDrops; ++i) {
12388 Result = DAG.getBitcast(MVT::v8i16, Result);
12389 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12395 // Handle multi-input cases by blending single-input shuffles.
12396 if (NumV2Elements > 0)
12397 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12400 // The fallback path for single-input shuffles widens this into two v8i16
12401 // vectors with unpacks, shuffles those, and then pulls them back together
12402 // with a pack.
12404 SDValue V = V1;
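// In that widened form each original byte index maps onto a word index into
// the concatenated halves, and the trailing PACKUS drops the zero-extension
// byte of every word again.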
12405 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12406 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12407 for (int i = 0; i < 16; ++i)
12409 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12411 SDValue VLoHalf, VHiHalf;
12412 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12413 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12414 // vXi16s.
12415 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12416 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12417 // Use a mask to drop the high bytes.
12418 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12419 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12420 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12422 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12423 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12425 // Squash the masks to point directly into VLoHalf.
12426 for (int &M : LoBlendMask)
12427   if (M >= 0)
12428     M /= 2;
12429 for (int &M : HiBlendMask)
12430   if (M >= 0)
12431     M /= 2;
12432 } else {
12433 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12434 // VHiHalf so that we can blend them as i16s.
12435 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12437 VLoHalf = DAG.getBitcast(
12438 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12439 VHiHalf = DAG.getBitcast(
12440 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12443 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12444 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12446 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12449 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12451 /// This routine breaks down the specific type of 128-bit shuffle and
12452 /// dispatches to the lowering routines accordingly.
12453 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12454 MVT VT, SDValue V1, SDValue V2,
12455 const APInt &Zeroable,
12456 const X86Subtarget &Subtarget,
12457 SelectionDAG &DAG) {
12458 switch (VT.SimpleTy) {
12460 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12462 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12464 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12466 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12468 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12470 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12473 llvm_unreachable("Unimplemented!");
12477 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
12479 /// This routine just extracts two subvectors, shuffles them independently, and
12480 /// then concatenates them back together. This should work effectively with all
12481 /// AVX vector shuffle types.
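/// For example, a v8i32 shuffle is handled as two independent v4i32 shuffles
/// of the extracted 128-bit halves (each a 4-way blend of the four half
/// vectors), whose results are then concatenated back into a v8i32.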
12482 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12483 SDValue V2, ArrayRef<int> Mask,
12484 SelectionDAG &DAG) {
12485 assert(VT.getSizeInBits() >= 256 &&
12486 "Only for 256-bit or wider vector shuffles!");
12487 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12488 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12490 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12491 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12493 int NumElements = VT.getVectorNumElements();
12494 int SplitNumElements = NumElements / 2;
12495 MVT ScalarVT = VT.getVectorElementType();
12496 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12498 // Rather than splitting build-vectors, just build two narrower build
12499 // vectors. This helps shuffling with splats and zeros.
12500 auto SplitVector = [&](SDValue V) {
12501 V = peekThroughBitcasts(V);
12503 MVT OrigVT = V.getSimpleValueType();
12504 int OrigNumElements = OrigVT.getVectorNumElements();
12505 int OrigSplitNumElements = OrigNumElements / 2;
12506 MVT OrigScalarVT = OrigVT.getVectorElementType();
12507 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12511 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12513 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12514 DAG.getIntPtrConstant(0, DL));
12515 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12516 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12519 SmallVector<SDValue, 16> LoOps, HiOps;
12520 for (int i = 0; i < OrigSplitNumElements; ++i) {
12521 LoOps.push_back(BV->getOperand(i));
12522 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12524 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12525 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12527 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12528 DAG.getBitcast(SplitVT, HiV));
12531 SDValue LoV1, HiV1, LoV2, HiV2;
12532 std::tie(LoV1, HiV1) = SplitVector(V1);
12533 std::tie(LoV2, HiV2) = SplitVector(V2);
12535 // Now create two 4-way blends of these half-width vectors.
12536 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12537 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12538 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12539 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12540 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12541 for (int i = 0; i < SplitNumElements; ++i) {
12542 int M = HalfMask[i];
12543 if (M >= NumElements) {
12544 if (M >= NumElements + SplitNumElements)
12548 V2BlendMask[i] = M - NumElements;
12549 BlendMask[i] = SplitNumElements + i;
12550 } else if (M >= 0) {
12551 if (M >= SplitNumElements)
12555 V1BlendMask[i] = M;
12560 // Because the lowering happens after all combining takes place, we need to
12561 // manually combine these blend masks as much as possible so that we create
12562 // a minimal number of high-level vector shuffle nodes.
12564 // First try just blending the halves of V1 or V2.
12565 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12566 return DAG.getUNDEF(SplitVT);
12567 if (!UseLoV2 && !UseHiV2)
12568 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12569 if (!UseLoV1 && !UseHiV1)
12570 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12572 SDValue V1Blend, V2Blend;
12573 if (UseLoV1 && UseHiV1) {
12575 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12577 // We only use half of V1 so map the usage down into the final blend mask.
12578 V1Blend = UseLoV1 ? LoV1 : HiV1;
12579 for (int i = 0; i < SplitNumElements; ++i)
12580 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12581 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12583 if (UseLoV2 && UseHiV2) {
12585 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12587 // We only use half of V2 so map the usage down into the final blend mask.
12588 V2Blend = UseLoV2 ? LoV2 : HiV2;
12589 for (int i = 0; i < SplitNumElements; ++i)
12590 if (BlendMask[i] >= SplitNumElements)
12591 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12593 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12595 SDValue Lo = HalfBlend(LoMask);
12596 SDValue Hi = HalfBlend(HiMask);
12597 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12600 /// \brief Either split a vector in halves or decompose the shuffles and the
12601 /// blend.
12603 /// This is provided as a good fallback for many lowerings of non-single-input
12604 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12605 /// between splitting the shuffle into 128-bit components and stitching those
12606 /// back together vs. extracting the single-input shuffles and blending those
12607 /// results.
12608 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12609 SDValue V1, SDValue V2,
12610 ArrayRef<int> Mask,
12611 SelectionDAG &DAG) {
12612 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12613 "shuffles as it could then recurse on itself.");
12614 int Size = Mask.size();
12616 // If this can be modeled as a broadcast of two elements followed by a blend,
12617 // prefer that lowering. This is especially important because broadcasts can
12618 // often fold with memory operands.
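// For example, the v4f64 mask <2,6,2,6> is just a broadcast of element 2 of
// V1 and element 2 of V2 followed by a blend of the two splats.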
12619 auto DoBothBroadcast = [&] {
12620 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12623 if (V2BroadcastIdx < 0)
12624 V2BroadcastIdx = M - Size;
12625 else if (M - Size != V2BroadcastIdx)
12627 } else if (M >= 0) {
12628 if (V1BroadcastIdx < 0)
12629 V1BroadcastIdx = M;
12630 else if (M != V1BroadcastIdx)
12635 if (DoBothBroadcast())
12636 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12639 // If the inputs all stem from a single 128-bit lane of each input, then we
12640 // split them rather than blending because the split will decompose to
12641 // unusually few instructions.
12642 int LaneCount = VT.getSizeInBits() / 128;
12643 int LaneSize = Size / LaneCount;
12644 SmallBitVector LaneInputs[2];
12645 LaneInputs[0].resize(LaneCount, false);
12646 LaneInputs[1].resize(LaneCount, false);
12647 for (int i = 0; i < Size; ++i)
12649 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12650 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12651 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12653 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12654 // that the decomposed single-input shuffles don't end up here.
12655 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12658 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12659 /// a permutation and blend of those lanes.
12661 /// This essentially blends the out-of-lane inputs to each lane into the lane
12662 /// from a permuted copy of the vector. This lowering strategy results in four
12663 /// instructions in the worst case for a single-input cross lane shuffle which
12664 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12665 /// of. Special cases for each particular shuffle pattern should be handled
12666 /// prior to trying this lowering.
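/// For example, the single-input v4f64 mask <2,1,0,3> can be handled by first
/// building a copy of the vector with its 128-bit halves swapped and then
/// shuffling the original against that flipped copy with the in-lane mask
/// <4,1,6,3>.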
12667 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12668 SDValue V1, SDValue V2,
12669 ArrayRef<int> Mask,
12671 const X86Subtarget &Subtarget) {
12672 // FIXME: This should probably be generalized for 512-bit vectors as well.
12673 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12674 int Size = Mask.size();
12675 int LaneSize = Size / 2;
12677 // If there are only inputs from one 128-bit lane, splitting will in fact be
12678 // less expensive. The flags track whether the given lane contains an element
12679 // that crosses to another lane.
12680 if (!Subtarget.hasAVX2()) {
12681 bool LaneCrossing[2] = {false, false};
12682 for (int i = 0; i < Size; ++i)
12683 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12684 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12685 if (!LaneCrossing[0] || !LaneCrossing[1])
12686 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12688 bool LaneUsed[2] = {false, false};
12689 for (int i = 0; i < Size; ++i)
12691 LaneUsed[(Mask[i] / LaneSize)] = true;
12692 if (!LaneUsed[0] || !LaneUsed[1])
12693 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12696 assert(V2.isUndef() &&
12697 "This last part of this routine only works on single input shuffles");
12699 SmallVector<int, 32> FlippedBlendMask(Size);
12700 for (int i = 0; i < Size; ++i)
12701 FlippedBlendMask[i] =
12702 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12704 : Mask[i] % LaneSize +
12705 (i / LaneSize) * LaneSize + Size);
12707 // Flip the vector, and blend the results which should now be in-lane.
12708 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12709 SDValue Flipped = DAG.getBitcast(PVT, V1);
12710 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12712 Flipped = DAG.getBitcast(VT, Flipped);
12713 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12716 /// \brief Handle lowering 2-lane 128-bit shuffles.
12717 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12718 SDValue V2, ArrayRef<int> Mask,
12719 const APInt &Zeroable,
12720 const X86Subtarget &Subtarget,
12721 SelectionDAG &DAG) {
12722 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12723 if (Subtarget.hasAVX2() && V2.isUndef())
12726 SmallVector<int, 4> WidenedMask;
12727 if (!canWidenShuffleElements(Mask, WidenedMask))
12730 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12731 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12733 // Try to use an insert into a zero vector.
12734 if (WidenedMask[0] == 0 && IsHighZero) {
12735 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12736 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12737 DAG.getIntPtrConstant(0, DL));
12738 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
12739 getZeroVector(VT, Subtarget, DAG, DL), LoV,
12740 DAG.getIntPtrConstant(0, DL));
12743 // TODO: If minimizing size and one of the inputs is a zero vector and
12744 // the zero vector has only one use, we could use a VPERM2X128 to save the
12745 // instruction bytes needed to explicitly generate the zero vector.
12747 // Blends are faster and handle all the non-lane-crossing cases.
12748 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12749 Zeroable, Subtarget, DAG))
12752 // If either input operand is a zero vector, use VPERM2X128 because its mask
12753 // allows us to replace the zero input with an implicit zero.
12754 if (!IsLowZero && !IsHighZero) {
12755 // Check for patterns which can be matched with a single insert of a 128-bit
12756 // subvector.
12757 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12758 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12760 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12761 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12762 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12763 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12764 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12765 OnlyUsesV1 ? V1 : V2,
12766 DAG.getIntPtrConstant(0, DL));
12767 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
12768 DAG.getIntPtrConstant(2, DL));
12772 // Try to use SHUF128 if possible.
12773 if (Subtarget.hasVLX()) {
12774 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
12775 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
12776 ((WidenedMask[1] % 2) << 1);
12777 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
12778 DAG.getConstant(PermMask, DL, MVT::i8));
12783 // Otherwise form a 128-bit permutation. After accounting for undefs,
12784 // convert the 64-bit shuffle mask selection values into 128-bit
12785 // selection bits by dividing the indexes by 2 and shifting into positions
12786 // defined by a vperm2*128 instruction's immediate control byte.
12788 // The immediate permute control byte looks like this:
12789 // [1:0] - select 128 bits from sources for low half of destination
12790 // [2]   - ignore
12791 // [3]   - zero low half of destination
12792 // [5:4] - select 128 bits from sources for high half of destination
12793 // [6]   - ignore
12794 // [7]   - zero high half of destination
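// For example, a widened mask of <1, 2> (high half of V1, low half of V2)
// encodes as PermMask = (1 << 0) | (2 << 4) = 0x21, while a zeroable low
// half is encoded as 0x08 in place of the source select bits.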
12796 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12798 unsigned PermMask = 0;
12799 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
12800 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12802 // Check the immediate mask and replace unused sources with undef.
12803 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
12804 V1 = DAG.getUNDEF(VT);
12805 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
12806 V2 = DAG.getUNDEF(VT);
12808 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12809 DAG.getConstant(PermMask, DL, MVT::i8));
12812 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12813 /// shuffling each lane.
12815 /// This will only succeed when the result of fixing the 128-bit lanes results
12816 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12817 /// each 128-bit lane. This handles many cases where we can quickly blend away
12818 /// the lane crosses early and then use simpler shuffles within each lane.
12820 /// FIXME: It might be worthwhile at some point to support this without
12821 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12822 /// in x86 only floating point has interesting non-repeating shuffles, and even
12823 /// those are still *marginally* more expensive.
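/// For example, the v8f32 mask <5,4,7,6,13,12,15,14> can first be resolved
/// with a v4f64 lane shuffle using mask <2,3,6,7> (V1's high lane into the
/// low half, V2's high lane into the high half), after which the repeating
/// in-lane mask <1,0,3,2,5,4,7,6> finishes the job without crossing lanes.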
12824 static SDValue lowerVectorShuffleByMerging128BitLanes(
12825 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12826 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12827 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12829 int Size = Mask.size();
12830 int LaneSize = 128 / VT.getScalarSizeInBits();
12831 int NumLanes = Size / LaneSize;
12832 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12834 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12835 // check whether the in-128-bit lane shuffles share a repeating pattern.
12836 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12837 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12838 for (int i = 0; i < Size; ++i) {
12842 int j = i / LaneSize;
12844 if (Lanes[j] < 0) {
12845 // First entry we've seen for this lane.
12846 Lanes[j] = Mask[i] / LaneSize;
12847 } else if (Lanes[j] != Mask[i] / LaneSize) {
12848 // This doesn't match the lane selected previously!
12852 // Check that within each lane we have a consistent shuffle mask.
12853 int k = i % LaneSize;
12854 if (InLaneMask[k] < 0) {
12855 InLaneMask[k] = Mask[i] % LaneSize;
12856 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12857 // This doesn't fit a repeating in-lane mask.
12862 // First shuffle the lanes into place.
12863 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12864 VT.getSizeInBits() / 64);
12865 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12866 for (int i = 0; i < NumLanes; ++i)
12867 if (Lanes[i] >= 0) {
12868 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12869 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12872 V1 = DAG.getBitcast(LaneVT, V1);
12873 V2 = DAG.getBitcast(LaneVT, V2);
12874 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12876 // Cast it back to the type we actually want.
12877 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12879 // Now do a simple shuffle that isn't lane crossing.
12880 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12881 for (int i = 0; i < Size; ++i)
12883 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12884 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12885 "Must not introduce lane crosses at this point!");
12887 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12890 /// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
12891 /// This allows for fast cases such as subvector extraction/insertion
12892 /// or shuffling smaller vector types which can lower more efficiently.
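/// For example, v8f32 <4,5,6,7,u,u,u,u> becomes an extract of V1's upper
/// 128 bits reinserted at element 0, and <u,u,u,u,0,1,2,3> becomes an insert
/// of V1's lower 128 bits into the upper half of an undef vector.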
12893 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12894 SDValue V1, SDValue V2,
12895 ArrayRef<int> Mask,
12896 const X86Subtarget &Subtarget,
12897 SelectionDAG &DAG) {
12898 assert((VT.is256BitVector() || VT.is512BitVector()) &&
12899 "Expected 256-bit or 512-bit vector");
12901 unsigned NumElts = VT.getVectorNumElements();
12902 unsigned HalfNumElts = NumElts / 2;
12903 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12905 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12906 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12907 if (!UndefLower && !UndefUpper)
12910 // Upper half is undef and lower half is whole upper subvector.
12911 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12913 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12914 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12915 DAG.getIntPtrConstant(HalfNumElts, DL));
12916 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12917 DAG.getIntPtrConstant(0, DL));
12920 // Lower half is undef and upper half is whole lower subvector.
12921 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12923 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12924 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12925 DAG.getIntPtrConstant(0, DL));
12926 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12927 DAG.getIntPtrConstant(HalfNumElts, DL));
12930 // If the shuffle only uses two of the four halves of the input operands,
12931 // then extract them and perform the 'half' shuffle at half width.
12932 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12933 int HalfIdx1 = -1, HalfIdx2 = -1;
12934 SmallVector<int, 8> HalfMask(HalfNumElts);
12935 unsigned Offset = UndefLower ? HalfNumElts : 0;
12936 for (unsigned i = 0; i != HalfNumElts; ++i) {
12937 int M = Mask[i + Offset];
12943 // Determine which of the 4 half vectors this element is from.
12944 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12945 int HalfIdx = M / HalfNumElts;
12947 // Determine the element index into its half vector source.
12948 int HalfElt = M % HalfNumElts;
12950 // We can shuffle with up to 2 half vectors; set the new 'half'
12951 // shuffle mask accordingly.
12952 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12953 HalfMask[i] = HalfElt;
12954 HalfIdx1 = HalfIdx;
12957 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12958 HalfMask[i] = HalfElt + HalfNumElts;
12959 HalfIdx2 = HalfIdx;
12963 // Too many half vectors referenced.
12966 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12968 // Only shuffle the halves of the inputs when useful.
12969 int NumLowerHalves =
12970 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12971 int NumUpperHalves =
12972 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12974 // uuuuXXXX - don't extract uppers just to insert again.
12975 if (UndefLower && NumUpperHalves != 0)
12978 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12979 if (UndefUpper && NumUpperHalves == 2)
12982 // AVX2 - XXXXuuuu - always extract lowers.
12983 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12984 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12985 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12987 // AVX2 supports variable 32-bit element cross-lane shuffles.
12988 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12989 // XXXXuuuu - don't extract lowers and uppers.
12990 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12995 // AVX512 - XXXXuuuu - always extract lowers.
12996 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
12999 auto GetHalfVector = [&](int HalfIdx) {
13001 return DAG.getUNDEF(HalfVT);
13002 SDValue V = (HalfIdx < 2 ? V1 : V2);
13003 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13004 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13005 DAG.getIntPtrConstant(HalfIdx, DL));
13008 SDValue Half1 = GetHalfVector(HalfIdx1);
13009 SDValue Half2 = GetHalfVector(HalfIdx2);
13010 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13011 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13012 DAG.getIntPtrConstant(Offset, DL));
13015 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
13016 /// given mask.
13018 /// This returns true if the elements from a particular input are already in the
13019 /// slot required by the given mask and require no permutation.
13020 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13021 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13022 int Size = Mask.size();
13023 for (int i = 0; i < Size; ++i)
13024 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13030 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13031 /// every lane can be represented as the same repeating mask - allowing us to
13032 /// shuffle the sources with the repeating shuffle and then permute the result
13033 /// to the destination lanes.
13034 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13035 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13036 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13037 int NumElts = VT.getVectorNumElements();
13038 int NumLanes = VT.getSizeInBits() / 128;
13039 int NumLaneElts = NumElts / NumLanes;
13041 // On AVX2 we may be able to just shuffle the lowest elements and then
13042 // broadcast the result.
13043 if (Subtarget.hasAVX2()) {
13044 for (unsigned BroadcastSize : {16, 32, 64}) {
13045 if (BroadcastSize <= VT.getScalarSizeInBits())
13047 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13049 // Attempt to match a repeating pattern every NumBroadcastElts,
13050 // accounting for UNDEFs, but only referencing the lowest 128-bit
13051 // lane of the inputs.
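// For example, the v8i32 mask <1,0,1,0,1,0,1,0> first swaps the two lowest
// elements in place and then broadcasts the resulting low 64-bit element to
// every lane.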
13052 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13053 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13054 for (int j = 0; j != NumBroadcastElts; ++j) {
13055 int M = Mask[i + j];
13058 int &R = RepeatMask[j];
13059 if (0 != ((M % NumElts) / NumLaneElts))
13061 if (0 <= R && R != M)
13068 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13069 if (!FindRepeatingBroadcastMask(RepeatMask))
13072 // Shuffle the (lowest) repeated elements in place for broadcast.
13073 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13075 // Shuffle the actual broadcast.
13076 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13077 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13078 for (int j = 0; j != NumBroadcastElts; ++j)
13079 BroadcastMask[i + j] = j;
13080 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13085 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13086 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13089 // Bail if we already have a repeated lane shuffle mask.
13090 SmallVector<int, 8> RepeatedShuffleMask;
13091 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13094 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13095 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13096 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13097 int NumSubLanes = NumLanes * SubLaneScale;
13098 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13100 // Check that all the sources are coming from the same lane and see if we can
13101 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13102 // determine the source sub-lane for each destination sub-lane.
13103 int TopSrcSubLane = -1;
13104 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13105 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13106 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13107 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13109 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13110 // Extract the sub-lane mask, check that it all comes from the same lane
13111 // and normalize the mask entries to come from the first lane.
13113 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13114 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13115 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13118 int Lane = (M % NumElts) / NumLaneElts;
13119 if ((0 <= SrcLane) && (SrcLane != Lane))
13122 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13123 SubLaneMask[Elt] = LocalM;
13126 // Whole sub-lane is UNDEF.
13130 // Attempt to match against the candidate repeated sub-lane masks.
13131 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13132 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13133 for (int i = 0; i != NumSubLaneElts; ++i) {
13134 if (M1[i] < 0 || M2[i] < 0)
13136 if (M1[i] != M2[i])
13142 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13143 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13146 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13147 for (int i = 0; i != NumSubLaneElts; ++i) {
13148 int M = SubLaneMask[i];
13151 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13152 "Unexpected mask element");
13153 RepeatedSubLaneMask[i] = M;
13156 // Track the topmost source sub-lane - by setting the remaining to UNDEF
13157 // we can greatly simplify shuffle matching.
13158 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13159 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13160 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13164 // Bail if we failed to find a matching repeated sub-lane mask.
13165 if (Dst2SrcSubLanes[DstSubLane] < 0)
13168 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13169 "Unexpected source lane");
13171 // Create a repeating shuffle mask for the entire vector.
13172 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13173 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13174 int Lane = SubLane / SubLaneScale;
13175 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13176 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13177 int M = RepeatedSubLaneMask[Elt];
13180 int Idx = (SubLane * NumSubLaneElts) + Elt;
13181 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13184 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13186 // Shuffle each source sub-lane to its destination.
13187 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13188 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13189 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13190 if (SrcSubLane < 0)
13192 for (int j = 0; j != NumSubLaneElts; ++j)
13193 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13196 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13200 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13201 unsigned &ShuffleImm,
13202 ArrayRef<int> Mask) {
13203 int NumElts = VT.getVectorNumElements();
13204 assert(VT.getScalarSizeInBits() == 64 &&
13205 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13206 "Unexpected data type for VSHUFPD");
13208 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
13209 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
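// For example, the v4f64 mask <1,5,2,6> matches the first pattern and yields
// ShuffleImm == 0b0011, since bit i simply holds Mask[i] & 1.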
13211 bool ShufpdMask = true;
13212 bool CommutableMask = true;
13213 for (int i = 0; i < NumElts; ++i) {
13214 if (Mask[i] == SM_SentinelUndef)
13218 int Val = (i & 6) + NumElts * (i & 1);
13219 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13220 if (Mask[i] < Val || Mask[i] > Val + 1)
13221 ShufpdMask = false;
13222 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13223 CommutableMask = false;
13224 ShuffleImm |= (Mask[i] % 2) << i;
13229 if (CommutableMask) {
13237 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13238 ArrayRef<int> Mask, SDValue V1,
13239 SDValue V2, SelectionDAG &DAG) {
13240 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13241 "Unexpected data type for VSHUFPD");
13243 unsigned Immediate = 0;
13244 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13247 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13248 DAG.getConstant(Immediate, DL, MVT::i8));
13251 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
13253 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13254 /// isn't available.
13255 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13256 const APInt &Zeroable,
13257 SDValue V1, SDValue V2,
13258 const X86Subtarget &Subtarget,
13259 SelectionDAG &DAG) {
13260 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13261 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13262 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13264 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13265 Zeroable, Subtarget, DAG))
13268 if (V2.isUndef()) {
13269 // Check for being able to broadcast a single element.
13270 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13271 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13274 // Use low duplicate instructions for masks that match their pattern.
13275 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13276 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13278 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13279 // Non-half-crossing single input shuffles can be lowered with an
13280 // interleaved permutation.
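// For example, the in-lane mask <1,0,3,2> produces VPERMILPMask == 0b0101
// (bit i is set when element i takes the high element of its 128-bit pair).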
13281 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13282 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13283 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13284 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13287 // With AVX2 we have direct support for this permutation.
13288 if (Subtarget.hasAVX2())
13289 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13290 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13292 // Try to create an in-lane repeating shuffle mask and then shuffle the
13293 // results into the target lanes.
13294 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13295 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13298 // Otherwise, fall back.
13299 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13303 // Use dedicated unpack instructions for masks that match their pattern.
13305 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13308 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13309 Zeroable, Subtarget, DAG))
13312 // Check if the blend happens to exactly fit that of SHUFPD.
13314 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13317 // Try to create an in-lane repeating shuffle mask and then shuffle the
13318 // results into the target lanes.
13319 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13320 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13323 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13324 // shuffle. However, if we have AVX2 and either input is already in place,
13325 // we will be able to shuffle the other input across lanes in a single
13326 // instruction, so skip this pattern.
13327 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13328 isShuffleMaskInputInPlace(1, Mask))))
13329 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13330 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13332 // If we have VLX support, we can use VEXPAND.
13333 if (Subtarget.hasVLX())
13334 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13335 V1, V2, DAG, Subtarget))
13338 // If we have AVX2 then we always want to lower with a blend because at v4 we
13339 // can fully permute the elements.
13340 if (Subtarget.hasAVX2())
13341 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13344 // Otherwise fall back on generic lowering.
13345 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13348 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13350 /// This routine is only called when we have AVX2 and thus a reasonable
13351 /// instruction set for v4i64 shuffling.
13352 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13353 const APInt &Zeroable,
13354 SDValue V1, SDValue V2,
13355 const X86Subtarget &Subtarget,
13356 SelectionDAG &DAG) {
13357 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13358 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13359 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13360 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13362 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13363 Zeroable, Subtarget, DAG))
13366 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13367 Zeroable, Subtarget, DAG))
13370 // Check for being able to broadcast a single element.
13371 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13372 Mask, Subtarget, DAG))
13375 if (V2.isUndef()) {
13376 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13377 // can use lower latency instructions that will operate on both lanes.
13378 SmallVector<int, 2> RepeatedMask;
13379 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13380 SmallVector<int, 4> PSHUFDMask;
13381 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13382 return DAG.getBitcast(
13384 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13385 DAG.getBitcast(MVT::v8i32, V1),
13386 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13389 // AVX2 provides a direct instruction for permuting a single input across
13390 // lanes.
13391 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13392 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13395 // Try to use shift instructions.
13396 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13397 Zeroable, Subtarget, DAG))
13400 // If we have VLX support, we can use VALIGN or VEXPAND.
13401 if (Subtarget.hasVLX()) {
13402 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13403 Mask, Subtarget, DAG))
13406 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13407 V1, V2, DAG, Subtarget))
13411 // Try to use PALIGNR.
13412 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13413 Mask, Subtarget, DAG))
13416 // Use dedicated unpack instructions for masks that match their pattern.
13418 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13421 // Try to create an in-lane repeating shuffle mask and then shuffle the
13422 // results into the target lanes.
13423 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13424 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13427 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13428 // shuffle. However, if we have AVX2 and either input is already in place,
13429 // we will be able to shuffle the other input across lanes in a single
13430 // instruction, so skip this pattern.
13431 if (!isShuffleMaskInputInPlace(0, Mask) &&
13432 !isShuffleMaskInputInPlace(1, Mask))
13433 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13434 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13437 // Otherwise fall back on generic blend lowering.
13438 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13442 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13444 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13445 /// isn't available.
13446 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13447 const APInt &Zeroable,
13448 SDValue V1, SDValue V2,
13449 const X86Subtarget &Subtarget,
13450 SelectionDAG &DAG) {
13451 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13452 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13453 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13455 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13456 Zeroable, Subtarget, DAG))
13459 // Check for being able to broadcast a single element.
13460 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13461 Mask, Subtarget, DAG))
13464 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13465 // options to efficiently lower the shuffle.
13466 SmallVector<int, 4> RepeatedMask;
13467 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13468 assert(RepeatedMask.size() == 4 &&
13469 "Repeated masks must be half the mask width!");
13471 // Use even/odd duplicate instructions for masks that match their pattern.
13472 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13473 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13474 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13475 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13478 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13479 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13481 // Use dedicated unpack instructions for masks that match their pattern.
13483 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13486 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13487 // have already handled any direct blends.
13488 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13491 // Try to create an in-lane repeating shuffle mask and then shuffle the
13492 // results into the target lanes.
13493 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13494 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13497 // If we have a single input shuffle with different shuffle patterns in the
13498 // two 128-bit lanes, use a variable mask with VPERMILPS.
13499 if (V2.isUndef()) {
13500 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13501 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13502 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13504 if (Subtarget.hasAVX2())
13505 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13507 // Otherwise, fall back.
13508 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13512 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13513 // shuffle.
13514 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13515 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13517 // If we have VLX support, we can use VEXPAND.
13518 if (Subtarget.hasVLX())
13519 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13520 V1, V2, DAG, Subtarget))
13523 // For non-AVX512, if the mask is an in-lane unpack of 16-bit elements, try to
13524 // split, since after the split we get more efficient code using the vpunpcklwd
13525 // and vpunpckhwd instructions than with vblend.
13526 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13527 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13531 // If we have AVX2 then we always want to lower with a blend because at v8 we
13532 // can fully permute the elements.
13533 if (Subtarget.hasAVX2())
13534 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13537 // Otherwise fall back on generic lowering.
13538 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13541 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13543 /// This routine is only called when we have AVX2 and thus a reasonable
13544 /// instruction set for v8i32 shuffling.
13545 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13546 const APInt &Zeroable,
13547 SDValue V1, SDValue V2,
13548 const X86Subtarget &Subtarget,
13549 SelectionDAG &DAG) {
13550 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13551 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13552 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13553 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13555 // Whenever we can lower this as a zext, that instruction is strictly faster
13556 // than any alternative. It also allows us to fold memory operands into the
13557 // shuffle in many cases.
13558 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13559 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13562 // For non-AVX512, if the mask is an in-lane unpack of 16-bit elements, try to
13563 // split, since after the split we get more efficient code than vblend by using
13564 // the vpunpcklwd and vpunpckhwd instructions.
13565 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13566 !Subtarget.hasAVX512())
13568 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13571 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13572 Zeroable, Subtarget, DAG))
13575 // Check for being able to broadcast a single element.
13576 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13577 Mask, Subtarget, DAG))
13580 // If the shuffle mask is repeated in each 128-bit lane we can use more
13581 // efficient instructions that mirror the shuffles across the two 128-bit
13582 // lanes.
13583 SmallVector<int, 4> RepeatedMask;
13584 bool Is128BitLaneRepeatedShuffle =
13585 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13586 if (Is128BitLaneRepeatedShuffle) {
13587 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13589 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13590 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13592 // Use dedicated unpack instructions for masks that match their pattern.
13594 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13598 // Try to use shift instructions.
13599 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13600 Zeroable, Subtarget, DAG))
13603 // If we have VLX support, we can use VALIGN or EXPAND.
13604 if (Subtarget.hasVLX()) {
13605 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13606 Mask, Subtarget, DAG))
13609 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13610 V1, V2, DAG, Subtarget))
13614 // Try to use byte rotation instructions.
13615 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13616 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13619 // Try to create an in-lane repeating shuffle mask and then shuffle the
13620 // results into the target lanes.
13621 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13622 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13625 // If the shuffle patterns aren't repeated but it is a single input, directly
13626 // generate a cross-lane VPERMD instruction.
13627 if (V2.isUndef()) {
13628 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13629 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13632 // Assume that a single SHUFPS is faster than an alternative sequence of
13633 // multiple instructions (even if the CPU has a domain penalty).
13634 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13635 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13636 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13637 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13638 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13639 CastV1, CastV2, DAG);
13640 return DAG.getBitcast(MVT::v8i32, ShufPS);
13643 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13644 // shuffle.
13645 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13646 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13649 // Otherwise fall back on generic blend lowering.
13650 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13654 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13656 /// This routine is only called when we have AVX2 and thus a reasonable
13657 /// instruction set for v16i16 shuffling.
13658 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13659 const APInt &Zeroable,
13660 SDValue V1, SDValue V2,
13661 const X86Subtarget &Subtarget,
13662 SelectionDAG &DAG) {
13663 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13664 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13665 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13666 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13668 // Whenever we can lower this as a zext, that instruction is strictly faster
13669 // than any alternative. It also allows us to fold memory operands into the
13670 // shuffle in many cases.
13671 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13672 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13675 // Check for being able to broadcast a single element.
13676 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13677 Mask, Subtarget, DAG))
13680 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13681 Zeroable, Subtarget, DAG))
13684 // Use dedicated unpack instructions for masks that match their pattern.
13686 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13689 // Use dedicated pack instructions for masks that match their pattern.
13690 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13694 // Try to use shift instructions.
13695 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13696 Zeroable, Subtarget, DAG))
13699 // Try to use byte rotation instructions.
13700 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13701 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13704 // Try to create an in-lane repeating shuffle mask and then shuffle the
13705 // results into the target lanes.
13706 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13707 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13710 if (V2.isUndef()) {
13711 // There are no generalized cross-lane shuffle operations available on i16
13712 // element types.
13713 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13714 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13715 Mask, DAG, Subtarget);
13717 SmallVector<int, 8> RepeatedMask;
13718 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13719 // As this is a single-input shuffle, the repeated mask should be
13720 // a strictly valid v8i16 mask that we can pass through to the v8i16
13721 // lowering to handle even the v16 case.
13722 return lowerV8I16GeneralSingleInputVectorShuffle(
13723 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13727 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13728 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13731 // AVX512BWVL can lower to VPERMW.
13732 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13733 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13735 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13736 // shuffle.
13737 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13738 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13741 // Otherwise fall back on generic lowering.
13742 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13745 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13747 /// This routine is only called when we have AVX2 and thus a reasonable
13748 /// instruction set for v32i8 shuffling.
13749 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13750 const APInt &Zeroable,
13751 SDValue V1, SDValue V2,
13752 const X86Subtarget &Subtarget,
13753 SelectionDAG &DAG) {
13754 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13755 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13756 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13757 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13759 // Whenever we can lower this as a zext, that instruction is strictly faster
13760 // than any alternative. It also allows us to fold memory operands into the
13761 // shuffle in many cases.
13762 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13763 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13766 // Check for being able to broadcast a single element.
13767 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13768 Mask, Subtarget, DAG))
13771 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13772 Zeroable, Subtarget, DAG))
13775 // Use dedicated unpack instructions for masks that match their pattern.
13777 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13780 // Use dedicated pack instructions for masks that match their pattern.
13781 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
13785 // Try to use shift instructions.
13786 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13787 Zeroable, Subtarget, DAG))
13790 // Try to use byte rotation instructions.
13791 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13792 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13795 // Try to create an in-lane repeating shuffle mask and then shuffle the
13796 // results into the target lanes.
13797 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13798 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13801 // There are no generalized cross-lane shuffle operations available on i8
13802 // element types.
13803 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13804 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13807 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13808 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13811 // AVX512VBMIVL can lower to VPERMB.
13812 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
13813 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
13815 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13816 // shuffle.
13817 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13818 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13821 // Otherwise fall back on generic lowering.
13822 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13825 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13827 /// This routine either breaks down the specific type of a 256-bit x86 vector
13828 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13829 /// together based on the available instructions.
13830 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13831 MVT VT, SDValue V1, SDValue V2,
13832 const APInt &Zeroable,
13833 const X86Subtarget &Subtarget,
13834 SelectionDAG &DAG) {
13835 // If we have a single input to the zero element, insert that into V1 if we
13836 // can do so cheaply.
13837 int NumElts = VT.getVectorNumElements();
13838 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13840 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13841 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13842 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13845 // Handle special cases where the lower or upper half is UNDEF.
13847 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13850 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13851 // can check for those subtargets here and avoid much of the subtarget
13852 // querying in the per-vector-type lowering routines. With AVX1 we have
13853 // essentially *zero* ability to manipulate a 256-bit vector with integer
13854 // types. Since we'll use floating point types there eventually, just
13855 // immediately cast everything to a float and operate entirely in that domain.
13856 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13857 int ElementBits = VT.getScalarSizeInBits();
13858 if (ElementBits < 32) {
13859 // No floating point type is available; if we can't use the bit operations
13860 // for masking/blending then decompose into 128-bit vectors.
13862 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13864 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13866 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13869 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13870 VT.getVectorNumElements());
13871 V1 = DAG.getBitcast(FpVT, V1);
13872 V2 = DAG.getBitcast(FpVT, V2);
13873 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13876 switch (VT.SimpleTy) {
13878 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13880 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13882 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13884 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13886 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13888 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13891 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13895 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13896 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13897 ArrayRef<int> Mask,
13898 const APInt &Zeroable,
13899 SDValue V1, SDValue V2,
13900 const X86Subtarget &Subtarget,
13901 SelectionDAG &DAG) {
13902 assert(VT.getScalarSizeInBits() == 64 &&
13903 "Unexpected element type size for 128bit shuffle.");
13905 // Handling a 256-bit vector would require VLX, and lowerV2X128VectorShuffle()
13906 // is most probably the better solution for that case.
13907 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13909 SmallVector<int, 4> WidenedMask;
13910 if (!canWidenShuffleElements(Mask, WidenedMask))
13913 // Try to use an insert into a zero vector.
13914 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
13915 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
13916 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
13917 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
13918 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13919 DAG.getIntPtrConstant(0, DL));
13920 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13921 getZeroVector(VT, Subtarget, DAG, DL), LoV,
13922 DAG.getIntPtrConstant(0, DL));
13925 // Check for patterns which can be matched with a single insert of a 256-bit
13926 // subvector.
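// For example, the 8-element mask {0, 1, 2, 3, 8, 9, 10, 11} keeps the low
// 256 bits of V1 and places the low 256 bits of V2 in the upper half, which
// is a single 256-bit subvector insert.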
13927 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13928 {0, 1, 2, 3, 0, 1, 2, 3});
13929 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13930 {0, 1, 2, 3, 8, 9, 10, 11})) {
13931 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13932 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13933 OnlyUsesV1 ? V1 : V2,
13934 DAG.getIntPtrConstant(0, DL));
13935 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13936 DAG.getIntPtrConstant(4, DL));
13939 assert(WidenedMask.size() == 4);
13941 // See if this is an insertion of the lower 128-bits of V2 into V1.
13942 bool IsInsert = true;
13944 for (int i = 0; i < 4; ++i) {
13945 assert(WidenedMask[i] >= -1);
13946 if (WidenedMask[i] < 0)
13949 // Make sure all V1 subvectors are in place.
13950 if (WidenedMask[i] < 4) {
13951 if (WidenedMask[i] != i) {
13956 // Make sure we only have a single V2 index and that it is the lowest 128 bits.
13957 if (V2Index >= 0 || WidenedMask[i] != 4) {
13964 if (IsInsert && V2Index >= 0) {
13965 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13966 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13967 DAG.getIntPtrConstant(0, DL));
13968 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13971 // Try to lower to vshuf64x2/vshuf32x4.
13972 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13973 unsigned PermMask = 0;
13974 // Ensure all elements come from the same Op.
13975 for (int i = 0; i < 4; ++i) {
13976 assert(WidenedMask[i] >= -1);
13977 if (WidenedMask[i] < 0)
13980 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13981 unsigned OpIndex = i / 2;
13982 if (Ops[OpIndex].isUndef())
13984 else if (Ops[OpIndex] != Op)
13987 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13988 // bits defined by a vshuf64x2 instruction's immediate control byte.
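// For example, a widened mask {0, 1, 6, 7} takes 128-bit lanes 0 and 1 from
// V1 and lanes 2 and 3 from V2, giving PermMask =
// 0 | (1 << 2) | (2 << 4) | (3 << 6) = 0xE4.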
13989 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13992 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13993 DAG.getConstant(PermMask, DL, MVT::i8));
13996 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13997 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13998 const APInt &Zeroable,
13999 SDValue V1, SDValue V2,
14000 const X86Subtarget &Subtarget,
14001 SelectionDAG &DAG) {
14002 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14003 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14004 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14006 if (V2.isUndef()) {
14007 // Use low duplicate instructions for masks that match their pattern.
14008 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
14009 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
14011 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
14012 // Non-half-crossing single input shuffles can be lowered with an
14013 // interleaved permutation.
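// For example, the mask {1, 0, 3, 2, 5, 4, 7, 6} (swapping the two elements
// within each 128-bit lane) produces the VPERMILPD immediate 0b01010101 = 0x55.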
14014 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14015 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
14016 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
14017 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
14018 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
14019 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14022 SmallVector<int, 4> RepeatedMask;
14023 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
14024 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
14025 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14028 if (SDValue Shuf128 =
14029 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
14033 if (SDValue Unpck =
14034 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
14037 // Check if the blend happens to exactly fit that of SHUFPD.
14039 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
14042 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
14043 V2, DAG, Subtarget))
14046 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
14047 Zeroable, Subtarget, DAG))
14050 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
14053 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
14054 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14055 const APInt &Zeroable,
14056 SDValue V1, SDValue V2,
14057 const X86Subtarget &Subtarget,
14058 SelectionDAG &DAG) {
14059 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14060 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14061 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14063 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14064 // options to efficiently lower the shuffle.
14065 SmallVector<int, 4> RepeatedMask;
14066 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
14067 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14069 // Use even/odd duplicate instructions for masks that match their pattern.
14070 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14071 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
14072 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14073 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
14076 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
14077 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14079 // Use dedicated unpack instructions for masks that match their pattern.
14080 if (SDValue Unpck =
14081 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
14084 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
14085 Zeroable, Subtarget, DAG))
14088 // Otherwise, fall back to a SHUFPS sequence.
14089 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
14092 // If we have a single-input shuffle with different shuffle patterns in the
14093 // 128-bit lanes that doesn't cross lanes, use a variable-mask VPERMILPS.
14094 if (V2.isUndef() &&
14095 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
14096 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
14097 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
14100 // If we have AVX512F support, we can use VEXPAND.
14101 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
14102 V1, V2, DAG, Subtarget))
14105 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
14108 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
14109 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14110 const APInt &Zeroable,
14111 SDValue V1, SDValue V2,
14112 const X86Subtarget &Subtarget,
14113 SelectionDAG &DAG) {
14114 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14115 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14116 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14118 if (V2.isUndef()) {
14119 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14120 // can use lower latency instructions that will operate on all four
14121 // 128-bit lanes.
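// For example, a v8i64 mask repeating {1, 0} in every 128-bit lane is scaled
// by 2 into the v16i32 PSHUFD mask {2, 3, 0, 1}, which a single PSHUFD applies
// to all lanes.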
14122 SmallVector<int, 2> Repeated128Mask;
14123 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
14124 SmallVector<int, 4> PSHUFDMask;
14125 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
14126 return DAG.getBitcast(
14128 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14129 DAG.getBitcast(MVT::v16i32, V1),
14130 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14133 SmallVector<int, 4> Repeated256Mask;
14134 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14135 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14136 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14139 if (SDValue Shuf128 =
14140 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
14141 V1, V2, Subtarget, DAG))
14144 // Try to use shift instructions.
14145 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14146 Zeroable, Subtarget, DAG))
14149 // Try to use VALIGN.
14150 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14151 Mask, Subtarget, DAG))
14154 // Try to use PALIGNR.
14155 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14156 Mask, Subtarget, DAG))
14159 if (SDValue Unpck =
14160 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14162 // If we have AVX512F support, we can use VEXPAND.
14163 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14164 V2, DAG, Subtarget))
14167 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14168 Zeroable, Subtarget, DAG))
14171 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14174 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
14175 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14176 const APInt &Zeroable,
14177 SDValue V1, SDValue V2,
14178 const X86Subtarget &Subtarget,
14179 SelectionDAG &DAG) {
14180 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14181 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14182 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14184 // Whenever we can lower this as a zext, that instruction is strictly faster
14185 // than any alternative. It also allows us to fold memory operands into the
14186 // shuffle in many cases.
14187 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14188 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14191 // If the shuffle mask is repeated in each 128-bit lane we can use more
14192 // efficient instructions that mirror the shuffles across the four 128-bit
14193 // lanes.
14194 SmallVector<int, 4> RepeatedMask;
14195 bool Is128BitLaneRepeatedShuffle =
14196 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14197 if (Is128BitLaneRepeatedShuffle) {
14198 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14200 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14201 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14203 // Use dedicated unpack instructions for masks that match their pattern.
14205 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14209 // Try to use shift instructions.
14210 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14211 Zeroable, Subtarget, DAG))
14214 // Try to use VALIGN.
14215 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14216 Mask, Subtarget, DAG))
14219 // Try to use byte rotation instructions.
14220 if (Subtarget.hasBWI())
14221 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14222 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14225 // Assume that a single SHUFPS is faster than using a permv shuffle.
14226 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14227 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14228 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14229 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14230 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14231 CastV1, CastV2, DAG);
14232 return DAG.getBitcast(MVT::v16i32, ShufPS);
14234 // If we have AVX512F support, we can use VEXPAND.
14235 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14236 V1, V2, DAG, Subtarget))
14239 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14240 Zeroable, Subtarget, DAG))
14242 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14245 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
14246 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14247 const APInt &Zeroable,
14248 SDValue V1, SDValue V2,
14249 const X86Subtarget &Subtarget,
14250 SelectionDAG &DAG) {
14251 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14252 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14253 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14254 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14256 // Whenever we can lower this as a zext, that instruction is strictly faster
14257 // than any alternative. It also allows us to fold memory operands into the
14258 // shuffle in many cases.
14259 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14260 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14263 // Use dedicated unpack instructions for masks that match their pattern.
14265 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14268 // Try to use shift instructions.
14269 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14270 Zeroable, Subtarget, DAG))
14273 // Try to use byte rotation instructions.
14274 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14275 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14278 if (V2.isUndef()) {
14279 SmallVector<int, 8> RepeatedMask;
14280 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14281 // As this is a single-input shuffle, the repeated mask should be
14282 // a strictly valid v8i16 mask that we can pass through to the v8i16
14283 // lowering to handle even the v32 case.
14284 return lowerV8I16GeneralSingleInputVectorShuffle(
14285 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14289 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14290 Zeroable, Subtarget, DAG))
14293 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14294 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14297 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14300 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14301 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14302 const APInt &Zeroable,
14303 SDValue V1, SDValue V2,
14304 const X86Subtarget &Subtarget,
14305 SelectionDAG &DAG) {
14306 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14307 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14308 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14309 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14311 // Whenever we can lower this as a zext, that instruction is strictly faster
14312 // than any alternative. It also allows us to fold memory operands into the
14313 // shuffle in many cases.
14314 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14315 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14318 // Use dedicated unpack instructions for masks that match their pattern.
14320 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14323 // Try to use shift instructions.
14324 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14325 Zeroable, Subtarget, DAG))
14328 // Try to use byte rotation instructions.
14329 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14330 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14333 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14334 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14337 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14338 if (Subtarget.hasVBMI())
14339 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14341 // Try to create an in-lane repeating shuffle mask and then shuffle the
14342 // the results into the target lanes.
14343 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14344 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14347 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14348 Zeroable, Subtarget, DAG))
14351 // FIXME: Implement direct support for this type!
14352 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14355 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14357 /// This routine either breaks down the specific type of a 512-bit x86 vector
14358 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14359 /// together based on the available instructions.
14360 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14361 MVT VT, SDValue V1, SDValue V2,
14362 const APInt &Zeroable,
14363 const X86Subtarget &Subtarget,
14364 SelectionDAG &DAG) {
14365 assert(Subtarget.hasAVX512() &&
14366 "Cannot lower 512-bit vectors w/ basic ISA!");
14368 // If we have a single input to the zero element, insert that into V1 if we
14369 // can do so cheaply.
14370 int NumElts = Mask.size();
14371 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14373 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14374 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14375 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14378 // Handle special cases where the lower or upper half is UNDEF.
14380 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14383 // Check for being able to broadcast a single element.
14384 if (SDValue Broadcast =
14385 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14388 // Dispatch to each element type for lowering. If we don't have support for
14389 // specific element type shuffles at 512 bits, immediately split them and
14390 // lower them. Each lowering routine of a given type is allowed to assume that
14391 // the requisite ISA extensions for that element type are available.
14392 switch (VT.SimpleTy) {
14394 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14396 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14398 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14400 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14402 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14404 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14407 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14411 // Lower vXi1 vector shuffles.
14412 // There is no dedicated instruction on AVX-512 that shuffles the masks.
14413 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
14414 // vector, shuffle, and then truncate it back.
14415 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14416 MVT VT, SDValue V1, SDValue V2,
14417 const APInt &Zeroable,
14418 const X86Subtarget &Subtarget,
14419 SelectionDAG &DAG) {
14420 unsigned NumElts = Mask.size();
14422 // Try to recognize shuffles that are just padding a subvector with zeros.
14423 unsigned SubvecElts = 0;
14424 for (int i = 0; i != (int)NumElts; ++i) {
14425 if (Mask[i] >= 0 && Mask[i] != i)
14430 assert(SubvecElts != NumElts && "Identity shuffle?");
14432 // Clip to a power of 2.
14433 SubvecElts = PowerOf2Floor(SubvecElts);
14435 // Make sure the number of zeroable bits in the top at least covers the bits
14436 // not covered by the subvector.
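// For example, a v16i1 shuffle whose mask is {0, 1, 2, 3} followed by twelve
// zeroable elements extracts the low v4i1 subvector and inserts it into a
// zero vector below.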
14437 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
14438 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
14439 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
14440 V1, DAG.getIntPtrConstant(0, DL));
14441 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14442 getZeroVector(VT, Subtarget, DAG, DL),
14443 Extract, DAG.getIntPtrConstant(0, DL));
14447 assert(Subtarget.hasAVX512() &&
14448 "Cannot lower 512-bit vectors w/o basic ISA!");
14450 switch (VT.SimpleTy) {
14452 llvm_unreachable("Expected a vector of i1 elements");
14454 ExtVT = MVT::v2i64;
14457 ExtVT = MVT::v4i32;
14460 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit
14462 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14465 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14466 // 256-bit operation available.
14467 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
14470 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14471 // 256-bit operation available.
14472 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
14473 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
14476 ExtVT = MVT::v64i8;
14480 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14481 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14483 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14484 // Since i1 was sign extended we can rebuild the mask by comparing against zero.
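// The compare immediate 6 below is the 'not less-or-equal' (signed
// greater-than) predicate of the VPCMP encoding, so comparing zero against
// the shuffled vector is true exactly for the lanes whose sign bit is set.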
14485 int NumElems = VT.getVectorNumElements();
14486 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14487 (Subtarget.hasDQI() && (NumElems < 32)))
14488 return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, ExtVT),
14489 Shuffle, DAG.getConstant(6, DL, MVT::i8));
14491 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14494 /// Helper function that returns true if the shuffle mask should be
14495 /// commuted to improve canonicalization.
14496 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14497 int NumElements = Mask.size();
14499 int NumV1Elements = 0, NumV2Elements = 0;
14503 else if (M < NumElements)
14508 // Commute the shuffle as needed such that more elements come from V1 than
14509 // V2. This allows us to match the shuffle pattern strictly on how many
14510 // elements come from V1 without handling the symmetric cases.
14511 if (NumV2Elements > NumV1Elements)
14514 assert(NumV1Elements > 0 && "No V1 indices");
14516 if (NumV2Elements == 0)
14519 // When the number of V1 and V2 elements is the same, try to minimize the
14520 // number of uses of V2 in the low half of the vector. When that is tied,
14521 // ensure that the sum of indices for V1 is equal to or lower than the sum of
14522 // indices for V2. When those are equal, try to ensure that the number of odd
14523 // indices for V1 is lower than the number of odd indices for V2.
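// For example, with the v4 mask {4, 5, 0, 1} both inputs contribute two
// elements, but the low half uses only V2, so the shuffle is commuted and the
// mask becomes {0, 1, 4, 5}.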
14524 if (NumV1Elements == NumV2Elements) {
14525 int LowV1Elements = 0, LowV2Elements = 0;
14526 for (int M : Mask.slice(0, NumElements / 2))
14527 if (M >= NumElements)
14531 if (LowV2Elements > LowV1Elements)
14533 if (LowV2Elements == LowV1Elements) {
14534 int SumV1Indices = 0, SumV2Indices = 0;
14535 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14536 if (Mask[i] >= NumElements)
14538 else if (Mask[i] >= 0)
14540 if (SumV2Indices < SumV1Indices)
14542 if (SumV2Indices == SumV1Indices) {
14543 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14544 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14545 if (Mask[i] >= NumElements)
14546 NumV2OddIndices += i % 2;
14547 else if (Mask[i] >= 0)
14548 NumV1OddIndices += i % 2;
14549 if (NumV2OddIndices < NumV1OddIndices)
14558 /// \brief Top-level lowering for x86 vector shuffles.
14560 /// This handles decomposition, canonicalization, and lowering of all x86
14561 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14562 /// above in helper routines. The canonicalization attempts to widen shuffles
14563 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14564 /// s.t. only one of the two inputs needs to be tested, etc.
14565 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14566 SelectionDAG &DAG) {
14567 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14568 ArrayRef<int> Mask = SVOp->getMask();
14569 SDValue V1 = Op.getOperand(0);
14570 SDValue V2 = Op.getOperand(1);
14571 MVT VT = Op.getSimpleValueType();
14572 int NumElements = VT.getVectorNumElements();
14574 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14576 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14577 "Can't lower MMX shuffles");
14579 bool V1IsUndef = V1.isUndef();
14580 bool V2IsUndef = V2.isUndef();
14581 if (V1IsUndef && V2IsUndef)
14582 return DAG.getUNDEF(VT);
14584 // When we create a shuffle node we put the UNDEF node as the second operand,
14585 // but in some cases the first operand may be transformed to UNDEF.
14586 // In that case we should just commute the node.
14588 return DAG.getCommutedVectorShuffle(*SVOp);
14590 // Check for non-undef masks pointing at an undef vector and make the masks
14591 // undef as well. This makes it easier to match the shuffle based solely on
14592 // the mask.
14595 if (M >= NumElements) {
14596 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14597 for (int &M : NewMask)
14598 if (M >= NumElements)
14600 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14603 // Check for illegal shuffle mask element index values.
14604 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14605 assert(llvm::all_of(Mask,
14606 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14607 "Out of bounds shuffle index");
14609 // We actually see shuffles that are entirely re-arrangements of a set of
14610 // zero inputs. This mostly happens while decomposing complex shuffles into
14611 // simple ones. Directly lower these as a buildvector of zeros.
14612 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14613 if (Zeroable.isAllOnesValue())
14614 return getZeroVector(VT, Subtarget, DAG, DL);
14616 // Try to collapse shuffles into using a vector type with fewer elements but
14617 // wider element types. We cap this to not form integers or floating point
14618 // elements wider than 64 bits, but it might be interesting to form i128
14619 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
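// For example, a v4i32 shuffle with mask {0, 1, 6, 7} can be widened to a
// v2i64 shuffle with mask {0, 3}, since each pair of adjacent elements moves
// together.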
14620 SmallVector<int, 16> WidenedMask;
14621 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14622 canWidenShuffleElements(Mask, WidenedMask)) {
14623 MVT NewEltVT = VT.isFloatingPoint()
14624 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14625 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14626 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14627 // Make sure that the new vector type is legal. For example, v2f64 isn't
14628 // legal on SSE1.
14629 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14630 V1 = DAG.getBitcast(NewVT, V1);
14631 V2 = DAG.getBitcast(NewVT, V2);
14632 return DAG.getBitcast(
14633 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14637 // Commute the shuffle if it will improve canonicalization.
14638 if (canonicalizeShuffleMaskWithCommute(Mask))
14639 return DAG.getCommutedVectorShuffle(*SVOp);
14641 // For each vector width, delegate to a specialized lowering routine.
14642 if (VT.is128BitVector())
14643 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14646 if (VT.is256BitVector())
14647 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14650 if (VT.is512BitVector())
14651 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14655 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14658 llvm_unreachable("Unimplemented!");
14661 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14662 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14663 const X86Subtarget &Subtarget,
14664 SelectionDAG &DAG) {
14665 SDValue Cond = Op.getOperand(0);
14666 SDValue LHS = Op.getOperand(1);
14667 SDValue RHS = Op.getOperand(2);
14669 MVT VT = Op.getSimpleValueType();
14671 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14673 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14675 // Only non-legal VSELECTs reach this lowering; convert those into generic
14676 // shuffles and re-use the shuffle lowering path for blends.
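// For example, a v4i32 VSELECT whose condition is <i1 1, i1 0, i1 1, i1 0>
// becomes the shuffle mask {0, 5, 2, 7}: true lanes pick the LHS element and
// false lanes pick the corresponding RHS element.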
14677 SmallVector<int, 32> Mask;
14678 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14679 SDValue CondElt = CondBV->getOperand(i);
14681 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14684 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14687 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14688 // A vselect where all conditions and data are constants can be optimized into
14689 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14690 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14691 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14692 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14695 // Try to lower this to a blend-style vector shuffle. This can handle all
14696 // constant condition cases.
14697 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14700 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14701 // with patterns on the mask registers on AVX-512.
14702 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14705 // Variable blends are only legal from SSE4.1 onward.
14706 if (!Subtarget.hasSSE41())
14710 MVT VT = Op.getSimpleValueType();
14712 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14713 // into an i1 condition so that we can use the mask-based 512-bit blend
14714 // instructions.
14715 if (VT.getSizeInBits() == 512) {
14716 SDValue Cond = Op.getOperand(0);
14717 // The vNi1 condition case should be handled above as it can be trivially
14718 // lowered.
14719 assert(Cond.getValueType().getScalarSizeInBits() ==
14720 VT.getScalarSizeInBits() &&
14721 "Should have a size-matched integer condition!");
14722 // Build a mask by testing the condition against zero.
14723 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14724 SDValue Mask = DAG.getNode(X86ISD::CMPM, dl, MaskVT, Cond,
14725 getZeroVector(VT, Subtarget, DAG, dl),
14726 DAG.getConstant(4, dl, MVT::i8));
14727 // Now return a new VSELECT using the mask.
14728 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14731 // Only some types will be legal on some subtargets. If we can emit a legal
14732 // VSELECT-matching blend, return Op; but if we need to expand, return
14733 // a null value.
14734 switch (VT.SimpleTy) {
14736 // Most of the vector types have blends past SSE4.1.
14740 // The byte blends for AVX vectors were introduced only in AVX2.
14741 if (Subtarget.hasAVX2())
14748 // FIXME: We should custom lower this by fixing the condition and using i8
14754 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14755 MVT VT = Op.getSimpleValueType();
14758 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14761 if (VT.getSizeInBits() == 8) {
14762 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14763 Op.getOperand(0), Op.getOperand(1));
14764 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14767 if (VT == MVT::f32) {
14768 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14769 // the result back to an FR32 register. It's only worth matching if the
14770 // result has a single use which is a store or a bitcast to i32. And in
14771 // the case of a store, it's not worth it if the index is a constant 0,
14772 // because a MOVSSmr can be used instead, which is smaller and faster.
14773 if (!Op.hasOneUse())
14775 SDNode *User = *Op.getNode()->use_begin();
14776 if ((User->getOpcode() != ISD::STORE ||
14777 isNullConstant(Op.getOperand(1))) &&
14778 (User->getOpcode() != ISD::BITCAST ||
14779 User->getValueType(0) != MVT::i32))
14781 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14782 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14784 return DAG.getBitcast(MVT::f32, Extract);
14787 if (VT == MVT::i32 || VT == MVT::i64) {
14788 // EXTRACTPS/PEXTRQ work with a constant index.
14789 if (isa<ConstantSDNode>(Op.getOperand(1)))
14796 /// Extract one bit from mask vector, like v16i1 or v8i1.
14797 /// AVX-512 feature.
14798 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
14799 const X86Subtarget &Subtarget) {
14800 SDValue Vec = Op.getOperand(0);
14802 MVT VecVT = Vec.getSimpleValueType();
14803 SDValue Idx = Op.getOperand(1);
14804 MVT EltVT = Op.getSimpleValueType();
14806 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14807 "Unexpected vector type in ExtractBitFromMaskVector");
14809 // A variable index can't be handled in mask registers, so
14810 // extend the vector to VR512/VR128.
14811 if (!isa<ConstantSDNode>(Idx)) {
14812 unsigned NumElts = VecVT.getVectorNumElements();
14813 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
14814 // than extending to 128/256 bits.
14815 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
14816 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
14817 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
14818 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
14819 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14822 // Canonicalize result type to MVT::i32.
14823 if (EltVT != MVT::i32) {
14824 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14826 return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
14829 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14831 // Extracts from element 0 are always allowed.
14835 // If the kshift instructions of the correct width aren't natively supported
14836 // then we need to promote the vector to the native size to get the correct
14837 // zeroing behavior.
14838 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14839 (VecVT.getVectorNumElements() < 8)) {
14840 VecVT = MVT::v16i1;
14841 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14842 DAG.getUNDEF(VecVT),
14844 DAG.getIntPtrConstant(0, dl));
14847 // Use kshiftr instruction to move to the lower element.
14848 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14849 DAG.getConstant(IdxVal, dl, MVT::i8));
14850 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
14851 DAG.getIntPtrConstant(0, dl));
14855 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14856 SelectionDAG &DAG) const {
14858 SDValue Vec = Op.getOperand(0);
14859 MVT VecVT = Vec.getSimpleValueType();
14860 SDValue Idx = Op.getOperand(1);
14862 if (VecVT.getVectorElementType() == MVT::i1)
14863 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
14865 if (!isa<ConstantSDNode>(Idx)) {
14866 // It's more profitable to go through memory (1 cycle throughput)
14867 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14868 // The IACA tool was used to get the performance estimate
14869 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14871 // example : extractelement <16 x i8> %a, i32 %i
14873 // Block Throughput: 3.00 Cycles
14874 // Throughput Bottleneck: Port5
14876 // | Num Of | Ports pressure in cycles | |
14877 // | Uops | 0 - DV | 5 | 6 | 7 | |
14878 // ---------------------------------------------
14879 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14880 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14881 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14882 // Total Num Of Uops: 4
14885 // Block Throughput: 1.00 Cycles
14886 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14888 // | | Ports pressure in cycles | |
14889 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14890 // ---------------------------------------------------------
14891 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14892 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14893 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14894 // Total Num Of Uops: 4
14899 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14901 // If this is a 256-bit vector result, first extract the 128-bit vector and
14902 // then extract the element from the 128-bit vector.
14903 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14904 // Get the 128-bit vector.
14905 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14906 MVT EltVT = VecVT.getVectorElementType();
14908 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14909 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14911 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14912 // this can be done with a mask.
14913 IdxVal &= ElemsPerChunk - 1;
14914 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14915 DAG.getConstant(IdxVal, dl, MVT::i32));
14918 assert(VecVT.is128BitVector() && "Unexpected vector length");
14920 MVT VT = Op.getSimpleValueType();
14922 if (VT.getSizeInBits() == 16) {
14923 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14924 // we're going to zero extend the register or fold the store (SSE41 only).
14925 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14926 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14927 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14928 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14929 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14931 // Transform it so it matches pextrw, which produces a 32-bit result.
14932 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14933 Op.getOperand(0), Op.getOperand(1));
14934 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14937 if (Subtarget.hasSSE41())
14938 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14941 // TODO: We only extract a single element from v16i8; we can probably afford
14942 // to be more aggressive here before using the default approach of spilling to
14943 // stack.
14944 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14945 // Extract either the lowest i32 or any i16, and extract the sub-byte.
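// For example, extracting byte 3 goes through dword 0: extract the low i32,
// shift right by 24, and truncate to i8. Extracting byte 9 instead goes
// through word 4 with an 8-bit shift.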
14946 int DWordIdx = IdxVal / 4;
14947 if (DWordIdx == 0) {
14948 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14949 DAG.getBitcast(MVT::v4i32, Vec),
14950 DAG.getIntPtrConstant(DWordIdx, dl));
14951 int ShiftVal = (IdxVal % 4) * 8;
14953 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14954 DAG.getConstant(ShiftVal, dl, MVT::i32));
14955 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14958 int WordIdx = IdxVal / 2;
14959 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14960 DAG.getBitcast(MVT::v8i16, Vec),
14961 DAG.getIntPtrConstant(WordIdx, dl));
14962 int ShiftVal = (IdxVal % 2) * 8;
14964 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14965 DAG.getConstant(ShiftVal, dl, MVT::i16));
14966 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14969 if (VT.getSizeInBits() == 32) {
14973 // SHUFPS the element to the lowest double word, then movss.
14974 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14975 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14976 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14977 DAG.getIntPtrConstant(0, dl));
14980 if (VT.getSizeInBits() == 64) {
14981 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14982 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14983 // to match extract_elt for f64.
14987 // UNPCKHPD the element to the lowest double word, then movsd.
14988 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14989 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14990 int Mask[2] = { 1, -1 };
14991 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14992 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14993 DAG.getIntPtrConstant(0, dl));
14999 /// Insert one bit to mask vector, like v16i1 or v8i1.
15000 /// AVX-512 feature.
15001 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15002 const X86Subtarget &Subtarget) {
15004 SDValue Vec = Op.getOperand(0);
15005 SDValue Elt = Op.getOperand(1);
15006 SDValue Idx = Op.getOperand(2);
15007 MVT VecVT = Vec.getSimpleValueType();
15009 if (!isa<ConstantSDNode>(Idx)) {
15010 // Non-constant index. Extend the source and destination,
15011 // insert the element, and then truncate the result.
15012 unsigned NumElts = VecVT.getVectorNumElements();
15013 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15014 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15015 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15016 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15017 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15018 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15021 // Copy into a k-register, extract to v1i1 and insert_subvector.
15022 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15024 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15028 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15029 SelectionDAG &DAG) const {
15030 MVT VT = Op.getSimpleValueType();
15031 MVT EltVT = VT.getVectorElementType();
15032 unsigned NumElts = VT.getVectorNumElements();
15034 if (EltVT == MVT::i1)
15035 return InsertBitToMaskVector(Op, DAG, Subtarget);
15038 SDValue N0 = Op.getOperand(0);
15039 SDValue N1 = Op.getOperand(1);
15040 SDValue N2 = Op.getOperand(2);
15041 if (!isa<ConstantSDNode>(N2))
15043 auto *N2C = cast<ConstantSDNode>(N2);
15044 unsigned IdxVal = N2C->getZExtValue();
15046 bool IsZeroElt = X86::isZeroNode(N1);
15047 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15049 // If we are inserting an element, see if we can do this more efficiently with
15050 // a blend shuffle with a rematerializable vector than with a costly integer
15051 // insertion.
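// For example, inserting a zero into element 2 of a v4i32 builds the blend
// mask {0, 1, 6, 3} against a zero vector, which the shuffle lowering can
// emit as a single blend.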
15052 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15053 16 <= EltVT.getSizeInBits()) {
15054 SmallVector<int, 8> BlendMask;
15055 for (unsigned i = 0; i != NumElts; ++i)
15056 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15057 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15058 : getOnesVector(VT, DAG, dl);
15059 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15062 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15063 // into that, and then insert the subvector back into the result.
15064 if (VT.is256BitVector() || VT.is512BitVector()) {
15065 // With a 256-bit vector, we can insert into the zero element efficiently
15066 // using a blend if we have AVX or AVX2 and the right data type.
15067 if (VT.is256BitVector() && IdxVal == 0) {
15068 // TODO: It is worthwhile to cast integer to floating point and back
15069 // and incur a domain crossing penalty if that's what we'll end up
15070 // doing anyway after extracting to a 128-bit vector.
15071 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15072 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15073 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15074 N2 = DAG.getIntPtrConstant(1, dl);
15075 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15079 // Get the desired 128-bit vector chunk.
15080 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15082 // Insert the element into the desired chunk.
15083 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15084 assert(isPowerOf2_32(NumEltsIn128));
15085 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
15086 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15088 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15089 DAG.getConstant(IdxIn128, dl, MVT::i32));
15091 // Insert the changed part back into the bigger vector
15092 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15094 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15096 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
15097 // argument. SSE41 required for pinsrb.
15098 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15100 if (VT == MVT::v8i16) {
15101 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15102 Opc = X86ISD::PINSRW;
15104 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15105 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15106 Opc = X86ISD::PINSRB;
15109 if (N1.getValueType() != MVT::i32)
15110 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15111 if (N2.getValueType() != MVT::i32)
15112 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15113 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15116 if (Subtarget.hasSSE41()) {
15117 if (EltVT == MVT::f32) {
15118 // Bits [7:6] of the constant are the source select. This will always be
15119 // zero here. The DAG Combiner may combine an extract_elt index into
15120 // these bits. For example (insert (extract, 3), 2) could be matched by
15121 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15122 // Bits [5:4] of the constant are the destination select. This is the
15123 // value of the incoming immediate.
15124 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15125 // combine either bitwise AND or insert of float 0.0 to set these bits.
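// For example, an insertion into element 2 with no zeroing uses the
// immediate (2 << 4) = 0x20 in the INSERTPS path below.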
15127 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15128 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15129 // If this is an insertion of 32-bits into the low 32-bits of
15130 // a vector, we prefer to generate a blend with immediate rather
15131 // than an insertps. Blends are simpler operations in hardware and so
15132 // will always have equal or better performance than insertps.
15133 // But if optimizing for size and there's a load folding opportunity,
15134 // generate insertps because blendps does not have a 32-bit memory
15136 N2 = DAG.getIntPtrConstant(1, dl);
15137 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15138 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15140 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15141 // Create this as a scalar to vector.
15142 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15143 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15146 // PINSR* works with constant index.
15147 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15154 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15155 SelectionDAG &DAG) {
15157 MVT OpVT = Op.getSimpleValueType();
15159 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
15160 // further DAG combines.
15161 if (X86::isZeroNode(Op.getOperand(0)))
15162 return getZeroVector(OpVT, Subtarget, DAG, dl);
15164 // If this is a 256-bit vector result, first insert into a 128-bit
15165 // vector and then insert into the 256-bit vector.
15166 if (!OpVT.is128BitVector()) {
15167 // Insert into a 128-bit vector.
15168 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15169 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15170 OpVT.getVectorNumElements() / SizeFactor);
15172 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15174 // Insert the 128-bit vector.
15175 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15177 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15179 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15180 if (OpVT == MVT::v4i32)
15183 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15184 return DAG.getBitcast(
15185 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15188 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15189 // simple superregister reference or explicit instructions to insert
15190 // the upper bits of a vector.
15191 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15192 SelectionDAG &DAG) {
15193 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15195 return insert1BitVector(Op, DAG, Subtarget);
15198 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15199 SelectionDAG &DAG) {
15200 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15201 "Only vXi1 extract_subvectors need custom lowering");
15204 SDValue Vec = Op.getOperand(0);
15205 SDValue Idx = Op.getOperand(1);
15207 if (!isa<ConstantSDNode>(Idx))
15210 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15211 if (IdxVal == 0) // the operation is legal
15214 MVT VecVT = Vec.getSimpleValueType();
15215 unsigned NumElems = VecVT.getVectorNumElements();
15217 // Extend to natively supported kshift.
15218 MVT WideVecVT = VecVT;
15219 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15220 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15221 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15222 DAG.getUNDEF(WideVecVT), Vec,
15223 DAG.getIntPtrConstant(0, dl));
15226 // Shift to the LSB.
15227 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15228 DAG.getConstant(IdxVal, dl, MVT::i8));
15230 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15231 DAG.getIntPtrConstant(0, dl));
15234 // Returns the appropriate wrapper opcode for a global reference.
15235 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
15236 // References to absolute symbols are never PC-relative.
15237 if (GV && GV->isAbsoluteSymbolRef())
15238 return X86ISD::Wrapper;
15240 CodeModel::Model M = getTargetMachine().getCodeModel();
15241 if (Subtarget.isPICStyleRIPRel() &&
15242 (M == CodeModel::Small || M == CodeModel::Kernel))
15243 return X86ISD::WrapperRIP;
15245 return X86ISD::Wrapper;
15248 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15249 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15250 // one of the above mentioned nodes. It has to be wrapped because otherwise
15251 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15252 // be used to form addressing modes. These wrapped nodes will be selected
15253 // into MOV32ri.
15255 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15256 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15258 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15259 // global base reg.
15260 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15262 auto PtrVT = getPointerTy(DAG.getDataLayout());
15263 SDValue Result = DAG.getTargetConstantPool(
15264 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15266 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15267 // With PIC, the address is actually $g + Offset.
15270 DAG.getNode(ISD::ADD, DL, PtrVT,
15271 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15277 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15278 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15280 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15281 // global base reg.
15282 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15284 auto PtrVT = getPointerTy(DAG.getDataLayout());
15285 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15287 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15289 // With PIC, the address is actually $g + Offset.
15292 DAG.getNode(ISD::ADD, DL, PtrVT,
15293 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15299 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15300 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15302 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15303 // global base reg.
15304 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15305 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15307 auto PtrVT = getPointerTy(DAG.getDataLayout());
15308 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15311 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15313 // With PIC, the address is actually $g + Offset.
15314 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15316 DAG.getNode(ISD::ADD, DL, PtrVT,
15317 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15320 // For symbols that require a load from a stub to get the address, emit the
15321 // load.
15322 if (isGlobalStubReference(OpFlag))
15323 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15324 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15330 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15331 // Create the TargetBlockAddress node.
15332 unsigned char OpFlags =
15333 Subtarget.classifyBlockAddressReference();
15334 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15335 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15337 auto PtrVT = getPointerTy(DAG.getDataLayout());
15338 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15339 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15341 // With PIC, the address is actually $g + Offset.
15342 if (isGlobalRelativeToPICBase(OpFlags)) {
15343 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15344 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15350 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15351 const SDLoc &dl, int64_t Offset,
15352 SelectionDAG &DAG) const {
15353 // Create the TargetGlobalAddress node, folding in the constant
15354 // offset if it is legal.
15355 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15356 CodeModel::Model M = DAG.getTarget().getCodeModel();
15357 auto PtrVT = getPointerTy(DAG.getDataLayout());
15359 if (OpFlags == X86II::MO_NO_FLAG &&
15360 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15361 // A direct static reference to a global.
15362 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15365 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15368 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15370 // With PIC, the address is actually $g + Offset.
15371 if (isGlobalRelativeToPICBase(OpFlags)) {
15372 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15373 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15376 // For globals that require a load from a stub to get the address, emit the
15377 // load.
15378 if (isGlobalStubReference(OpFlags))
15379 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15380 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15382 // If there was a non-zero offset that we didn't fold, create an explicit
15383 // addition for it.
15385 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15386 DAG.getConstant(Offset, dl, PtrVT));
15392 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15393 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15394 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15395 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15399 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15400 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15401 unsigned char OperandFlags, bool LocalDynamic = false) {
15402 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15403 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15405 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15406 GA->getValueType(0),
15410 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15414 SDValue Ops[] = { Chain, TGA, *InFlag };
15415 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15417 SDValue Ops[] = { Chain, TGA };
15418 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15421 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
15422 MFI.setAdjustsStack(true);
15423 MFI.setHasCalls(true);
15425 SDValue Flag = Chain.getValue(1);
15426 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15429 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
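// For reference (illustrative only; exact relocations and scheduling may
// differ), the 32-bit general-dynamic access ultimately becomes roughly:
//   leal x@TLSGD(,%ebx,1), %eax
//   call ___tls_get_addr@PLT      // address of x returned in %eax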
15431 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15434 SDLoc dl(GA); // ? function entry point might be better
15435 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15436 DAG.getNode(X86ISD::GlobalBaseReg,
15437 SDLoc(), PtrVT), InFlag);
15438 InFlag = Chain.getValue(1);
15440 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15443 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
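// For reference (illustrative; the real encoding also carries the padding
// prefixes required for linker TLS relaxation), the 64-bit sequence is roughly:
//   leaq x@TLSGD(%rip), %rdi
//   call __tls_get_addr@PLT       // address of x returned in %rax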
15445 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15447 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15448 X86::RAX, X86II::MO_TLSGD);
15451 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15457 // Get the start address of the TLS block for this module.
15458 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15459 .getInfo<X86MachineFunctionInfo>();
15460 MFI->incNumLocalDynamicTLSAccesses();
15464 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15465 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15468 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15469 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15470 InFlag = Chain.getValue(1);
15471 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15472 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15475 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15479 unsigned char OperandFlags = X86II::MO_DTPOFF;
15480 unsigned WrapperKind = X86ISD::Wrapper;
15481 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15482 GA->getValueType(0),
15483 GA->getOffset(), OperandFlags);
15484 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15486 // Add x@dtpoff with the base.
15487 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15490 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15491 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15492 const EVT PtrVT, TLSModel::Model model,
15493 bool is64Bit, bool isPIC) {
15496 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15497 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15498 is64Bit ? 257 : 256));
15500 SDValue ThreadPointer =
15501 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15502 MachinePointerInfo(Ptr));
15504 unsigned char OperandFlags = 0;
15505 // Most TLS accesses are not RIP relative, even on x86-64. One exception is the 64-bit initial-exec model, which uses a RIP-relative GOT load.
15507 unsigned WrapperKind = X86ISD::Wrapper;
15508 if (model == TLSModel::LocalExec) {
15509 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15510 } else if (model == TLSModel::InitialExec) {
15512 OperandFlags = X86II::MO_GOTTPOFF;
15513 WrapperKind = X86ISD::WrapperRIP;
15515 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15518 llvm_unreachable("Unexpected model");
15521 // emit "addl x@ntpoff,%eax" (local exec)
15522 // or "addl x@indntpoff,%eax" (initial exec)
15523 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15525 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15526 GA->getOffset(), OperandFlags);
15527 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15529 if (model == TLSModel::InitialExec) {
15530 if (isPIC && !is64Bit) {
15531 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15532 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15536 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15537 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15540 // The address of the thread local variable is the add of the thread
15541 // pointer with the offset of the variable.
15542 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15546 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15548 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15550 if (DAG.getTarget().Options.EmulatedTLS)
15551 return LowerToTLSEmulatedModel(GA, DAG);
15553 const GlobalValue *GV = GA->getGlobal();
15554 auto PtrVT = getPointerTy(DAG.getDataLayout());
15555 bool PositionIndependent = isPositionIndependent();
15557 if (Subtarget.isTargetELF()) {
15558 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15560 case TLSModel::GeneralDynamic:
15561 if (Subtarget.is64Bit())
15562 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15563 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15564 case TLSModel::LocalDynamic:
15565 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15566 Subtarget.is64Bit());
15567 case TLSModel::InitialExec:
15568 case TLSModel::LocalExec:
15569 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15570 PositionIndependent);
15572 llvm_unreachable("Unknown TLS model.");
15575 if (Subtarget.isTargetDarwin()) {
15576 // Darwin only has one model of TLS. Lower to that.
15577 unsigned char OpFlag = 0;
15578 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15579 X86ISD::WrapperRIP : X86ISD::Wrapper;
15581 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15582 // global base reg.
15583 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15585 OpFlag = X86II::MO_TLVP_PIC_BASE;
15587 OpFlag = X86II::MO_TLVP;
15589 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15590 GA->getValueType(0),
15591 GA->getOffset(), OpFlag);
15592 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15594 // With PIC32, the address is actually $g + Offset.
15596 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15597 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15600 // Lowering the machine isd will make sure everything is in the right place.
15602 SDValue Chain = DAG.getEntryNode();
15603 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15604 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15605 SDValue Args[] = { Chain, Offset };
15606 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15607 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15608 DAG.getIntPtrConstant(0, DL, true),
15609 Chain.getValue(1), DL);
15611 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
15612 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15613 MFI.setAdjustsStack(true);
15615 // And our return value (tls address) is in the standard call return value
15617 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15618 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15621 if (Subtarget.isTargetKnownWindowsMSVC() ||
15622 Subtarget.isTargetWindowsItanium() ||
15623 Subtarget.isTargetWindowsGNU()) {
15624 // Just use the implicit TLS architecture
15625 // Need to generate something similar to:
15626 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15628 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15629 // mov rcx, qword [rdx+rcx*8]
15630 // mov eax, .tls$:tlsvar
15631 // [rax+rcx] contains the address
15632 // Windows 64bit: gs:0x58
15633 // Windows 32bit: fs:__tls_array
15636 SDValue Chain = DAG.getEntryNode();
15638 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15639 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15640 // use its literal value of 0x2C.
15641 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15642 ? Type::getInt8PtrTy(*DAG.getContext(),
15644 : Type::getInt32PtrTy(*DAG.getContext(),
15647 SDValue TlsArray = Subtarget.is64Bit()
15648 ? DAG.getIntPtrConstant(0x58, dl)
15649 : (Subtarget.isTargetWindowsGNU()
15650 ? DAG.getIntPtrConstant(0x2C, dl)
15651 : DAG.getExternalSymbol("_tls_array", PtrVT));
15653 SDValue ThreadPointer =
15654 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15657 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15658 res = ThreadPointer;
15660 // Load the _tls_index variable
15661 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15662 if (Subtarget.is64Bit())
15663 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15664 MachinePointerInfo(), MVT::i32);
15666 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15668 auto &DL = DAG.getDataLayout();
15670 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15671 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15673 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15676 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15678 // Get the offset of the start of the .tls section.
15679 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15680 GA->getValueType(0),
15681 GA->getOffset(), X86II::MO_SECREL);
15682 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15684 // The address of the thread local variable is the add of the thread
15685 // pointer with the offset of the variable.
15686 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15689 llvm_unreachable("TLS not implemented for this target.");
15692 /// Lower SRA_PARTS and friends, which return two i32 values
15693 /// and take a 2 x i32 value to shift plus a shift amount.
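/// For illustration (a sketch, not normative), an i64 SRL_PARTS on a 32-bit
/// target is lowered roughly as:
///   SafeAmt = Amt & 31;
///   Lo = SHRD(Lo, Hi, Amt);   Hi' = SRL(Hi, SafeAmt);
///   if (Amt & 32) { Lo = Hi'; Hi = Fill; } else { Hi = Hi'; }
/// where Fill is 0 for SRL_PARTS and the sign-replicated Hi for SRA_PARTS;
/// the final selects are emitted as CMOVs on the (Amt & 32) test below.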
15694 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15695 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15696 MVT VT = Op.getSimpleValueType();
15697 unsigned VTBits = VT.getSizeInBits();
15699 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15700 SDValue ShOpLo = Op.getOperand(0);
15701 SDValue ShOpHi = Op.getOperand(1);
15702 SDValue ShAmt = Op.getOperand(2);
15703 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15704 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away anyway.
15706 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15707 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15708 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15709 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15710 : DAG.getConstant(0, dl, VT);
15712 SDValue Tmp2, Tmp3;
15713 if (Op.getOpcode() == ISD::SHL_PARTS) {
15714 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15715 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15717 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15718 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15721 // If the shift amount is larger than or equal to the width of a part, we can't
15722 // rely on the results of shld/shrd. Insert a test and select the appropriate
15723 // values for large shift amounts.
15724 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15725 DAG.getConstant(VTBits, dl, MVT::i8));
15726 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15727 AndNode, DAG.getConstant(0, dl, MVT::i8));
15730 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15731 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15732 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15734 if (Op.getOpcode() == ISD::SHL_PARTS) {
15735 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15736 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15738 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15739 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15742 SDValue Ops[2] = { Lo, Hi };
15743 return DAG.getMergeValues(Ops, dl);
15746 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15747 SelectionDAG &DAG) const {
15748 SDValue Src = Op.getOperand(0);
15749 MVT SrcVT = Src.getSimpleValueType();
15750 MVT VT = Op.getSimpleValueType();
15753 if (SrcVT.isVector()) {
15754 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15755 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15756 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15757 DAG.getUNDEF(SrcVT)));
15759 if (SrcVT == MVT::v2i1) {
15760 // For v2i1, we need to widen to v4i1 first.
15761 assert(VT == MVT::v2f64 && "Unexpected type");
15762 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Src,
15763 DAG.getUNDEF(MVT::v2i1));
15764 return DAG.getNode(X86ISD::CVTSI2P, dl, Op.getValueType(),
15765 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Src));
15770 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15771 "Unknown SINT_TO_FP to lower!");
15773 // These are really Legal; return the operand so the caller accepts it as Legal.
15775 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15777 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15778 Subtarget.is64Bit()) {
15782 SDValue ValueToStore = Op.getOperand(0);
15783 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15784 !Subtarget.is64Bit())
15785 // Bitcasting to f64 here allows us to do a single 64-bit store from
15786 // an SSE register, avoiding the store forwarding penalty that would come
15787 // with two 32-bit stores.
15788 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15790 unsigned Size = SrcVT.getSizeInBits()/8;
15791 MachineFunction &MF = DAG.getMachineFunction();
15792 auto PtrVT = getPointerTy(MF.getDataLayout());
15793 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15794 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15795 SDValue Chain = DAG.getStore(
15796 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15797 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15798 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15801 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15803 SelectionDAG &DAG) const {
15807 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15809 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15811 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15813 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15815 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15816 MachineMemOperand *MMO;
15818 int SSFI = FI->getIndex();
15819 MMO = DAG.getMachineFunction().getMachineMemOperand(
15820 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15821 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15823 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15824 StackSlot = StackSlot.getOperand(1);
15826 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15827 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15829 Tys, Ops, SrcVT, MMO);
15832 Chain = Result.getValue(1);
15833 SDValue InFlag = Result.getValue(2);
15835 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15836 // shouldn't be necessary except that RFP cannot be live across
15837 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15838 MachineFunction &MF = DAG.getMachineFunction();
15839 unsigned SSFISize = Op.getValueSizeInBits()/8;
15840 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15841 auto PtrVT = getPointerTy(MF.getDataLayout());
15842 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15843 Tys = DAG.getVTList(MVT::Other);
15845 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15847 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15848 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15849 MachineMemOperand::MOStore, SSFISize, SSFISize);
15851 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15852 Ops, Op.getValueType(), MMO);
15853 Result = DAG.getLoad(
15854 Op.getValueType(), DL, Chain, StackSlot,
15855 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15861 /// 64-bit unsigned integer to double expansion.
15862 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
15863 const X86Subtarget &Subtarget) {
15864 // This algorithm is not obvious. Here is what we're trying to output:
15867 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15868 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15870 haddpd %xmm0, %xmm0
15872 pshufd $0x4e, %xmm0, %xmm1
15878 LLVMContext *Context = DAG.getContext();
15880 // Build some magic constants.
15881 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15882 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15883 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
15884 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15886 SmallVector<Constant*,2> CV1;
15888 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15889 APInt(64, 0x4330000000000000ULL))));
15891 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15892 APInt(64, 0x4530000000000000ULL))));
15893 Constant *C1 = ConstantVector::get(CV1);
15894 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
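// Sketch of why these magic constants work (editorial note, derived from the
// listing above): after the unpack, lane0 holds the low u32 in the mantissa of
// 0x1.0p52 (i.e. 2^52 + lo) and lane1 holds the high u32 in the mantissa of
// 0x1.0p84 (i.e. 2^84 + hi * 2^32). Subtracting the matching biases from C1
// leaves (double)lo and (double)hi * 2^32, whose horizontal sum is the
// original 64-bit value (up to the single rounding of the final add).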
15896 // Load the 64-bit value into an XMM register.
15897 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15900 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15901 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15902 /* Alignment = */ 16);
15904 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15907 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15908 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15909 /* Alignment = */ 16);
15910 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15911 // TODO: Are there any fast-math-flags to propagate here?
15912 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15915 if (Subtarget.hasSSE3()) {
15916 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15917 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15919 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15920 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15921 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15922 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15925 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15926 DAG.getIntPtrConstant(0, dl));
15929 /// 32-bit unsigned integer to float expansion.
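/// The idea (a sketch): OR the 32-bit value into the mantissa bits of the
/// bias double 0x1.0p52 (0x4330000000000000), giving exactly 2^52 + x, then
/// subtract the same bias so the difference is (double)x, and finally round
/// to the requested result type.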
15930 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
15931 const X86Subtarget &Subtarget) {
15933 // FP constant to bias correct the final result.
15934 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15937 // Load the 32-bit value into an XMM register.
15938 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15941 // Zero out the upper parts of the register.
15942 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15944 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15945 DAG.getBitcast(MVT::v2f64, Load),
15946 DAG.getIntPtrConstant(0, dl));
15948 // Or the load with the bias.
15949 SDValue Or = DAG.getNode(
15950 ISD::OR, dl, MVT::v2i64,
15951 DAG.getBitcast(MVT::v2i64,
15952 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15953 DAG.getBitcast(MVT::v2i64,
15954 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15956 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15957 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15959 // Subtract the bias.
15960 // TODO: Are there any fast-math-flags to propagate here?
15961 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15963 // Handle final rounding.
15964 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
15967 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15968 const X86Subtarget &Subtarget,
15970 if (Op.getSimpleValueType() != MVT::v2f64)
15973 SDValue N0 = Op.getOperand(0);
15974 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15976 // Legalize to v4i32 type.
15977 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15978 DAG.getUNDEF(MVT::v2i32));
15980 if (Subtarget.hasAVX512())
15981 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15983 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15984 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
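// In scalar terms, each lane becomes (illustrative):
//   (double)(v >> 16) * 65536.0 + (double)(v & 0xffff)
// Both halves fit in 16 bits, so the signed CVTSI2P conversions are exact.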
15985 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15986 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15988 // Two to the power of half-word-size.
15989 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15991 // Clear the upper half of each lane to form LO; shift right to form HI.
15992 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15993 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15995 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15996 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15997 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15999 // Add the two halves.
16000 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
16003 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16004 const X86Subtarget &Subtarget) {
16005 // The algorithm is the following:
16006 // #ifdef __SSE4_1__
16007 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16008 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16009 // (uint4) 0x53000000, 0xaa);
16011 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16012 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16014 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16015 // return (float4) lo + fhi;
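// Why this works (editorial sketch, derived from the listing above):
// lo | 0x4b000000 is the float 2^23 + (v & 0xffff) and hi | 0x53000000 is the
// float 2^39 + (v >> 16) * 2^16, both exact because the 16-bit halves land in
// the mantissa. Subtracting (0x1.0p39f + 0x1.0p23f) from hi and then adding lo
// reconstructs (float)v with a single rounding step.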
16017 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16018 // reassociate the two FADDs, and if we do that, the algorithm fails
16019 // spectacularly (PR24512).
16020 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16021 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16022 // there's also the MachineCombiner reassociations happening on Machine IR.
16023 if (DAG.getTarget().Options.UnsafeFPMath)
16027 SDValue V = Op->getOperand(0);
16028 MVT VecIntVT = V.getSimpleValueType();
16029 bool Is128 = VecIntVT == MVT::v4i32;
16030 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16031 // If we convert to something else than the supported type, e.g., to v4f64,
16033 if (VecFloatVT != Op->getSimpleValueType(0))
16036 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16037 "Unsupported custom type");
16039 // In the #ifdef/#else code, we have in common:
16040 // - The vector of constants:
16046 // Create the splat vector for 0x4b000000.
16047 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16048 // Create the splat vector for 0x53000000.
16049 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16051 // Create the right shift.
16052 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16053 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16056 if (Subtarget.hasSSE41()) {
16057 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16058 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16059 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16060 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16061 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
16063 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16064 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16065 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16066 // (uint4) 0x53000000, 0xaa);
16067 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16068 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16069 // High will be bitcasted right away, so do not bother bitcasting back to
16070 // its original type.
16071 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16072 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16074 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16075 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16076 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16077 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16079 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16080 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16083 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
16084 SDValue VecCstFAdd = DAG.getConstantFP(
16085 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16087 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16088 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16089 // TODO: Are there any fast-math-flags to propagate here?
16091 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16092 // return (float4) lo + fhi;
16093 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16094 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16097 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16098 const X86Subtarget &Subtarget) {
16099 SDValue N0 = Op.getOperand(0);
16100 MVT SrcVT = N0.getSimpleValueType();
16103 if (SrcVT == MVT::v2i1) {
16104 // For v2i1, we need to widen to v4i1 first.
16105 assert(Op.getValueType() == MVT::v2f64 && "Unexpected type");
16106 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, N0,
16107 DAG.getUNDEF(MVT::v2i1));
16108 return DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v2f64,
16109 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0));
16112 switch (SrcVT.SimpleTy) {
16114 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16116 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16119 assert(!Subtarget.hasAVX512());
16120 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16124 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16125 SelectionDAG &DAG) const {
16126 SDValue N0 = Op.getOperand(0);
16128 auto PtrVT = getPointerTy(DAG.getDataLayout());
16130 if (Op.getSimpleValueType().isVector())
16131 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16133 MVT SrcVT = N0.getSimpleValueType();
16134 MVT DstVT = Op.getSimpleValueType();
16136 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16137 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16138 // Conversions from unsigned i32 to f32/f64 are legal,
16139 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16143 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16144 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16145 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16146 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16147 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16150 // Make a 64-bit buffer, and use it to build an FILD.
16151 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16152 if (SrcVT == MVT::i32) {
16153 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16154 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16155 StackSlot, MachinePointerInfo());
16156 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16157 OffsetSlot, MachinePointerInfo());
16158 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16162 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16163 SDValue ValueToStore = Op.getOperand(0);
16164 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16165 // Bitcasting to f64 here allows us to do a single 64-bit store from
16166 // an SSE register, avoiding the store forwarding penalty that would come
16167 // with two 32-bit stores.
16168 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16169 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16170 MachinePointerInfo());
16171 // For i64 source, we need to add the appropriate power of 2 if the input
16172 // was negative. This is the same as the optimization in
16173 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16174 // we must be careful to do the computation in x87 extended precision, not
16175 // in SSE. (The generic code can't know it's OK to do this, or how to.)
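// Sketch of the fixup: FILD reads the buffer as a *signed* i64, so an input
// with the sign bit set comes back as x - 2^64. The constant-pool value built
// below (0x5F800000 is 2^64 as an f32) is added back in exactly that case,
// selected on the sign test, restoring the unsigned value in x87 precision.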
16176 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16177 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16178 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16179 MachineMemOperand::MOLoad, 8, 8);
16181 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16182 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16183 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16186 APInt FF(32, 0x5F800000ULL);
16188 // Check whether the sign bit is set.
16189 SDValue SignSet = DAG.getSetCC(
16190 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16191 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16193 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16194 SDValue FudgePtr = DAG.getConstantPool(
16195 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16197 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16198 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16199 SDValue Four = DAG.getIntPtrConstant(4, dl);
16200 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16201 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16203 // Load the value out, extending it from f32 to f80.
16204 // FIXME: Avoid the extend by constructing the right constant pool?
16205 SDValue Fudge = DAG.getExtLoad(
16206 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16207 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16208 /* Alignment = */ 4);
16209 // Extend everything to 80 bits to force it to be done on x87.
16210 // TODO: Are there any fast-math-flags to propagate here?
16211 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16212 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16213 DAG.getIntPtrConstant(0, dl));
16216 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16217 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16218 // just return an <SDValue(), SDValue()> pair.
16219 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16220 // to i16, i32 or i64, and we lower it to a legal sequence.
16221 // If lowered to the final integer result we return a <result, SDValue()> pair.
16222 // Otherwise we lower it to a sequence ending with a FIST, return a
16223 // <FIST, StackSlot> pair, and the caller is responsible for loading
16224 // the final integer result from StackSlot.
16225 std::pair<SDValue,SDValue>
16226 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16227 bool IsSigned, bool IsReplace) const {
16230 EVT DstTy = Op.getValueType();
16231 EVT TheVT = Op.getOperand(0).getValueType();
16232 auto PtrVT = getPointerTy(DAG.getDataLayout());
16234 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16235 // f16 must be promoted before using the lowering in this routine.
16236 // fp128 does not use this lowering.
16237 return std::make_pair(SDValue(), SDValue());
16240 // If using FIST to compute an unsigned i64, we'll need some fixup
16241 // to handle values above the maximum signed i64. A FIST is always
16242 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16243 bool UnsignedFixup = !IsSigned &&
16244 DstTy == MVT::i64 &&
16245 (!Subtarget.is64Bit() ||
16246 !isScalarFPTypeInSSEReg(TheVT));
16248 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16249 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16250 // The low 32 bits of the fist result will have the correct uint32 result.
16251 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16255 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16256 DstTy.getSimpleVT() >= MVT::i16 &&
16257 "Unknown FP_TO_INT to lower!");
16259 // These are really Legal.
16260 if (DstTy == MVT::i32 &&
16261 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16262 return std::make_pair(SDValue(), SDValue());
16263 if (Subtarget.is64Bit() &&
16264 DstTy == MVT::i64 &&
16265 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16266 return std::make_pair(SDValue(), SDValue());
16268 // We lower FP->int64 into FISTP64 followed by a load from a temporary memory location.
16270 MachineFunction &MF = DAG.getMachineFunction();
16271 unsigned MemSize = DstTy.getSizeInBits()/8;
16272 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16273 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16276 switch (DstTy.getSimpleVT().SimpleTy) {
16277 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16278 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16279 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16280 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16283 SDValue Chain = DAG.getEntryNode();
16284 SDValue Value = Op.getOperand(0);
16285 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16287 if (UnsignedFixup) {
16289 // Conversion to unsigned i64 is implemented with a select,
16290 // depending on whether the source value fits in the range
16291 // of a signed i64. Let Thresh be the FP equivalent of
16292 // 0x8000000000000000ULL.
16294 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16295 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16296 // Fist-to-mem64 FistSrc
16297 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16298 // to XOR'ing the high 32 bits with Adjust.
16300 // Being a power of 2, Thresh is exactly representable in all FP formats.
16301 // For X87 we'd like to use the smallest FP type for this constant, but
16302 // for DAG type consistency we have to match the FP operand type.
16304 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16305 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16306 bool LosesInfo = false;
16307 if (TheVT == MVT::f64)
16308 // The rounding mode is irrelevant as the conversion should be exact.
16309 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16311 else if (TheVT == MVT::f80)
16312 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16313 APFloat::rmNearestTiesToEven, &LosesInfo);
16315 assert(Status == APFloat::opOK && !LosesInfo &&
16316 "FP conversion should have been exact");
16318 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16320 SDValue Cmp = DAG.getSetCC(DL,
16321 getSetCCResultType(DAG.getDataLayout(),
16322 *DAG.getContext(), TheVT),
16323 Value, ThreshVal, ISD::SETLT);
16324 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16325 DAG.getConstant(0, DL, MVT::i32),
16326 DAG.getConstant(0x80000000, DL, MVT::i32));
16327 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16328 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16329 *DAG.getContext(), TheVT),
16330 Value, ThreshVal, ISD::SETLT);
16331 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16334 // FIXME This causes a redundant load/store if the SSE-class value is already
16335 // in memory, such as if it is on the callstack.
16336 if (isScalarFPTypeInSSEReg(TheVT)) {
16337 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16338 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16339 MachinePointerInfo::getFixedStack(MF, SSFI));
16340 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16342 Chain, StackSlot, DAG.getValueType(TheVT)
16345 MachineMemOperand *MMO =
16346 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16347 MachineMemOperand::MOLoad, MemSize, MemSize);
16348 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16349 Chain = Value.getValue(1);
16350 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16351 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16354 MachineMemOperand *MMO =
16355 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16356 MachineMemOperand::MOStore, MemSize, MemSize);
16358 if (UnsignedFixup) {
16360 // Insert the FIST, load its result as two i32's,
16361 // and XOR the high i32 with Adjust.
16363 SDValue FistOps[] = { Chain, Value, StackSlot };
16364 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16365 FistOps, DstTy, MMO);
16368 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16369 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16372 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16373 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16375 if (Subtarget.is64Bit()) {
16376 // Join High32 and Low32 into a 64-bit result.
16377 // (High32 << 32) | Low32
16378 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16379 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16380 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16381 DAG.getConstant(32, DL, MVT::i8));
16382 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16383 return std::make_pair(Result, SDValue());
16386 SDValue ResultOps[] = { Low32, High32 };
16388 SDValue pair = IsReplace
16389 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16390 : DAG.getMergeValues(ResultOps, DL);
16391 return std::make_pair(pair, SDValue());
16393 // Build the FP_TO_INT*_IN_MEM
16394 SDValue Ops[] = { Chain, Value, StackSlot };
16395 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16397 return std::make_pair(FIST, StackSlot);
16401 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16402 const X86Subtarget &Subtarget) {
16403 MVT VT = Op->getSimpleValueType(0);
16404 SDValue In = Op->getOperand(0);
16405 MVT InVT = In.getSimpleValueType();
16408 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
16409 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16410 "Expected same number of elements");
16411 assert((VT.getVectorElementType() == MVT::i16 ||
16412 VT.getVectorElementType() == MVT::i32 ||
16413 VT.getVectorElementType() == MVT::i64) &&
16414 "Unexpected element type");
16415 assert((InVT.getVectorElementType() == MVT::i8 ||
16416 InVT.getVectorElementType() == MVT::i16 ||
16417 InVT.getVectorElementType() == MVT::i32) &&
16418 "Unexpected element type");
16420 if (Subtarget.hasInt256())
16421 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16423 // Optimize vectors in AVX mode:
16426 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16427 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16428 // Concat upper and lower parts.
16431 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16432 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16433 // Concat upper and lower parts.
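// For example (illustrative), a zero-extend of v8i16 to v8i32 without AVX2:
//   OpLo = vpunpcklwd In, Zero    // elements 0..3 widened to i32
//   OpHi = vpunpckhwd In, Zero    // elements 4..7 widened to i32
//   result = concat_vectors(OpLo, OpHi)
// Any-extends interleave with an undef vector instead of zero.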
16436 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16437 SDValue Undef = DAG.getUNDEF(InVT);
16438 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16439 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16440 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16442 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16443 VT.getVectorNumElements()/2);
16445 OpLo = DAG.getBitcast(HVT, OpLo);
16446 OpHi = DAG.getBitcast(HVT, OpHi);
16448 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16451 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
16452 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16453 const SDLoc &dl, SelectionDAG &DAG) {
16454 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16455 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16456 DAG.getIntPtrConstant(0, dl));
16457 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16458 DAG.getIntPtrConstant(8, dl));
16459 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16460 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16461 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16462 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16465 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16466 const X86Subtarget &Subtarget,
16467 SelectionDAG &DAG) {
16468 MVT VT = Op->getSimpleValueType(0);
16469 SDValue In = Op->getOperand(0);
16470 MVT InVT = In.getSimpleValueType();
16471 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16473 unsigned NumElts = VT.getVectorNumElements();
16475 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
16477 if (!Subtarget.hasBWI() &&
16478 (VT.getVectorElementType().getSizeInBits() <= 16)) {
16479 // If v16i32 is to be avoided, we'll need to split and concatenate.
16480 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16481 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16483 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16486 // Widen to 512-bits if VLX is not supported.
16487 MVT WideVT = ExtVT;
16488 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16489 NumElts *= 512 / ExtVT.getSizeInBits();
16490 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16491 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16492 In, DAG.getIntPtrConstant(0, DL));
16493 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16497 SDValue One = DAG.getConstant(1, DL, WideVT);
16498 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16500 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16502 // Truncate if we had to extend i16/i8 above.
16504 WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
16505 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16508 // Extract back to 128/256-bit if we widened.
16510 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16511 DAG.getIntPtrConstant(0, DL));
16513 return SelectedVal;
16516 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16517 SelectionDAG &DAG) {
16518 SDValue In = Op.getOperand(0);
16519 MVT SVT = In.getSimpleValueType();
16521 if (SVT.getVectorElementType() == MVT::i1)
16522 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16524 assert(Subtarget.hasFp256() && "Expected AVX support");
16525 return LowerAVXExtend(Op, DAG, Subtarget);
16528 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16529 /// It makes use of the fact that vectors with enough leading sign/zero bits
16530 /// prevent the PACKSS/PACKUS from saturating the results.
16531 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16532 /// within each 128-bit lane.
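/// For example (illustrative), truncating a 256-bit v16i16 source to v16i8:
/// split into two v8i16 halves and PACKSSWB/PACKUSWB them into one v16i8.
/// For 512-bit sources on AVX2, the per-128-bit-lane packing is fixed up
/// afterwards with a v4i64 {0, 2, 1, 3} shuffle.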
16533 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16534 const SDLoc &DL, SelectionDAG &DAG,
16535 const X86Subtarget &Subtarget) {
16536 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16537 "Unexpected PACK opcode");
16539 // Requires SSE2 but AVX512 has fast truncate.
16540 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
16543 EVT SrcVT = In.getValueType();
16545 // No truncation required, we might get here due to recursive calls.
16546 if (SrcVT == DstVT)
16549 // We only support vector truncation to 128bits or greater from a
16550 // 256bits or greater source.
16551 unsigned DstSizeInBits = DstVT.getSizeInBits();
16552 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16553 if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
16556 LLVMContext &Ctx = *DAG.getContext();
16557 unsigned NumElems = SrcVT.getVectorNumElements();
16558 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16559 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16561 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16563 // Extract lower/upper subvectors.
16564 unsigned NumSubElts = NumElems / 2;
16565 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16566 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16568 // Pack to the largest type possible:
16569 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16570 EVT InVT = MVT::i16, OutVT = MVT::i8;
16571 if (DstVT.getScalarSizeInBits() > 8 &&
16572 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
16577 unsigned SubSizeInBits = SrcSizeInBits / 2;
16578 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
16579 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
16581 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
16582 if (SrcVT.is256BitVector()) {
16583 Lo = DAG.getBitcast(InVT, Lo);
16584 Hi = DAG.getBitcast(InVT, Hi);
16585 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16586 return DAG.getBitcast(DstVT, Res);
16589 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
16590 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
16591 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
16592 Lo = DAG.getBitcast(InVT, Lo);
16593 Hi = DAG.getBitcast(InVT, Hi);
16594 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16596 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
16597 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
16598 Res = DAG.getBitcast(MVT::v4i64, Res);
16599 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
16601 if (DstVT.is256BitVector())
16602 return DAG.getBitcast(DstVT, Res);
16604 // For a 512-bit -> 128-bit truncate, pack another stage.
16605 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16606 Res = DAG.getBitcast(PackedVT, Res);
16607 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16610 // Recursively pack lower/upper subvectors, concat result and pack again.
16611 assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
16612 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
16613 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
16614 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
16616 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16617 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16618 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16621 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16622 const X86Subtarget &Subtarget) {
16625 MVT VT = Op.getSimpleValueType();
16626 SDValue In = Op.getOperand(0);
16627 MVT InVT = In.getSimpleValueType();
16629 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16631 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16632 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16633 if (InVT.getScalarSizeInBits() <= 16) {
16634 if (Subtarget.hasBWI()) {
16635 // legal, will go to VPMOVB2M, VPMOVW2M
16636 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16637 // We need to shift to get the lsb into sign position.
16639 // Shifting packed bytes is not supported natively, so bitcast to words.
16639 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16640 In = DAG.getNode(ISD::SHL, DL, ExtVT,
16641 DAG.getBitcast(ExtVT, In),
16642 DAG.getConstant(ShiftInx, DL, ExtVT));
16643 In = DAG.getBitcast(InVT, In);
16645 return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
16646 In, DAG.getConstant(6, DL, MVT::i8));
16648 // Use TESTD/Q by extending the vector to packed dwords/qwords.
16649 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16650 "Unexpected vector type.");
16651 unsigned NumElts = InVT.getVectorNumElements();
16652 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
16653 // We need to change to a wider element type that we have support for.
16654 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
16655 // For 16 element vectors we extend to v16i32 unless we are explicitly
16656 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
16657 // we need to split into two 8 element vectors which we can extend to v8i32,
16658 // truncate and concat the results. There's an additional complication if
16659 // the original type is v16i8. In that case we can't split the v16i8 so
16660 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
16661 // to v8i32, truncate that to v8i1 and concat the two halves.
16662 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
16663 if (InVT == MVT::v16i8) {
16664 // First we need to sign extend up to 256-bits so we can split that.
16665 InVT = MVT::v16i16;
16666 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
16668 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
16669 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
16670 // We're split now, just emit two truncates and a concat. The two
16671 // truncates will trigger legalization to come back to this function.
16672 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
16673 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
16674 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
16676 // We either have 8 elements or we're allowed to use 512-bit vectors.
16677 // If we have VLX, we want to use the narrowest vector that can get the
16678 // job done so we use vXi32.
16679 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
16680 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
16681 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16683 ShiftInx = InVT.getScalarSizeInBits() - 1;
16686 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16687 // We need to shift to get the lsb into sign position.
16688 In = DAG.getNode(ISD::SHL, DL, InVT, In,
16689 DAG.getConstant(ShiftInx, DL, InVT));
16691 return DAG.getNode(X86ISD::CMPM, DL, VT, In,
16692 getZeroVector(InVT, Subtarget, DAG, DL),
16693 DAG.getConstant(4, DL, MVT::i8));
16696 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16698 MVT VT = Op.getSimpleValueType();
16699 SDValue In = Op.getOperand(0);
16700 MVT InVT = In.getSimpleValueType();
16701 unsigned InNumEltBits = InVT.getScalarSizeInBits();
16703 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16704 "Invalid TRUNCATE operation");
16706 if (VT.getVectorElementType() == MVT::i1)
16707 return LowerTruncateVecI1(Op, DAG, Subtarget);
16709 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16710 if (Subtarget.hasAVX512()) {
16711 // word to byte only under BWI
16712 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
16713 // Make sure we're allowed to promote 512-bits.
16714 if (Subtarget.canExtendTo512DQ())
16715 return DAG.getNode(ISD::TRUNCATE, DL, VT,
16716 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In,
16719 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
16723 // Truncate with PACKSS if we are truncating a vector with sign-bits that
16724 // extend all the way to the packed/truncated value.
16725 unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
16726 if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
16728 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
16731 // Truncate with PACKUS if we are truncating a vector with leading zero bits
16732 // that extend all the way to the packed/truncated value.
16733 // Pre-SSE41 we can only use PACKUSWB.
16735 DAG.computeKnownBits(In, Known);
16736 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
16737 if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
16739 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
16742 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16743 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16744 if (Subtarget.hasInt256()) {
16745 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16746 In = DAG.getBitcast(MVT::v8i32, In);
16747 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16748 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16749 DAG.getIntPtrConstant(0, DL));
16752 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16753 DAG.getIntPtrConstant(0, DL));
16754 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16755 DAG.getIntPtrConstant(2, DL));
16756 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16757 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16758 static const int ShufMask[] = {0, 2, 4, 6};
16759 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16762 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16763 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16764 if (Subtarget.hasInt256()) {
16765 In = DAG.getBitcast(MVT::v32i8, In);
16767 // The PSHUFB mask:
16768 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
16769 -1, -1, -1, -1, -1, -1, -1, -1,
16770 16, 17, 20, 21, 24, 25, 28, 29,
16771 -1, -1, -1, -1, -1, -1, -1, -1 };
16772 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16773 In = DAG.getBitcast(MVT::v4i64, In);
16775 static const int ShufMask2[] = {0, 2, -1, -1};
16776 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
16777 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16778 DAG.getIntPtrConstant(0, DL));
16779 return DAG.getBitcast(VT, In);
16782 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16783 DAG.getIntPtrConstant(0, DL));
16785 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16786 DAG.getIntPtrConstant(4, DL));
16788 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16789 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16791 // The PSHUFB mask:
16792 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
16793 -1, -1, -1, -1, -1, -1, -1, -1};
16795 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16796 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16798 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16799 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16801 // The MOVLHPS Mask:
16802 static const int ShufMask2[] = {0, 1, 4, 5};
16803 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16804 return DAG.getBitcast(MVT::v8i16, res);
16807 // Handle truncation of V256 to V128 using shuffles.
16808 if (!VT.is128BitVector() || !InVT.is256BitVector())
16811 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16813 unsigned NumElems = VT.getVectorNumElements();
16814 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16816 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16817 // Prepare truncation shuffle mask
16818 for (unsigned i = 0; i != NumElems; ++i)
16819 MaskVec[i] = i * 2;
16820 In = DAG.getBitcast(NVT, In);
16821 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16822 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16823 DAG.getIntPtrConstant(0, DL));
16826 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16827 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16828 MVT VT = Op.getSimpleValueType();
16830 if (VT.isVector()) {
16831 SDValue Src = Op.getOperand(0);
16834 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
16835 MVT ResVT = MVT::v4i32;
16836 MVT TruncVT = MVT::v4i1;
16837 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
16838 if (!IsSigned && !Subtarget.hasVLX()) {
16839 // Widen to 512-bits.
16840 ResVT = MVT::v8i32;
16841 TruncVT = MVT::v8i1;
16842 Opc = ISD::FP_TO_UINT;
16843 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
16844 DAG.getUNDEF(MVT::v8f64),
16845 Src, DAG.getIntPtrConstant(0, dl));
16847 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
16848 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
16849 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
16850 DAG.getIntPtrConstant(0, dl));
16853 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16854 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16855 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16856 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16857 DAG.getUNDEF(MVT::v2f32)));
16863 assert(!VT.isVector());
16865 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16866 IsSigned, /*IsReplace=*/ false);
16867 SDValue FIST = Vals.first, StackSlot = Vals.second;
16868 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16869 if (!FIST.getNode())
16872 if (StackSlot.getNode())
16873 // Load the result.
16874 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16876 // The node is the result.
16880 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16882 MVT VT = Op.getSimpleValueType();
16883 SDValue In = Op.getOperand(0);
16884 MVT SVT = In.getSimpleValueType();
16886 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16888 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16889 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16890 In, DAG.getUNDEF(SVT)));
16893 /// The only differences between FABS and FNEG are the mask and the logic op.
16894 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16895 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16896 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16897 "Wrong opcode for lowering FABS or FNEG.");
16899 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16901 // If this is an FABS and it has an FNEG user, bail out to fold the combination
16902 // into an FNABS. We'll lower the FABS after that if it is still in use.
16904 for (SDNode *User : Op->uses())
16905 if (User->getOpcode() == ISD::FNEG)
16909 MVT VT = Op.getSimpleValueType();
16911 bool IsF128 = (VT == MVT::f128);
16913 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16914 // decide if we should generate a 16-byte constant mask when we only need 4 or
16915 // 8 bytes for the scalar case.
16920 if (VT.isVector()) {
16922 EltVT = VT.getVectorElementType();
16923 } else if (IsF128) {
16924 // SSE instructions are used for optimized f128 logical operations.
16925 LogicVT = MVT::f128;
16928 // There are no scalar bitwise logical SSE/AVX instructions, so we
16929 // generate a 16-byte vector constant and logic op even for the scalar case.
16930 // Using a 16-byte mask allows folding the load of the mask with
16931 // the logic op, which can save ~4 bytes of code size.
16932 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16936 unsigned EltBits = EltVT.getSizeInBits();
16937 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16939 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16940 const fltSemantics &Sem =
16941 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16942 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16943 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
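// For instance, with f32 elements the FABS mask is 0x7FFFFFFF (AND clears the
// sign bit), the FNEG mask is 0x80000000 (XOR flips it), and the FNABS case
// handled below ORs the sign bit in instead.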
16945 SDValue Op0 = Op.getOperand(0);
16946 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16948 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16949 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16951 if (VT.isVector() || IsF128)
16952 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16954 // For the scalar case extend to a 128-bit vector, perform the logic op,
16955 // and extract the scalar result back out.
16956 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16957 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16958 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16959 DAG.getIntPtrConstant(0, dl));
16962 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16963 SDValue Mag = Op.getOperand(0);
16964 SDValue Sign = Op.getOperand(1);
16967 // If the sign operand is smaller, extend it first.
16968 MVT VT = Op.getSimpleValueType();
16969 if (Sign.getSimpleValueType().bitsLT(VT))
16970 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16972 // And if it is bigger, shrink it first.
16973 if (Sign.getSimpleValueType().bitsGT(VT))
16974 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16976 // At this point the operands and the result should have the same
16977 // type, and that won't be f80 since that is not custom lowered.
16978 bool IsF128 = (VT == MVT::f128);
16979 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16980 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16981 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16982 "Unexpected type in LowerFCOPYSIGN");
16984 MVT EltVT = VT.getScalarType();
16985 const fltSemantics &Sem =
16986 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16987 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16989 // Perform all scalar logic operations as 16-byte vectors because there are no
16990 // scalar FP logic instructions in SSE.
16991 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16992 // unnecessary splats, but we might miss load folding opportunities. Should
16993 // this decision be based on OptimizeForSize?
16994 bool IsFakeVector = !VT.isVector() && !IsF128;
16997 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16999 // The mask constants are automatically splatted for vector types.
17000 unsigned EltSizeInBits = VT.getScalarSizeInBits();
17001 SDValue SignMask = DAG.getConstantFP(
17002 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17003 SDValue MagMask = DAG.getConstantFP(
17004 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
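// E.g. for f32, SignMask is 0x80000000 and MagMask is 0x7FFFFFFF, so the
// result built below is (Mag & 0x7FFFFFFF) | (Sign & 0x80000000), with the
// constants splatted across all lanes for vector types.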
17006 // First, clear all bits but the sign bit from the second operand (sign).
17008 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17009 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17011 // Next, clear the sign bit from the first operand (magnitude).
17012 // TODO: If we had general constant folding for FP logic ops, this check
17013 // wouldn't be necessary.
17015 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
17016 APFloat APF = Op0CN->getValueAPF();
17018 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
17020 // If the magnitude operand wasn't a constant, we need to AND out the sign.
17022 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
17023 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
17026 // OR the magnitude value with the sign bit.
17027 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
17028 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
17029 DAG.getIntPtrConstant(0, dl));
17032 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
17033 SDValue N0 = Op.getOperand(0);
17035 MVT VT = Op.getSimpleValueType();
17037 MVT OpVT = N0.getSimpleValueType();
17038 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17039 "Unexpected type for FGETSIGN");
17041 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17042 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17043 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17044 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
17045 Res = DAG.getZExtOrTrunc(Res, dl, VT);
17046 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
17050 /// Helper for creating a X86ISD::SETCC node.
17051 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17052 SelectionDAG &DAG) {
17053 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17054 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17057 // Check whether an OR'd tree is PTEST-able.
17058 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17059 const X86Subtarget &Subtarget,
17060 SelectionDAG &DAG) {
17061 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
17063 if (!Subtarget.hasSSE41())
17066 if (!Op->hasOneUse())
17069 SDNode *N = Op.getNode();
17072 SmallVector<SDValue, 8> Opnds;
17073 DenseMap<SDValue, unsigned> VecInMap;
17074 SmallVector<SDValue, 8> VecIns;
17075 EVT VT = MVT::Other;
17077 // Recognize a special case where a vector is cast into a wide integer to
17078 // test all 0s.
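// An illustrative input shape (one of many this handles): with V a v4i32
// value,
//   (or (or (extractelt V, 0), (extractelt V, 1)),
//       (or (extractelt V, 2), (extractelt V, 3)))
// compared eq/ne against zero becomes a single PTEST of V against itself once
// every element has been seen.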
17079 Opnds.push_back(N->getOperand(0));
17080 Opnds.push_back(N->getOperand(1));
17082 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17083 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17084 // BFS traverse all OR'd operands.
17085 if (I->getOpcode() == ISD::OR) {
17086 Opnds.push_back(I->getOperand(0));
17087 Opnds.push_back(I->getOperand(1));
17088 // Re-evaluate the number of nodes to be traversed.
17089 e += 2; // 2 more nodes (LHS and RHS) are pushed.
17093 // Quit if this is not an EXTRACT_VECTOR_ELT.
17094 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17097 // Quit if the index is not a constant.
17098 SDValue Idx = I->getOperand(1);
17099 if (!isa<ConstantSDNode>(Idx))
17102 SDValue ExtractedFromVec = I->getOperand(0);
17103 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17104 if (M == VecInMap.end()) {
17105 VT = ExtractedFromVec.getValueType();
17106 // Quit if not 128/256-bit vector.
17107 if (!VT.is128BitVector() && !VT.is256BitVector())
17109 // Quit if not the same type.
17110 if (VecInMap.begin() != VecInMap.end() &&
17111 VT != VecInMap.begin()->first.getValueType())
17113 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17114 VecIns.push_back(ExtractedFromVec);
17116 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17119 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17120 "Not extracted from 128-/256-bit vector.");
17122 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17124 for (DenseMap<SDValue, unsigned>::const_iterator
17125 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17126 // Quit if not all elements are used.
17127 if (I->second != FullMask)
17131 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17133 // Cast all vectors into TestVT for PTEST.
17134 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17135 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17137 // If more than one full vector is evaluated, OR them first before PTEST.
17138 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17139 // Each iteration will OR 2 nodes and append the result until there is only
17140 // 1 node left, i.e. the final OR'd value of all vectors.
17141 SDValue LHS = VecIns[Slot];
17142 SDValue RHS = VecIns[Slot + 1];
17143 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
17146 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17147 VecIns.back(), VecIns.back());
17148 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17151 /// \brief return true if \c Op has a use that doesn't just read flags.
17152 static bool hasNonFlagsUse(SDValue Op) {
17153 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17155 SDNode *User = *UI;
17156 unsigned UOpNo = UI.getOperandNo();
17157 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17158 // Look past the truncate.
17159 UOpNo = User->use_begin().getOperandNo();
17160 User = *User->use_begin();
17163 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17164 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17170 /// Emit nodes that will be selected as "test Op0,Op0", or something
17171 /// equivalent.
17172 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17173 SelectionDAG &DAG) const {
17174 // CF and OF aren't always set the way we want. Determine which
17175 // of these we need.
17176 bool NeedCF = false;
17177 bool NeedOF = false;
17180 case X86::COND_A: case X86::COND_AE:
17181 case X86::COND_B: case X86::COND_BE:
17184 case X86::COND_G: case X86::COND_GE:
17185 case X86::COND_L: case X86::COND_LE:
17186 case X86::COND_O: case X86::COND_NO: {
17187 // Check if we really need to set the
17188 // Overflow flag. If NoSignedWrap is present,
17189 // it is not actually needed.
17190 switch (Op->getOpcode()) {
17195 if (Op.getNode()->getFlags().hasNoSignedWrap())
17205 // See if we can use the EFLAGS value from the operand instead of
17206 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17207 // we prove that the arithmetic won't overflow, we can't use OF or CF.
17208 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17209 // Emit a CMP with 0, which is the TEST pattern.
17210 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17211 DAG.getConstant(0, dl, Op.getValueType()));
17213 unsigned Opcode = 0;
17214 unsigned NumOperands = 0;
17216 // Truncate operations may prevent the merge of the SETCC instruction
17217 // and the arithmetic instruction before it. Attempt to truncate the operands
17218 // of the arithmetic instruction and use a reduced bit-width instruction.
17219 bool NeedTruncation = false;
17220 SDValue ArithOp = Op;
17221 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17222 SDValue Arith = Op->getOperand(0);
17223 // Both the trunc and the arithmetic op need to have one user each.
17224 if (Arith->hasOneUse())
17225 switch (Arith.getOpcode()) {
17232 NeedTruncation = true;
17238 // Sometimes flags can be set either with an AND or with an SRL/SHL
17239 // instruction. The SRL/SHL variant should be preferred for masks longer than this
17240 // number of bits.
17241 const int ShiftToAndMaxMaskWidth = 32;
17242 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17244 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
17245 // which may be the result of a CAST. We use the variable 'Op', the
17246 // non-cast value, when we check for possible users.
17247 switch (ArithOp.getOpcode()) {
17249 // We only want to rewrite this as a target-specific node with attached
17250 // flags if there is a reasonable chance of either using that to do custom
17251 // instruction selection that can fold some of the memory operands, or if
17252 // only the flags are used. If there are other uses, leave the node alone
17253 // and emit a test instruction.
17254 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17255 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17256 if (UI->getOpcode() != ISD::CopyToReg &&
17257 UI->getOpcode() != ISD::SETCC &&
17258 UI->getOpcode() != ISD::STORE)
17261 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17262 // An add of one will be selected as an INC.
17264 (!Subtarget.slowIncDec() ||
17265 DAG.getMachineFunction().getFunction().optForSize())) {
17266 Opcode = X86ISD::INC;
17271 // An add of negative one (subtract of one) will be selected as a DEC.
17272 if (C->isAllOnesValue() &&
17273 (!Subtarget.slowIncDec() ||
17274 DAG.getMachineFunction().getFunction().optForSize())) {
17275 Opcode = X86ISD::DEC;
17281 // Otherwise use a regular EFLAGS-setting add.
17282 Opcode = X86ISD::ADD;
17287 // If we have a constant logical shift that's only used in a comparison
17288 // against zero turn it into an equivalent AND. This allows turning it into
17289 // a TEST instruction later.
17290 if (ZeroCheck && Op->hasOneUse() &&
17291 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17292 EVT VT = Op.getValueType();
17293 unsigned BitWidth = VT.getSizeInBits();
17294 unsigned ShAmt = Op->getConstantOperandVal(1);
17295 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17297 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17298 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17299 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17300 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17302 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17303 DAG.getConstant(Mask, dl, VT));
17308 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17309 // because a TEST instruction will be better. However, AND should be
17310 // preferred if the instruction can be combined into ANDN.
17311 if (!hasNonFlagsUse(Op)) {
17312 SDValue Op0 = ArithOp->getOperand(0);
17313 SDValue Op1 = ArithOp->getOperand(1);
17314 EVT VT = ArithOp.getValueType();
17315 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17316 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17317 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17319 // If we cannot select an ANDN instruction, check if we can replace
17320 // AND+IMM64 with a shift before giving up. This is possible for masks
17321 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
17322 if (!isProperAndn) {
17326 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17327 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17331 const APInt &Mask = CN->getAPIntValue();
17332 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17333 break; // Prefer TEST instruction.
17335 unsigned BitWidth = Mask.getBitWidth();
17336 unsigned LeadingOnes = Mask.countLeadingOnes();
17337 unsigned TrailingZeros = Mask.countTrailingZeros();
17339 if (LeadingOnes + TrailingZeros == BitWidth) {
17340 assert(TrailingZeros < VT.getSizeInBits() &&
17341 "Shift amount should be less than the type width");
17342 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17343 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17344 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17348 unsigned LeadingZeros = Mask.countLeadingZeros();
17349 unsigned TrailingOnes = Mask.countTrailingOnes();
17351 if (LeadingZeros + TrailingOnes == BitWidth) {
17352 assert(LeadingZeros < VT.getSizeInBits() &&
17353 "Shift amount should be less than the type width");
17354 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17355 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17356 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17367 // Similar to ISD::ADD above, check if the uses will preclude useful
17368 // lowering of the target-specific node.
17369 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17370 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17371 if (UI->getOpcode() != ISD::CopyToReg &&
17372 UI->getOpcode() != ISD::SETCC &&
17373 UI->getOpcode() != ISD::STORE)
17376 // Otherwise use a regular EFLAGS-setting instruction.
17377 switch (ArithOp.getOpcode()) {
17378 default: llvm_unreachable("unexpected operator!");
17379 case ISD::SUB: Opcode = X86ISD::SUB; break;
17380 case ISD::XOR: Opcode = X86ISD::XOR; break;
17381 case ISD::AND: Opcode = X86ISD::AND; break;
17382 case ISD::OR: Opcode = X86ISD::OR; break;
17394 return SDValue(Op.getNode(), 1);
17400 // If we found that truncation is beneficial, perform the truncation and
17401 // update 'Op'.
17402 if (NeedTruncation) {
17403 EVT VT = Op.getValueType();
17404 SDValue WideVal = Op->getOperand(0);
17405 EVT WideVT = WideVal.getValueType();
17406 unsigned ConvertedOp = 0;
17407 // Use a target machine opcode to prevent further DAGCombine
17408 // optimizations that may separate the arithmetic operations
17409 // from the setcc node.
17410 switch (WideVal.getOpcode()) {
17412 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17413 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17414 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17415 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17416 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17420 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17421 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17422 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17423 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17424 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17425 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17431 // Emit a CMP with 0, which is the TEST pattern.
17432 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17433 DAG.getConstant(0, dl, Op.getValueType()));
17435 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17436 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17438 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17439 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17440 return SDValue(New.getNode(), 1);
17443 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
17445 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17446 const SDLoc &dl, SelectionDAG &DAG) const {
17447 if (isNullConstant(Op1))
17448 return EmitTest(Op0, X86CC, dl, DAG);
17450 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17451 "Unexpected comparison operation for MVT::i1 operands");
17453 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17454 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17455 // Only promote the compare up to i32 if it is a 16-bit operation
17456 // with an immediate. 16-bit immediates are to be avoided because they can
// cause length-changing-prefix stalls on some cores.
17457 if ((Op0.getValueType() == MVT::i16 &&
17458 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17459 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17460 !Subtarget.isAtom()) {
17461 unsigned ExtendOp =
17462 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17463 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17464 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17466 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17467 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17468 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17469 return SDValue(Sub.getNode(), 1);
17471 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17474 /// Convert a comparison if required by the subtarget.
17475 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17476 SelectionDAG &DAG) const {
17477 // If the subtarget does not support the FUCOMI instruction, floating-point
17478 // comparisons have to be converted.
17479 if (Subtarget.hasCMov() ||
17480 Cmp.getOpcode() != X86ISD::CMP ||
17481 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17482 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17485 // The instruction selector will select an FUCOM instruction instead of
17486 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17487 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17488 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
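// In machine code this is roughly: fucom(p) ; fnstsw %ax ; sahf, where SAHF
// copies AH, which holds the high byte of the stored FPU status word, into
// the low byte of EFLAGS.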
17490 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17491 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17492 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17493 DAG.getConstant(8, dl, MVT::i8));
17494 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17496 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17497 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17498 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17501 /// Check if replacement of SQRT with RSQRT should be disabled.
17502 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17503 EVT VT = Op.getValueType();
17505 // We never want to use both SQRT and RSQRT instructions for the same input.
17506 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17510 return Subtarget.hasFastVectorFSQRT();
17511 return Subtarget.hasFastScalarFSQRT();
17514 /// The minimum architected relative accuracy is 2^-12. We need one
17515 /// Newton-Raphson step to have a good float result (24 bits of precision).
17516 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17517 SelectionDAG &DAG, int Enabled,
17518 int &RefinementSteps,
17519 bool &UseOneConstNR,
17520 bool Reciprocal) const {
17521 EVT VT = Op.getValueType();
17523 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17524 // TODO: Add support for AVX512 (v16f32).
17525 // It is likely not profitable to do this for f64 because a double-precision
17526 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17527 // instructions: convert to single, rsqrtss, convert back to double, refine
17528 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17529 // along with FMA, this could be a throughput win.
17530 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17531 // after legalize types.
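// For reference, the refinement requested via RefinementSteps is the standard
// Newton-Raphson step for 1/sqrt(a):
//   Est' = Est * (1.5 - 0.5 * a * Est * Est)
// which roughly doubles the ~12 bits of the hardware estimate toward the
// 24 bits of an f32 significand.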
17532 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17533 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17534 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17535 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17536 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17537 RefinementSteps = 1;
17539 UseOneConstNR = false;
17540 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
17545 /// The minimum architected relative accuracy is 2^-12. We need one
17546 /// Newton-Raphson step to have a good float result (24 bits of precision).
17547 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17549 int &RefinementSteps) const {
17550 EVT VT = Op.getValueType();
17552 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17553 // TODO: Add support for AVX512 (v16f32).
17554 // It is likely not profitable to do this for f64 because a double-precision
17555 // reciprocal estimate with refinement on x86 prior to FMA requires
17556 // 15 instructions: convert to single, rcpss, convert back to double, refine
17557 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17558 // along with FMA, this could be a throughput win.
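// For reference, each requested refinement step is the standard Newton-Raphson
// iteration for 1/a:
//   Est' = Est * (2.0 - a * Est)
// which is equivalent to Est + Est * (1.0 - a * Est) and maps well onto FMA.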
17560 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17561 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17562 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17563 // Enable estimate codegen with 1 refinement step for vector division.
17564 // Scalar division estimates are disabled because they break too much
17565 // real-world code. These defaults are intended to match GCC behavior.
17566 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
17569 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17570 RefinementSteps = 1;
17572 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
17577 /// If we have at least two divisions that use the same divisor, convert to
17578 /// multiplication by a reciprocal. This may need to be adjusted for a given
17579 /// CPU if a division's cost is not at least twice the cost of a multiplication.
17580 /// This is because we still need one division to calculate the reciprocal and
17581 /// then we need two multiplies by that reciprocal as replacements for the
17582 /// original divisions.
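/// For example, x/d + y/d can be rewritten as r = 1.0/d; x*r + y*r, trading
/// two divides for one divide and two multiplies.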
17583 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
17587 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17588 /// according to equal/not-equal condition code \p CC.
17589 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17590 const SDLoc &dl, SelectionDAG &DAG) {
17591 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17592 // instruction. Since the shift amount is in-range-or-undefined, we know
17593 // that doing a bittest on the i32 value is ok. We extend to i32 because
17594 // the encoding for the i16 version is larger than the i32 version.
17595 // Also promote i16 to i32 for performance / code size reasons.
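// For example, testing bit N of X becomes (X86bt X, N) followed by a SETAE
// (for SETEQ) or SETB (for SETNE) of the carry flag produced by BT, as built
// at the end of this function.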
17596 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17597 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17599 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17600 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17601 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17602 // known to be zero.
17603 if (Src.getValueType() == MVT::i64 &&
17604 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17605 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17607 // If the operand types disagree, extend the shift amount to match. Since
17608 // BT ignores high bits (like shifts) we can use anyextend.
17609 if (Src.getValueType() != BitNo.getValueType())
17610 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17612 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17613 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17614 return getSETCC(Cond, BT, dl , DAG);
17617 /// Result of 'and' is compared against zero. Change to a BT node if possible.
17618 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17619 const SDLoc &dl, SelectionDAG &DAG) {
17620 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17621 SDValue Op0 = And.getOperand(0);
17622 SDValue Op1 = And.getOperand(1);
17623 if (Op0.getOpcode() == ISD::TRUNCATE)
17624 Op0 = Op0.getOperand(0);
17625 if (Op1.getOpcode() == ISD::TRUNCATE)
17626 Op1 = Op1.getOperand(0);
17629 if (Op1.getOpcode() == ISD::SHL)
17630 std::swap(Op0, Op1);
17631 if (Op0.getOpcode() == ISD::SHL) {
17632 if (isOneConstant(Op0.getOperand(0))) {
17633 // If we looked past a truncate, check that it's only truncating away
17634 // known zeros.
17635 unsigned BitWidth = Op0.getValueSizeInBits();
17636 unsigned AndBitWidth = And.getValueSizeInBits();
17637 if (BitWidth > AndBitWidth) {
17639 DAG.computeKnownBits(Op0, Known);
17640 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17644 RHS = Op0.getOperand(1);
17646 } else if (Op1.getOpcode() == ISD::Constant) {
17647 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17648 uint64_t AndRHSVal = AndRHS->getZExtValue();
17649 SDValue AndLHS = Op0;
17651 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17652 LHS = AndLHS.getOperand(0);
17653 RHS = AndLHS.getOperand(1);
17656 // Use BT if the immediate can't be encoded in a TEST instruction.
17657 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17659 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17664 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17669 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17670 /// CC.
17671 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17676 // SSE Condition code mapping:
17677 //  0 - EQ
17678 //  1 - LT
17679 //  2 - LE
17680 //  3 - UNORD
17681 //  4 - NEQ
17682 //  5 - NLT
17683 //  6 - NLE
17684 //  7 - ORD
17685 switch (SetCCOpcode) {
17686 default: llvm_unreachable("Unexpected SETCC condition");
17688 case ISD::SETEQ: SSECC = 0; break;
17690 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17692 case ISD::SETOLT: SSECC = 1; break;
17694 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17696 case ISD::SETOLE: SSECC = 2; break;
17697 case ISD::SETUO: SSECC = 3; break;
17699 case ISD::SETNE: SSECC = 4; break;
17700 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17701 case ISD::SETUGE: SSECC = 5; break;
17702 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17703 case ISD::SETUGT: SSECC = 6; break;
17704 case ISD::SETO: SSECC = 7; break;
17705 case ISD::SETUEQ: SSECC = 8; break;
17706 case ISD::SETONE: SSECC = 12; break;
17709 std::swap(Op0, Op1);
17714 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17715 /// concatenate the result back.
17716 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17717 MVT VT = Op.getSimpleValueType();
17719 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17720 "Unsupported value type for operation");
17722 unsigned NumElems = VT.getVectorNumElements();
17724 SDValue CC = Op.getOperand(2);
17726 // Extract the LHS vectors
17727 SDValue LHS = Op.getOperand(0);
17728 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17729 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17731 // Extract the RHS vectors
17732 SDValue RHS = Op.getOperand(1);
17733 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17734 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17736 // Issue the operation on the smaller types and concatenate the result back
17737 MVT EltVT = VT.getVectorElementType();
17738 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17739 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17740 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17741 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17744 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17745 SDValue Op0 = Op.getOperand(0);
17746 SDValue Op1 = Op.getOperand(1);
17747 SDValue CC = Op.getOperand(2);
17748 MVT VT = Op.getSimpleValueType();
17751 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17752 "Unexpected type for boolean compare operation");
17753 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17754 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17755 DAG.getConstant(-1, dl, VT));
17756 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17757 DAG.getConstant(-1, dl, VT));
17758 switch (SetCCOpcode) {
17759 default: llvm_unreachable("Unexpected SETCC condition");
17761 // (x == y) -> ~(x ^ y)
17762 return DAG.getNode(ISD::XOR, dl, VT,
17763 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17764 DAG.getConstant(-1, dl, VT));
17766 // (x != y) -> (x ^ y)
17767 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17770 // (x > y) -> (x & ~y)
17771 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17774 // (x < y) -> (~x & y)
17775 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17778 // (x <= y) -> (~x | y)
17779 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17782 // (x >= y) -> (x | ~y)
17783 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17787 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17789 SDValue Op0 = Op.getOperand(0);
17790 SDValue Op1 = Op.getOperand(1);
17791 SDValue CC = Op.getOperand(2);
17792 MVT VT = Op.getSimpleValueType();
17795 assert(VT.getVectorElementType() == MVT::i1 &&
17796 "Cannot set masked compare for this operation");
17798 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17801 switch (SetCCOpcode) {
17802 default: llvm_unreachable("Unexpected SETCC condition");
17803 case ISD::SETNE: SSECC = 4; break;
17804 case ISD::SETEQ: SSECC = 0; break;
17805 case ISD::SETULT: SSECC = 1; break;
17806 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17808 case ISD::SETGT: SSECC = 6; break;
17809 case ISD::SETUGE: SSECC = 5; break;
17810 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17812 case ISD::SETLE: SSECC = 2; break;
17815 std::swap(Op0, Op1);
17817 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU
17819 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17820 DAG.getConstant(SSECC, dl, MVT::i8));
17823 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17824 /// operand \p Op1. If non-trivial (for example because it's not constant)
17825 /// return an empty value.
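/// For example, (setult X, <7,7,7,7>) can be checked as (setule X, <6,6,6,6>).
/// Elements equal to 0 would wrap when decremented, which is what the
/// underflow check below guards against.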
17826 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17827 SelectionDAG &DAG) {
17828 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17832 MVT VT = Op1.getSimpleValueType();
17833 MVT EVT = VT.getVectorElementType();
17834 unsigned n = VT.getVectorNumElements();
17835 SmallVector<SDValue, 8> ULTOp1;
17837 for (unsigned i = 0; i < n; ++i) {
17838 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17839 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17842 // Avoid underflow.
17843 APInt Val = Elt->getAPIntValue();
17847 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17850 return DAG.getBuildVector(VT, dl, ULTOp1);
17853 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17854 SelectionDAG &DAG) {
17855 SDValue Op0 = Op.getOperand(0);
17856 SDValue Op1 = Op.getOperand(1);
17857 SDValue CC = Op.getOperand(2);
17858 MVT VT = Op.getSimpleValueType();
17859 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17860 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17865 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17866 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17870 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17871 assert(VT.getVectorNumElements() <= 16);
17872 Opc = X86ISD::CMPM;
17874 Opc = X86ISD::CMPP;
17875 // The SSE/AVX packed FP comparison nodes are defined with a
17876 // floating-point vector result that matches the operand type. This allows
17877 // them to work with an SSE1 target (integer vector types are not legal).
17878 VT = Op0.getSimpleValueType();
17881 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17882 // emit two comparisons and a logic op to tie them together.
17884 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17885 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17886 // LLVM predicate is SETUEQ or SETONE.
17888 unsigned CombineOpc;
17889 if (Cond == ISD::SETUEQ) {
17892 CombineOpc = X86ISD::FOR;
17894 assert(Cond == ISD::SETONE);
17897 CombineOpc = X86ISD::FAND;
17900 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17901 DAG.getConstant(CC0, dl, MVT::i8));
17902 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17903 DAG.getConstant(CC1, dl, MVT::i8));
17904 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17906 // Handle all other FP comparisons here.
17907 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17908 DAG.getConstant(SSECC, dl, MVT::i8));
17911 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17912 // result type of SETCC. The bitcast is expected to be optimized away
17913 // during combining/isel.
17914 if (Opc == X86ISD::CMPP)
17915 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17920 MVT VTOp0 = Op0.getSimpleValueType();
17921 assert(VTOp0 == Op1.getSimpleValueType() &&
17922 "Expected operands with same type!");
17923 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17924 "Invalid number of packed elements for source and destination!");
17926 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17927 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17928 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17929 // legalizer first checks whether the first operand of the setcc has
17930 // a legal type. If so, then it promotes the return type to that same type.
17931 // Otherwise, the return type is promoted to the 'next legal type' which,
17932 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17934 // We reach this code only if the following two conditions are met:
17935 // 1. Both return type and operand type have been promoted to wider types
17936 // by the type legalizer.
17937 // 2. The original operand type has been promoted to a 256-bit vector.
17939 // Note that condition 2. only applies for AVX targets.
17940 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17941 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17944 // The non-AVX512 code below works under the assumption that source and
17945 // destination types are the same.
17946 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17947 "Value types for source and destination must be the same!");
17949 // Break 256-bit integer vector compare into smaller ones.
17950 if (VT.is256BitVector() && !Subtarget.hasInt256())
17951 return Lower256IntVSETCC(Op, DAG);
17953 // Operands are boolean (vectors of i1)
17954 MVT OpVT = Op1.getSimpleValueType();
17955 if (OpVT.getVectorElementType() == MVT::i1)
17956 return LowerBoolVSETCC_AVX512(Op, DAG);
17958 // The result is boolean, but operands are int/float
17959 if (VT.getVectorElementType() == MVT::i1) {
17960 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17961 // but there is no compare instruction for i8 and i16 elements in KNL.
17962 assert((OpVT.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
17963 "Unexpected operand type");
17964 return LowerIntVSETCC_AVX512(Op, DAG);
17967 // Lower using XOP integer comparisons.
17968 if (VT.is128BitVector() && Subtarget.hasXOP()) {
17969 // Translate compare code to XOP PCOM compare mode.
17970 unsigned CmpMode = 0;
17972 default: llvm_unreachable("Unexpected SETCC condition");
17974 case ISD::SETLT: CmpMode = 0x00; break;
17976 case ISD::SETLE: CmpMode = 0x01; break;
17978 case ISD::SETGT: CmpMode = 0x02; break;
17980 case ISD::SETGE: CmpMode = 0x03; break;
17981 case ISD::SETEQ: CmpMode = 0x04; break;
17982 case ISD::SETNE: CmpMode = 0x05; break;
17985 // Are we comparing unsigned or signed integers?
17987 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17989 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17990 DAG.getConstant(CmpMode, dl, MVT::i8));
17993 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17994 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17995 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17996 SDValue BC0 = peekThroughBitcasts(Op0);
17997 if (BC0.getOpcode() == ISD::AND) {
17999 SmallVector<APInt, 64> EltBits;
18000 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18001 VT.getScalarSizeInBits(), UndefElts,
18002 EltBits, false, false)) {
18003 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
18005 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
18011 // We are handling one of the integer comparisons here. Since SSE only has
18012 // GT and EQ comparisons for integer, swapping operands and multiple
18013 // operations may be required for some comparisons.
18014 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
18016 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18017 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18018 bool Invert = Cond == ISD::SETNE ||
18019 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
18021 // If both operands are known non-negative, then an unsigned compare is the
18022 // same as a signed compare and there's no need to flip signbits.
18023 // TODO: We could check for more general simplifications here since we're
18024 // computing known bits.
18025 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18026 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18028 // Special case: Use min/max operations for SETULE/SETUGE
18029 MVT VET = VT.getVectorElementType();
18031 (Subtarget.hasAVX512() && VET == MVT::i64) ||
18032 (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
18033 (Subtarget.hasSSE2() && (VET == MVT::i8));
18034 bool MinMax = false;
18038 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
18039 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
18043 Swap = Invert = FlipSigns = false;
18046 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
18047 bool Subus = false;
18048 if (!MinMax && HasSubus) {
18049 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
18050 // Op0 u<= Op1:
18051 //   t = psubus Op0, Op1
18052 //   pcmpeq t, <0..0>
// Since psubus saturates at zero, 't' is all zeros exactly when Op0 u<= Op1.
18055 case ISD::SETULT: {
18056 // If the comparison is against a constant we can turn this into a
18057 // setule. With psubus, setule does not require a swap. This is
18058 // beneficial because the constant in the register is no longer
18059 // clobbered as the destination, so it can be hoisted out of a loop.
18060 // Only do this pre-AVX since vpcmp* is no longer destructive.
18061 if (Subtarget.hasAVX())
18063 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
18065 Subus = true; Invert = false; Swap = false;
18069 // Psubus is better than flip-sign because it requires no inversion.
18070 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
18071 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
18075 Opc = X86ISD::SUBUS;
18081 std::swap(Op0, Op1);
18083 // Check that the operation in question is available (most are plain SSE2,
18084 // but PCMPGTQ and PCMPEQQ have different requirements).
18085 if (VT == MVT::v2i64) {
18086 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18087 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18089 // First cast everything to the right type.
18090 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18091 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18093 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18094 // bits of the inputs before performing those operations. The lower
18095 // compare is always unsigned.
18098 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
18100 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
18101 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
18102 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
18104 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18105 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18107 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18108 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18109 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18111 // Create masks for only the low parts/high parts of the 64-bit integers.
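// E.g. MaskHi = <1,1,3,3> broadcasts each qword's high-dword compare result
// into both dwords of that qword and MaskLo = <0,0,2,2> does the same for the
// low dwords, so the AND/OR below computes, per 64-bit element,
//   (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)).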
18112 static const int MaskHi[] = { 1, 1, 3, 3 };
18113 static const int MaskLo[] = { 0, 0, 2, 2 };
18114 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18115 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18116 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18118 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18119 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18122 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18124 return DAG.getBitcast(VT, Result);
18127 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18128 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
18129 // pcmpeqd + pshufd + pand.
18130 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18132 // First cast everything to the right type.
18133 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18134 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18137 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18139 // Make sure the lower and upper halves are both all-ones.
18140 static const int Mask[] = { 1, 0, 3, 2 };
18141 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18142 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18145 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18147 return DAG.getBitcast(VT, Result);
18151 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18152 // bits of the inputs before performing those operations.
18154 MVT EltVT = VT.getVectorElementType();
18155 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18157 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18158 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18161 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18163 // If the logical-not of the result is required, perform that now.
18165 Result = DAG.getNOT(dl, Result, VT);
18168 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
18171 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18172 getZeroVector(VT, Subtarget, DAG, dl));
18177 // Try to select this as a KTEST+SETCC if possible.
18178 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18179 const SDLoc &dl, SelectionDAG &DAG,
18180 const X86Subtarget &Subtarget) {
18181 // Only support equality comparisons.
18182 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18185 // Must be a bitcast from vXi1.
18186 if (Op0.getOpcode() != ISD::BITCAST)
18189 Op0 = Op0.getOperand(0);
18190 MVT VT = Op0.getSimpleValueType();
18191 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18192 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
18193 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18196 X86::CondCode X86CC;
18197 if (isNullConstant(Op1)) {
18198 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18199 } else if (isAllOnesConstant(Op1)) {
18200 // C flag is set for all ones.
18201 X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
18205 // If the input is an OR, we can combine its operands into the KORTEST.
18208 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18209 LHS = Op0.getOperand(0);
18210 RHS = Op0.getOperand(1);
18213 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18214 return getSETCC(X86CC, KORTEST, dl, DAG);
18217 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18219 MVT VT = Op.getSimpleValueType();
18221 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18223 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18224 SDValue Op0 = Op.getOperand(0);
18225 SDValue Op1 = Op.getOperand(1);
18227 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18229 // Optimize to BT if possible.
18230 // Lower (X & (1 << N)) == 0 to BT(X, N).
18231 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18232 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18233 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18234 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18235 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18239 // Try to use PTEST for a tree ORs equality compared with 0.
18240 // TODO: We could do AND tree with all 1s as well by using the C flag.
18241 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18242 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18243 if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
18247 // Try to lower using KTEST.
18248 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18251 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
18252 // these.
18253 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18254 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18256 // If the input is a setcc, then reuse the input setcc or use a new one with
18257 // the inverted condition.
18258 if (Op0.getOpcode() == X86ISD::SETCC) {
18259 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
18260 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18264 CCode = X86::GetOppositeBranchCondition(CCode);
18265 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
18269 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18270 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18271 if (X86CC == X86::COND_INVALID)
18274 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18275 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18276 return getSETCC(X86CC, EFLAGS, dl, DAG);
18279 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18280 SDValue LHS = Op.getOperand(0);
18281 SDValue RHS = Op.getOperand(1);
18282 SDValue Carry = Op.getOperand(2);
18283 SDValue Cond = Op.getOperand(3);
18286 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18287 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18289 // Recreate the carry if needed.
18290 EVT CarryVT = Carry.getValueType();
18291 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18292 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18293 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18295 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18296 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18297 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
18300 /// Return true if opcode is a X86 logical comparison.
18301 static bool isX86LogicalCmp(SDValue Op) {
18302 unsigned Opc = Op.getOpcode();
18303 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18304 Opc == X86ISD::SAHF)
18306 if (Op.getResNo() == 1 &&
18307 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18308 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18309 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18310 Opc == X86ISD::XOR || Opc == X86ISD::AND))
18313 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
18319 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18320 if (V.getOpcode() != ISD::TRUNCATE)
18323 SDValue VOp0 = V.getOperand(0);
18324 unsigned InBits = VOp0.getValueSizeInBits();
18325 unsigned Bits = V.getValueSizeInBits();
18326 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18329 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18330 bool AddTest = true;
18331 SDValue Cond = Op.getOperand(0);
18332 SDValue Op1 = Op.getOperand(1);
18333 SDValue Op2 = Op.getOperand(2);
18335 MVT VT = Op1.getSimpleValueType();
18338 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18339 // are available or VBLENDV if AVX is available.
18340 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18341 if (Cond.getOpcode() == ISD::SETCC &&
18342 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
18343 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18344 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18345 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18346 unsigned SSECC = translateX86FSETCC(
18347 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18349 if (Subtarget.hasAVX512()) {
18350 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18351 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18352 assert(!VT.isVector() && "Not a scalar type?");
18353 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18356 if (SSECC < 8 || Subtarget.hasAVX()) {
18357 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18358 DAG.getConstant(SSECC, DL, MVT::i8));
18360 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18361 // of 3 logic instructions for size savings and potentially speed.
18362 // Unfortunately, there is no scalar form of VBLENDV.
18364 // If either operand is a constant, don't try this. We can expect to
18365 // optimize away at least one of the logic instructions later in that
18366 // case, so that sequence would be faster than a variable blend.
18368 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18369 // uses XMM0 as the selection register. That may need just as many
18370 // instructions as the AND/ANDN/OR sequence due to register moves, so
18371 // don't bother.
18373 if (Subtarget.hasAVX() &&
18374 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18376 // Convert to vectors, do a VSELECT, and convert back to scalar.
18377 // All of the conversions should be optimized away.
18379 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18380 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18381 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18382 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18384 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18385 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18387 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18389 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18390 VSel, DAG.getIntPtrConstant(0, DL));
18392 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18393 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18394 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18398 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18399 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18400 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18401 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18404 // For v64i1 without 64-bit support we need to split and rejoin.
18405 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18406 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18407 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18408 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18409 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18410 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18411 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18412 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18413 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18416 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18418 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18419 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18420 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18421 Op1Scalar = Op1.getOperand(0);
18423 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18424 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18425 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18426 Op2Scalar = Op2.getOperand(0);
18427 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18428 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18429 Op1Scalar, Op2Scalar);
18430 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18431 return DAG.getBitcast(VT, newSelect);
18432 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18433 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18434 DAG.getIntPtrConstant(0, DL));
18438 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18439 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18440 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18441 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18442 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18443 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18444 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18445 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18448 if (Cond.getOpcode() == ISD::SETCC) {
18449 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18451 // If the condition was updated, it's possible that the operands of the
18452 // select were also updated (for example, EmitTest has a RAUW). Refresh
18453 // the local references to the select operands in case they got stale.
18454 Op1 = Op.getOperand(1);
18455 Op2 = Op.getOperand(2);
18459 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18460 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18461 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18462 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18463 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18464 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18465 if (Cond.getOpcode() == X86ISD::SETCC &&
18466 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18467 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18468 SDValue Cmp = Cond.getOperand(1);
18469 unsigned CondCode =
18470 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18472 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18473 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18474 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18475 SDValue CmpOp0 = Cmp.getOperand(0);
18477 // Apply further optimizations for special cases
18478 // (select (x != 0), -1, 0) -> neg & sbb
18479 // (select (x == 0), 0, -1) -> neg & sbb
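// Roughly, for a value x already in a register, both special cases boil down
// to the two-instruction sequence (illustrative only):
//   neg x      ; sets CF iff x != 0
//   sbb r, r   ; r = CF ? -1 : 0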
18480 if (isNullConstant(Y) &&
18481 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18482 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18483 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18484 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18485 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18486 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18487 SDValue(Neg.getNode(), 1));
18491 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18492 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18493 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18495 SDValue Res = // Res = 0 or -1.
18496 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18497 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18499 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18500 Res = DAG.getNOT(DL, Res, Res.getValueType());
18502 if (!isNullConstant(Op2))
18503 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18505 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18506 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18507 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18508 SDValue CmpOp0 = Cmp.getOperand(0);
18509 SDValue Src1, Src2;
18510 // Returns true if Op2 is an XOR or OR operator and one of its operands
18511 // is equal to Op1:
18512 // ( a , a op b) || ( b , a op b)
18513 auto isOrXorPattern = [&]() {
18514 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18515 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18517 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18524 if (isOrXorPattern()) {
18525 SDValue Neg;
18526 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18527 // We need a mask of all zeros or all ones with the same size as the other
18528 // operands of the select.
18529 if (CmpSz > VT.getSizeInBits())
18530 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18531 else if (CmpSz < VT.getSizeInBits())
18532 Neg = DAG.getNode(ISD::AND, DL, VT,
18533 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18534 DAG.getConstant(1, DL, VT));
18537 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18538 Neg); // -(and (x, 0x1))
18539 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18540 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18545 // Look past (and (setcc_carry (cmp ...)), 1).
18546 if (Cond.getOpcode() == ISD::AND &&
18547 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18548 isOneConstant(Cond.getOperand(1)))
18549 Cond = Cond.getOperand(0);
18551 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18552 // setting operand in place of the X86ISD::SETCC.
18553 unsigned CondOpcode = Cond.getOpcode();
18554 if (CondOpcode == X86ISD::SETCC ||
18555 CondOpcode == X86ISD::SETCC_CARRY) {
18556 CC = Cond.getOperand(0);
18558 SDValue Cmp = Cond.getOperand(1);
18559 unsigned Opc = Cmp.getOpcode();
18560 MVT VT = Op.getSimpleValueType();
18562 bool IllegalFPCMov = false;
18563 if (VT.isFloatingPoint() && !VT.isVector() &&
18564 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18565 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18567 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18568 Opc == X86ISD::BT) { // FIXME
18572 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18573 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18574 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18575 Cond.getOperand(0).getValueType() != MVT::i8)) {
18576 SDValue LHS = Cond.getOperand(0);
18577 SDValue RHS = Cond.getOperand(1);
18578 unsigned X86Opcode;
18581 switch (CondOpcode) {
18582 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18583 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18584 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18585 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18586 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18587 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18588 default: llvm_unreachable("unexpected overflowing operator");
18590 if (CondOpcode == ISD::UMULO)
18591 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18594 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18596 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
18598 if (CondOpcode == ISD::UMULO)
18599 Cond = X86Op.getValue(2);
18601 Cond = X86Op.getValue(1);
18603 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
18608 // Look past the truncate if the high bits are known zero.
18609 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18610 Cond = Cond.getOperand(0);
18612 // We know the result of AND is compared against zero. Try to match
18613 // it to BT.
18614 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18615 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
18616 CC = NewSetCC.getOperand(0);
18617 Cond = NewSetCC.getOperand(1);
18624 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
18625 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
18628 // a < b ? -1 : 0 -> RES = ~setcc_carry
18629 // a < b ? 0 : -1 -> RES = setcc_carry
18630 // a >= b ? -1 : 0 -> RES = setcc_carry
18631 // a >= b ? 0 : -1 -> RES = ~setcc_carry
18632 if (Cond.getOpcode() == X86ISD::SUB) {
18633 Cond = ConvertCmpIfNecessary(Cond, DAG);
18634 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
18636 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
18637 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18638 (isNullConstant(Op1) || isNullConstant(Op2))) {
18639 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18640 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18642 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
18643 return DAG.getNOT(DL, Res, Res.getValueType());
18648 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
18649 // widen the cmov and push the truncate through. This avoids introducing a new
18650 // branch during isel and doesn't add any extensions.
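// Illustrative sketch of the rewrite performed below:
//   (select cond, (trunc i32 a), (trunc i32 b)) : i8
//     --> (trunc (X86ISD::CMOV b, a, cc, flags) : i32) : i8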
18651 if (Op.getValueType() == MVT::i8 &&
18652 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
18653 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
18654 if (T1.getValueType() == T2.getValueType() &&
18655 // Blacklist CopyFromReg to avoid partial register stalls.
18656 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
18657 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
18659 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18663 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
18664 // condition is true.
18665 SDValue Ops[] = { Op2, Op1, CC, Cond };
18666 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
18669 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
18670 const X86Subtarget &Subtarget,
18671 SelectionDAG &DAG) {
18672 MVT VT = Op->getSimpleValueType(0);
18673 SDValue In = Op->getOperand(0);
18674 MVT InVT = In.getSimpleValueType();
18675 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18676 MVT VTElt = VT.getVectorElementType();
18679 unsigned NumElts = VT.getVectorNumElements();
18681 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
18682 MVT ExtVT = VT;
18683 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
18684 // If v16i32 is to be avoided, we'll need to split and concatenate.
18685 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
18686 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
18688 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18691 // Widen to 512-bits if VLX is not supported.
18692 MVT WideVT = ExtVT;
18693 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18694 NumElts *= 512 / ExtVT.getSizeInBits();
18695 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18696 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
18697 In, DAG.getIntPtrConstant(0, dl));
18698 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
18699 }
18701 SDValue V;
18702 MVT WideEltVT = WideVT.getVectorElementType();
18703 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
18704 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
18705 V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
18707 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
18708 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
18709 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
18712 // Truncate if we had to extend i16/i8 above.
18714 WideVT = MVT::getVectorVT(VTElt, NumElts);
18715 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
18718 // Extract back to 128/256-bit if we widened.
18720 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
18721 DAG.getIntPtrConstant(0, dl));
18726 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18727 SelectionDAG &DAG) {
18728 SDValue In = Op->getOperand(0);
18729 MVT InVT = In.getSimpleValueType();
18731 if (InVT.getVectorElementType() == MVT::i1)
18732 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18734 assert(Subtarget.hasFp256() && "Expected AVX support");
18735 return LowerAVXExtend(Op, DAG, Subtarget);
18738 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18739 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18740 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18741 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18742 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18743 const X86Subtarget &Subtarget,
18744 SelectionDAG &DAG) {
18745 SDValue In = Op->getOperand(0);
18746 MVT VT = Op->getSimpleValueType(0);
18747 MVT InVT = In.getSimpleValueType();
18748 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18750 MVT SVT = VT.getVectorElementType();
18751 MVT InSVT = InVT.getVectorElementType();
18752 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18754 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18756 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18758 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18759 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18760 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18765 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18766 // For 512-bit vectors, we need 128-bits or 256-bits.
18767 if (VT.getSizeInBits() > 128) {
18768 // Input needs to be at least the same number of elements as output, and
18769 // at least 128-bits.
18770 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18771 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18774 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18775 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18777 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18778 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18779 // need to be handled here for 256/512-bit results.
18780 if (Subtarget.hasInt256()) {
18781 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18782 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18783 X86ISD::VSEXT : X86ISD::VZEXT;
18784 return DAG.getNode(ExtOpc, dl, VT, In);
18787 // We should only get here for sign extend.
18788 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18789 "Unexpected opcode!");
18791 // Pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18792 SDValue Curr = In;
18793 MVT CurrVT = InVT;
18795 // As SRAI is only available on i16/i32 types, we expand only up to i32
18796 // and handle i64 separately.
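// For example (a sketch of one step): extending i8 lanes in-reg to i16 is
// done as unpcklbw(undef, x), which places each source byte in the high byte
// of an i16 lane, followed by an arithmetic shift right by 8 that replicates
// the sign bit into the low byte.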
18797 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18798 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18799 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18800 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18801 Curr = DAG.getBitcast(CurrVT, Curr);
18804 SDValue SignExt = Curr;
18805 if (CurrVT != InVT) {
18806 unsigned SignExtShift =
18807 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18808 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18809 DAG.getConstant(SignExtShift, dl, MVT::i8));
18815 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18816 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18817 DAG.getConstant(31, dl, MVT::i8));
18818 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18819 return DAG.getBitcast(VT, Ext);
18825 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18826 SelectionDAG &DAG) {
18827 MVT VT = Op->getSimpleValueType(0);
18828 SDValue In = Op->getOperand(0);
18829 MVT InVT = In.getSimpleValueType();
18832 if (InVT.getVectorElementType() == MVT::i1)
18833 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18835 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
18836 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
18837 "Expected same number of elements");
18838 assert((VT.getVectorElementType() == MVT::i16 ||
18839 VT.getVectorElementType() == MVT::i32 ||
18840 VT.getVectorElementType() == MVT::i64) &&
18841 "Unexpected element type");
18842 assert((InVT.getVectorElementType() == MVT::i8 ||
18843 InVT.getVectorElementType() == MVT::i16 ||
18844 InVT.getVectorElementType() == MVT::i32) &&
18845 "Unexpected element type");
18847 if (Subtarget.hasInt256())
18848 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18850 // Optimize vectors in AVX mode
18851 // Sign extend v8i16 to v8i32 and v4i32 to v4i64.
18854 // Divide input vector into two parts
18855 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18856 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18857 // concat the vectors to original VT
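// Illustrative sketch for v8i16 -> v8i32 (matching the code below):
//   lo   = shuffle in, undef, <0,1,2,3,-1,-1,-1,-1>
//   hi   = shuffle in, undef, <4,5,6,7,-1,-1,-1,-1>
//   lo32 = sign_extend_vector_inreg lo to v4i32
//   hi32 = sign_extend_vector_inreg hi to v4i32
//   res  = concat_vectors lo32, hi32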
18859 unsigned NumElems = InVT.getVectorNumElements();
18860 SDValue Undef = DAG.getUNDEF(InVT);
18862 SmallVector<int,8> ShufMask1(NumElems, -1);
18863 for (unsigned i = 0; i != NumElems/2; ++i)
18866 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18868 SmallVector<int,8> ShufMask2(NumElems, -1);
18869 for (unsigned i = 0; i != NumElems/2; ++i)
18870 ShufMask2[i] = i + NumElems/2;
18872 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18874 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18875 VT.getVectorNumElements() / 2);
18877 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18878 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18880 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18883 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18884 // may emit an illegal shuffle but the expansion is still better than scalar
18885 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18886 // we'll emit a shuffle and an arithmetic shift.
18887 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18888 // TODO: It is possible to support ZExt by zeroing the undef values during
18889 // the shuffle phase or after the shuffle.
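// For example (an illustrative sketch, not an exhaustive description): an
// SEXTLOAD of <4 x i8> producing <4 x i32> is emitted as one scalar i32 load,
// a bitcast of that value to a byte vector, and then either X86ISD::VSEXT
// (SSE4.1 and later) or a SIGN_EXTEND_VECTOR_INREG that is itself lowered to
// unpacks plus arithmetic shifts.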
18890 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18891 SelectionDAG &DAG) {
18892 MVT RegVT = Op.getSimpleValueType();
18893 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18894 assert(RegVT.isInteger() &&
18895 "We only custom lower integer vector sext loads.");
18897 // Nothing useful we can do without SSE2 shuffles.
18898 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18900 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18902 EVT MemVT = Ld->getMemoryVT();
18904 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18905 unsigned RegSz = RegVT.getSizeInBits();
18907 ISD::LoadExtType Ext = Ld->getExtensionType();
18909 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18910 && "Only anyext and sext are currently implemented.");
18911 assert(MemVT != RegVT && "Cannot extend to the same type");
18912 assert(MemVT.isVector() && "Must load a vector from memory");
18914 unsigned NumElems = RegVT.getVectorNumElements();
18915 unsigned MemSz = MemVT.getSizeInBits();
18916 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18918 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18919 // The only way in which we have a legal 256-bit vector result but not the
18920 // integer 256-bit operations needed to directly lower a sextload is if we
18921 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18922 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18923 // correctly legalized. We do this late to allow the canonical form of
18924 // sextload to persist throughout the rest of the DAG combiner -- it wants
18925 // to fold together any extensions it can, and so will fuse a sign_extend
18926 // of an sextload into a sextload targeting a wider value.
18928 if (MemSz == 128) {
18929 // Just switch this to a normal load.
18930 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18931 "it must be a legal 128-bit vector "
18932 "type!");
18933 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18934 Ld->getPointerInfo(), Ld->getAlignment(),
18935 Ld->getMemOperand()->getFlags());
18937 assert(MemSz < 128 &&
18938 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18939 // Do an sext load to a 128-bit vector type. We want to use the same
18940 // number of elements, but elements half as wide. This will end up being
18941 // recursively lowered by this routine, but will succeed as we definitely
18942 // have all the necessary features if we're using AVX1.
18944 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18945 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18947 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18948 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18949 Ld->getMemOperand()->getFlags());
18952 // Replace chain users with the new chain.
18953 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18954 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18956 // Finally, do a normal sign-extend to the desired register.
18957 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18960 // All sizes must be a power of two.
18961 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18962 "Non-power-of-two elements are not custom lowered!");
18964 // Attempt to load the original value using scalar loads.
18965 // Find the largest scalar type that divides the total loaded size.
18966 MVT SclrLoadTy = MVT::i8;
18967 for (MVT Tp : MVT::integer_valuetypes()) {
18968 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18973 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18974 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18976 SclrLoadTy = MVT::f64;
18978 // Calculate the number of scalar loads that we need to perform
18979 // in order to load our vector from memory.
18980 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18982 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18983 "Can only lower sext loads with a single scalar load!");
18985 unsigned loadRegZize = RegSz;
18986 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18989 // If we don't have BWI we won't be able to create the shuffle needed for
18990 // the v8i8->v8i64 extension.
18991 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18992 MemVT == MVT::v8i8)
18995 // Represent our vector as a sequence of elements which are the
18996 // largest scalar that we can load.
18997 EVT LoadUnitVecVT = EVT::getVectorVT(
18998 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19000 // Represent the data using the same element type that is stored in
19001 // memory. In practice, we 'widen' MemVT.
19003 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19004 loadRegZize / MemVT.getScalarSizeInBits());
19006 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19007 "Invalid vector type");
19009 // We can't shuffle using an illegal type.
19010 assert(TLI.isTypeLegal(WideVecVT) &&
19011 "We only lower types that form legal widened vector types");
19013 SmallVector<SDValue, 8> Chains;
19014 SDValue Ptr = Ld->getBasePtr();
19015 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19016 TLI.getPointerTy(DAG.getDataLayout()));
19017 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19019 for (unsigned i = 0; i < NumLoads; ++i) {
19020 // Perform a single load.
19021 SDValue ScalarLoad =
19022 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19023 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19024 Chains.push_back(ScalarLoad.getValue(1));
19025 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19026 // another round of DAGCombining.
19028 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19030 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19031 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19033 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19036 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19038 // Bitcast the loaded value to a vector of the original element type, in
19039 // the size of the target vector type.
19040 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19041 unsigned SizeRatio = RegSz / MemSz;
19043 if (Ext == ISD::SEXTLOAD) {
19044 // If we have SSE4.1, we can directly emit a VSEXT node.
19045 if (Subtarget.hasSSE41()) {
19046 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19047 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19051 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
19052 // lanes.
19053 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19054 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19056 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19057 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19061 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19062 MemVT == MVT::v8i8) {
19063 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19064 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19068 // Redistribute the loaded elements into the different locations.
19069 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19070 for (unsigned i = 0; i != NumElems; ++i)
19071 ShuffleVec[i * SizeRatio] = i;
19073 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19074 DAG.getUNDEF(WideVecVT), ShuffleVec);
19076 // Bitcast to the requested type.
19077 Shuff = DAG.getBitcast(RegVT, Shuff);
19078 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19082 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19083 /// each of which has no other use apart from the AND / OR.
19084 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19085 Opc = Op.getOpcode();
19086 if (Opc != ISD::OR && Opc != ISD::AND)
19088 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19089 Op.getOperand(0).hasOneUse() &&
19090 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19091 Op.getOperand(1).hasOneUse());
19094 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
19095 /// SETCC node has a single use.
19096 static bool isXor1OfSetCC(SDValue Op) {
19097 if (Op.getOpcode() != ISD::XOR)
19099 if (isOneConstant(Op.getOperand(1)))
19100 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19101 Op.getOperand(0).hasOneUse();
19105 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19106 bool addTest = true;
19107 SDValue Chain = Op.getOperand(0);
19108 SDValue Cond = Op.getOperand(1);
19109 SDValue Dest = Op.getOperand(2);
19112 bool Inverted = false;
19114 if (Cond.getOpcode() == ISD::SETCC) {
19115 // Check for setcc([su]{add,sub,mul}o == 0).
19116 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19117 isNullConstant(Cond.getOperand(1)) &&
19118 Cond.getOperand(0).getResNo() == 1 &&
19119 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19120 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19121 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19122 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19123 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19124 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19126 Cond = Cond.getOperand(0);
19128 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19133 // FIXME: LowerXALUO doesn't handle these!!
19134 else if (Cond.getOpcode() == X86ISD::ADD ||
19135 Cond.getOpcode() == X86ISD::SUB ||
19136 Cond.getOpcode() == X86ISD::SMUL ||
19137 Cond.getOpcode() == X86ISD::UMUL)
19138 Cond = LowerXALUO(Cond, DAG);
19141 // Look past (and (setcc_carry (cmp ...)), 1).
19142 if (Cond.getOpcode() == ISD::AND &&
19143 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19144 isOneConstant(Cond.getOperand(1)))
19145 Cond = Cond.getOperand(0);
19147 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19148 // setting operand in place of the X86ISD::SETCC.
19149 unsigned CondOpcode = Cond.getOpcode();
19150 if (CondOpcode == X86ISD::SETCC ||
19151 CondOpcode == X86ISD::SETCC_CARRY) {
19152 CC = Cond.getOperand(0);
19154 SDValue Cmp = Cond.getOperand(1);
19155 unsigned Opc = Cmp.getOpcode();
19156 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19157 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19161 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19165 // These can only come from an arithmetic instruction with overflow,
19166 // e.g. SADDO, UADDO.
19167 Cond = Cond.getOperand(1);
19173 CondOpcode = Cond.getOpcode();
19174 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19175 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19176 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19177 Cond.getOperand(0).getValueType() != MVT::i8)) {
19178 SDValue LHS = Cond.getOperand(0);
19179 SDValue RHS = Cond.getOperand(1);
19180 unsigned X86Opcode;
19183 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19184 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
19186 switch (CondOpcode) {
19187 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19189 if (isOneConstant(RHS)) {
19190 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19193 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19194 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19196 if (isOneConstant(RHS)) {
19197 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19200 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19201 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19202 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19203 default: llvm_unreachable("unexpected overflowing operator");
19206 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19207 if (CondOpcode == ISD::UMULO)
19208 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19211 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19213 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19215 if (CondOpcode == ISD::UMULO)
19216 Cond = X86Op.getValue(2);
19218 Cond = X86Op.getValue(1);
19220 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19224 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19225 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19226 if (CondOpc == ISD::OR) {
19227 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19228 // two branches instead of an explicit OR instruction with a
19229 // separate test.
19230 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19231 isX86LogicalCmp(Cmp)) {
19232 CC = Cond.getOperand(0).getOperand(0);
19233 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19234 Chain, Dest, CC, Cmp);
19235 CC = Cond.getOperand(1).getOperand(0);
19239 } else { // ISD::AND
19240 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19241 // two branches instead of an explicit AND instruction with a
19242 // separate test. However, we only do this if this block doesn't
19243 // have a fall-through edge, because this requires an explicit
19244 // jmp when the condition is false.
19245 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19246 isX86LogicalCmp(Cmp) &&
19247 Op.getNode()->hasOneUse()) {
19248 X86::CondCode CCode =
19249 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19250 CCode = X86::GetOppositeBranchCondition(CCode);
19251 CC = DAG.getConstant(CCode, dl, MVT::i8);
19252 SDNode *User = *Op.getNode()->use_begin();
19253 // Look for an unconditional branch following this conditional branch.
19254 // We need this because we need to reverse the successors in order
19255 // to implement FCMP_OEQ.
19256 if (User->getOpcode() == ISD::BR) {
19257 SDValue FalseBB = User->getOperand(1);
19258 SDNode *NewBR =
19259 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19260 assert(NewBR == User);
19264 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19265 Chain, Dest, CC, Cmp);
19266 X86::CondCode CCode =
19267 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19268 CCode = X86::GetOppositeBranchCondition(CCode);
19269 CC = DAG.getConstant(CCode, dl, MVT::i8);
19275 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19276 // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
19277 // It should be transformed by the dag combiner except when the condition
19278 // is set by an arithmetic-with-overflow node.
19279 X86::CondCode CCode =
19280 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19281 CCode = X86::GetOppositeBranchCondition(CCode);
19282 CC = DAG.getConstant(CCode, dl, MVT::i8);
19283 Cond = Cond.getOperand(0).getOperand(1);
19285 } else if (Cond.getOpcode() == ISD::SETCC &&
19286 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19287 // For FCMP_OEQ, we can emit
19288 // two branches instead of an explicit AND instruction with a
19289 // separate test. However, we only do this if this block doesn't
19290 // have a fall-through edge, because this requires an explicit
19291 // jmp when the condition is false.
19292 if (Op.getNode()->hasOneUse()) {
19293 SDNode *User = *Op.getNode()->use_begin();
19294 // Look for an unconditional branch following this conditional branch.
19295 // We need this because we need to reverse the successors in order
19296 // to implement FCMP_OEQ.
19297 if (User->getOpcode() == ISD::BR) {
19298 SDValue FalseBB = User->getOperand(1);
19299 SDNode *NewBR =
19300 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19301 assert(NewBR == User);
19305 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19306 Cond.getOperand(0), Cond.getOperand(1));
19307 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19308 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19309 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19310 Chain, Dest, CC, Cmp);
19311 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19316 } else if (Cond.getOpcode() == ISD::SETCC &&
19317 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19318 // For FCMP_UNE, we can emit
19319 // two branches instead of an explicit AND instruction with a
19320 // separate test. However, we only do this if this block doesn't
19321 // have a fall-through edge, because this requires an explicit
19322 // jmp when the condition is false.
19323 if (Op.getNode()->hasOneUse()) {
19324 SDNode *User = *Op.getNode()->use_begin();
19325 // Look for an unconditional branch following this conditional branch.
19326 // We need this because we need to reverse the successors in order
19327 // to implement FCMP_UNE.
19328 if (User->getOpcode() == ISD::BR) {
19329 SDValue FalseBB = User->getOperand(1);
19330 SDNode *NewBR =
19331 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19332 assert(NewBR == User);
19335 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19336 Cond.getOperand(0), Cond.getOperand(1));
19337 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19338 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19339 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19340 Chain, Dest, CC, Cmp);
19341 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19351 // Look past the truncate if the high bits are known zero.
19352 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19353 Cond = Cond.getOperand(0);
19355 // We know the result of AND is compared against zero. Try to match
19356 // it to BT.
19357 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19358 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19359 CC = NewSetCC.getOperand(0);
19360 Cond = NewSetCC.getOperand(1);
19367 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19368 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19369 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19371 Cond = ConvertCmpIfNecessary(Cond, DAG);
19372 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19373 Chain, Dest, CC, Cond);
19376 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19377 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19378 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19379 // that the guard pages used by the OS virtual memory manager are allocated in
19380 // correct sequence.
19382 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19383 SelectionDAG &DAG) const {
19384 MachineFunction &MF = DAG.getMachineFunction();
19385 bool SplitStack = MF.shouldSplitStack();
19386 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19387 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19388 SplitStack || EmitStackProbe;
19392 SDNode *Node = Op.getNode();
19393 SDValue Chain = Op.getOperand(0);
19394 SDValue Size = Op.getOperand(1);
19395 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19396 EVT VT = Node->getValueType(0);
19398 // Chain the dynamic stack allocation so that it doesn't modify the stack
19399 // pointer when other instructions are using the stack.
19400 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19402 bool Is64Bit = Subtarget.is64Bit();
19403 MVT SPTy = getPointerTy(DAG.getDataLayout());
19407 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19408 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19409 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19410 " not tell us which reg is the stack pointer!");
19412 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19413 Chain = SP.getValue(1);
19414 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19415 unsigned StackAlign = TFI.getStackAlignment();
19416 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19417 if (Align > StackAlign)
19418 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19419 DAG.getConstant(-(uint64_t)Align, dl, VT));
19420 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19421 } else if (SplitStack) {
19422 MachineRegisterInfo &MRI = MF.getRegInfo();
19425 // The 64-bit implementation of segmented stacks needs to clobber both r10
19426 // and r11. This makes it impossible to use it along with nested parameters.
19427 const Function &F = MF.getFunction();
19428 for (const auto &A : F.args()) {
19429 if (A.hasNestAttr())
19430 report_fatal_error("Cannot use segmented stacks with functions that "
19431 "have nested arguments.");
19435 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19436 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19437 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19438 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19439 DAG.getRegister(Vreg, SPTy));
19441 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19442 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19443 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19445 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19446 unsigned SPReg = RegInfo->getStackRegister();
19447 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19448 Chain = SP.getValue(1);
19451 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19452 DAG.getConstant(-(uint64_t)Align, dl, VT));
19453 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19459 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19460 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19462 SDValue Ops[2] = {Result, Chain};
19463 return DAG.getMergeValues(Ops, dl);
19466 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19467 MachineFunction &MF = DAG.getMachineFunction();
19468 auto PtrVT = getPointerTy(MF.getDataLayout());
19469 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19471 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19474 if (!Subtarget.is64Bit() ||
19475 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19476 // vastart just stores the address of the VarArgsFrameIndex slot into the
19477 // memory location argument.
19478 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19479 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19480 MachinePointerInfo(SV));
19483 // __va_list_tag:
19484 // gp_offset (0 - 6 * 8)
19485 // fp_offset (48 - 48 + 8 * 16)
19486 // overflow_arg_area (point to parameters coming in memory).
19487 // reg_save_area
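// As an illustration of the stores emitted below (LP64 layout), with 'ap'
// denoting the incoming va_list pointer:
//   *(i32*)(ap + 0)  = gp_offset
//   *(i32*)(ap + 4)  = fp_offset
//   *(i8**)(ap + 8)  = overflow_arg_area (the VarArgsFrameIndex slot)
//   *(i8**)(ap + 16) = reg_save_area     (the RegSaveFrameIndex slot)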
19488 SmallVector<SDValue, 8> MemOps;
19489 SDValue FIN = Op.getOperand(1);
19491 SDValue Store = DAG.getStore(
19492 Op.getOperand(0), DL,
19493 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19494 MachinePointerInfo(SV));
19495 MemOps.push_back(Store);
19498 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19499 Store = DAG.getStore(
19500 Op.getOperand(0), DL,
19501 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19502 MachinePointerInfo(SV, 4));
19503 MemOps.push_back(Store);
19505 // Store ptr to overflow_arg_area
19506 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19507 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19509 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19510 MemOps.push_back(Store);
19512 // Store ptr to reg_save_area.
19513 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19514 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19515 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19516 Store = DAG.getStore(
19517 Op.getOperand(0), DL, RSFIN, FIN,
19518 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19519 MemOps.push_back(Store);
19520 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19523 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19524 assert(Subtarget.is64Bit() &&
19525 "LowerVAARG only handles 64-bit va_arg!");
19526 assert(Op.getNumOperands() == 4);
19528 MachineFunction &MF = DAG.getMachineFunction();
19529 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
19530 // The Win64 ABI uses char* instead of a structure.
19531 return DAG.expandVAArg(Op.getNode());
19533 SDValue Chain = Op.getOperand(0);
19534 SDValue SrcPtr = Op.getOperand(1);
19535 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19536 unsigned Align = Op.getConstantOperandVal(3);
19539 EVT ArgVT = Op.getNode()->getValueType(0);
19540 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19541 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19544 // Decide which area this value should be read from.
19545 // TODO: Implement the AMD64 ABI in its entirety. This simple
19546 // selection mechanism works only for the basic types.
19547 if (ArgVT == MVT::f80) {
19548 llvm_unreachable("va_arg for f80 not yet implemented");
19549 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19550 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19551 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19552 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19554 llvm_unreachable("Unhandled argument type in LowerVAARG");
19557 if (ArgMode == 2) {
19558 // Sanity Check: Make sure using fp_offset makes sense.
19559 assert(!Subtarget.useSoftFloat() &&
19560 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
19561 Subtarget.hasSSE1());
19564 // Insert VAARG_64 node into the DAG
19565 // VAARG_64 returns two values: Variable Argument Address, Chain
19566 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19567 DAG.getConstant(ArgMode, dl, MVT::i8),
19568 DAG.getConstant(Align, dl, MVT::i32)};
19569 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19570 SDValue VAARG = DAG.getMemIntrinsicNode(
19571 X86ISD::VAARG_64, dl,
19572 VTs, InstOps, MVT::i64,
19573 MachinePointerInfo(SV),
19575 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
19576 Chain = VAARG.getValue(1);
19578 // Load the next argument and return it
19579 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19582 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19583 SelectionDAG &DAG) {
19584 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19585 // where a va_list is still an i8*.
19586 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19587 if (Subtarget.isCallingConvWin64(
19588 DAG.getMachineFunction().getFunction().getCallingConv()))
19589 // Probably a Win64 va_copy.
19590 return DAG.expandVACopy(Op.getNode());
19592 SDValue Chain = Op.getOperand(0);
19593 SDValue DstPtr = Op.getOperand(1);
19594 SDValue SrcPtr = Op.getOperand(2);
19595 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19596 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19599 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19600 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19602 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19605 /// Handle vector element shifts where the shift amount is a constant.
19606 /// Takes immediate version of shift as input.
19607 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19608 SDValue SrcOp, uint64_t ShiftAmt,
19609 SelectionDAG &DAG) {
19610 MVT ElementType = VT.getVectorElementType();
19612 // Bitcast the source vector to the output type; this is mainly necessary for
19613 // vXi8/vXi64 shifts.
19614 if (VT != SrcOp.getSimpleValueType())
19615 SrcOp = DAG.getBitcast(VT, SrcOp);
19617 // Fold this packed shift into its first operand if ShiftAmt is 0.
19618 if (!ShiftAmt)
19619 return SrcOp;
19621 // Check for ShiftAmt >= element width
19622 if (ShiftAmt >= ElementType.getSizeInBits()) {
19623 if (Opc == X86ISD::VSRAI)
19624 ShiftAmt = ElementType.getSizeInBits() - 1;
19626 return DAG.getConstant(0, dl, VT);
19629 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19630 && "Unknown target vector shift-by-constant node");
19632 // Fold this packed vector shift into a build vector if SrcOp is a
19633 // vector of Constants or UNDEFs.
19634 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19635 SmallVector<SDValue, 8> Elts;
19636 unsigned NumElts = SrcOp->getNumOperands();
19637 ConstantSDNode *ND;
19640 default: llvm_unreachable("Unknown opcode!");
19641 case X86ISD::VSHLI:
19642 for (unsigned i=0; i!=NumElts; ++i) {
19643 SDValue CurrentOp = SrcOp->getOperand(i);
19644 if (CurrentOp->isUndef()) {
19645 Elts.push_back(CurrentOp);
19648 ND = cast<ConstantSDNode>(CurrentOp);
19649 const APInt &C = ND->getAPIntValue();
19650 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19653 case X86ISD::VSRLI:
19654 for (unsigned i=0; i!=NumElts; ++i) {
19655 SDValue CurrentOp = SrcOp->getOperand(i);
19656 if (CurrentOp->isUndef()) {
19657 Elts.push_back(CurrentOp);
19660 ND = cast<ConstantSDNode>(CurrentOp);
19661 const APInt &C = ND->getAPIntValue();
19662 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19665 case X86ISD::VSRAI:
19666 for (unsigned i=0; i!=NumElts; ++i) {
19667 SDValue CurrentOp = SrcOp->getOperand(i);
19668 if (CurrentOp->isUndef()) {
19669 Elts.push_back(CurrentOp);
19672 ND = cast<ConstantSDNode>(CurrentOp);
19673 const APInt &C = ND->getAPIntValue();
19674 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19679 return DAG.getBuildVector(VT, dl, Elts);
19682 return DAG.getNode(Opc, dl, VT, SrcOp,
19683 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19686 /// Handle vector element shifts where the shift amount may or may not be a
19687 /// constant. Takes immediate version of shift as input.
19688 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19689 SDValue SrcOp, SDValue ShAmt,
19690 const X86Subtarget &Subtarget,
19691 SelectionDAG &DAG) {
19692 MVT SVT = ShAmt.getSimpleValueType();
19693 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19695 // Catch shift-by-constant.
19696 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19697 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19698 CShAmt->getZExtValue(), DAG);
19700 // Change opcode to non-immediate version
19702 default: llvm_unreachable("Unknown target vector shift node");
19703 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19704 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19705 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19708 // Need to build a vector containing shift amount.
19709 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
19710 // +=================+============+=======================================+
19711 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19712 // +=================+============+=======================================+
19713 // | i64 | Yes, No | Use ShAmt as lowest elt |
19714 // | i32 | Yes | zero-extend in-reg |
19715 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19716 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19717 // +=================+============+=======================================+
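// For example (illustrative): an i64 amount is simply placed in the low
// element of a v2i64 via SCALAR_TO_VECTOR, while a plain i32 amount on a
// pre-SSE4.1 target becomes build_vector <ShAmt, 0, undef, undef> : v4i32;
// in all cases the result is finally bitcast to the 128-bit shift-amount type.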
19719 if (SVT == MVT::i64)
19720 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19721 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19722 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19723 ShAmt = ShAmt.getOperand(0);
19724 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19725 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19726 } else if (Subtarget.hasSSE41() &&
19727 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19728 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19729 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19731 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19732 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19733 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19736 // The return type has to be a 128-bit type with the same element
19737 // type as the input type.
19738 MVT EltVT = VT.getVectorElementType();
19739 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19741 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19742 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19745 /// \brief Return Mask with the necessary casting or extending
19746 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19747 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19748 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19751 if (isAllOnesConstant(Mask))
19752 return DAG.getConstant(1, dl, MaskVT);
19753 if (X86::isZeroNode(Mask))
19754 return DAG.getConstant(0, dl, MaskVT);
19756 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19757 // Mask should be extended
19758 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19759 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19762 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19763 if (MaskVT == MVT::v64i1) {
19764 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19765 // In 32-bit mode a bitcast of i64 is illegal, so extend/split it.
19767 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19768 DAG.getConstant(0, dl, MVT::i32));
19769 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19770 DAG.getConstant(1, dl, MVT::i32));
19772 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19773 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19775 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19777 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case)
19778 // and bitcast it to the required type.
19779 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19780 return DAG.getBitcast(MaskVT,
19781 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19785 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19786 Mask.getSimpleValueType().getSizeInBits());
19787 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
19788 // are extracted by EXTRACT_SUBVECTOR.
19789 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19790 DAG.getBitcast(BitcastVT, Mask),
19791 DAG.getIntPtrConstant(0, dl));
19795 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19796 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19797 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19798 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19799 SDValue PreservedSrc,
19800 const X86Subtarget &Subtarget,
19801 SelectionDAG &DAG) {
19802 MVT VT = Op.getSimpleValueType();
19803 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19804 unsigned OpcodeSelect = ISD::VSELECT;
19807 if (isAllOnesConstant(Mask))
19810 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19812 switch (Op.getOpcode()) {
19815 case X86ISD::CMPM_RND:
19816 case X86ISD::CMPMU:
19817 case X86ISD::VPSHUFBITQMB:
19818 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19819 case X86ISD::VFPCLASS:
19820 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19821 case ISD::TRUNCATE:
19822 case X86ISD::VTRUNC:
19823 case X86ISD::VTRUNCS:
19824 case X86ISD::VTRUNCUS:
19825 case X86ISD::CVTPS2PH:
19826 // We can't use ISD::VSELECT here because it is not always "Legal"
19827 // for the destination type. For example, vpmovqb requires only AVX512,
19828 // while a vselect that can operate on byte elements requires BWI.
19829 OpcodeSelect = X86ISD::SELECT;
19832 if (PreservedSrc.isUndef())
19833 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19834 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19837 /// \brief Creates an SDNode for a predicated scalar operation.
19838 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19839 /// The mask is coming as MVT::i8 and it should be transformed
19840 /// to MVT::v1i1 while lowering masking intrinsics.
19841 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19842 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19843 /// for a scalar instruction.
19844 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19845 SDValue PreservedSrc,
19846 const X86Subtarget &Subtarget,
19847 SelectionDAG &DAG) {
19849 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19850 if (MaskConst->getZExtValue() & 0x1)
19853 MVT VT = Op.getSimpleValueType();
19856 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19857 if (Op.getOpcode() == X86ISD::FSETCCM ||
19858 Op.getOpcode() == X86ISD::FSETCCM_RND)
19859 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19860 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19861 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19863 if (PreservedSrc.isUndef())
19864 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19865 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19868 static int getSEHRegistrationNodeSize(const Function *Fn) {
19869 if (!Fn->hasPersonalityFn())
19870 report_fatal_error(
19871 "querying registration node size for function without personality");
19872 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19873 // WinEHStatePass for the full struct definition.
19874 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19875 case EHPersonality::MSVC_X86SEH: return 24;
19876 case EHPersonality::MSVC_CXX: return 16;
19879 report_fatal_error(
19880 "can only recover FP for 32-bit MSVC EH personality functions");
19883 /// When the MSVC runtime transfers control to us, either to an outlined
19884 /// function or when returning to a parent frame after catching an exception, we
19885 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19886 /// Here's the math:
19887 /// RegNodeBase = EntryEBP - RegNodeSize
19888 /// ParentFP = RegNodeBase - ParentFrameOffset
19889 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19890 /// subtracting the offset (negative on x86) takes us back to the parent FP.
19891 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19892 SDValue EntryEBP) {
19893 MachineFunction &MF = DAG.getMachineFunction();
19896 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19897 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19899 // It's possible that the parent function no longer has a personality function
19900 // if the exceptional code was optimized away, in which case we just return
19901 // the incoming EBP.
19902 if (!Fn->hasPersonalityFn())
19905 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19906 // registration, or the .set_setframe offset.
19907 MCSymbol *OffsetSym =
19908 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19909 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19910 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19911 SDValue ParentFrameOffset =
19912 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19914 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19915 // prologue to RBP in the parent function.
19916 const X86Subtarget &Subtarget =
19917 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19918 if (Subtarget.is64Bit())
19919 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19921 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19922 // RegNodeBase = EntryEBP - RegNodeSize
19923 // ParentFP = RegNodeBase - ParentFrameOffset
19924 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19925 DAG.getConstant(RegNodeSize, dl, PtrVT));
19926 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19929 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19930 SelectionDAG &DAG) const {
19931 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19932 auto isRoundModeCurDirection = [](SDValue Rnd) {
19933 if (!isa<ConstantSDNode>(Rnd))
19936 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19937 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19941 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19942 MVT VT = Op.getSimpleValueType();
19943 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19944 if (IntrData) {
19945 switch(IntrData->Type) {
19946 case INTR_TYPE_1OP:
19947 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19948 case INTR_TYPE_2OP:
19949 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19950 Op.getOperand(2));
19951 case INTR_TYPE_3OP:
19952 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19953 Op.getOperand(2), Op.getOperand(3));
19954 case INTR_TYPE_4OP:
19955 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19956 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
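// Note for the *_MASK cases below: getVectorMaskingNode / getScalarMaskingNode
// conceptually wrap the freshly built node Op in a per-element (or, for the
// scalar form, element-0) select against the mask, i.e.
//   Result = Mask ? Op : PassThru
// with an all-ones mask simply yielding Op.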
19957 case INTR_TYPE_1OP_MASK_RM: {
19958 SDValue Src = Op.getOperand(1);
19959 SDValue PassThru = Op.getOperand(2);
19960 SDValue Mask = Op.getOperand(3);
19961 SDValue RoundingMode;
19962 // We always add rounding mode to the Node.
19963 // If the rounding mode is not specified, we add the
19964 // "current direction" mode.
19965 if (Op.getNumOperands() == 4)
19966 RoundingMode =
19967 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19968 else
19969 RoundingMode = Op.getOperand(4);
19970 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19971 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19972 RoundingMode),
19973 Mask, PassThru, Subtarget, DAG);
19974 }
19975 case INTR_TYPE_1OP_MASK: {
19976 SDValue Src = Op.getOperand(1);
19977 SDValue PassThru = Op.getOperand(2);
19978 SDValue Mask = Op.getOperand(3);
19979 // We add rounding mode to the Node when
19980 // - RM Opcode is specified and
19981 // - RM is not "current direction".
19982 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19983 if (IntrWithRoundingModeOpcode != 0) {
19984 SDValue Rnd = Op.getOperand(4);
19985 if (!isRoundModeCurDirection(Rnd)) {
19986 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19987 dl, Op.getValueType(),
19988 Src, Rnd),
19989 Mask, PassThru, Subtarget, DAG);
19990 }
19991 }
19992 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19993 Mask, PassThru, Subtarget, DAG);
19994 }
19995 case INTR_TYPE_SCALAR_MASK: {
19996 SDValue Src1 = Op.getOperand(1);
19997 SDValue Src2 = Op.getOperand(2);
19998 SDValue passThru = Op.getOperand(3);
19999 SDValue Mask = Op.getOperand(4);
20000 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20001 // There are 2 kinds of intrinsics in this group:
20002 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20003 // (2) With rounding mode and sae - 7 operands.
20004 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20005 if (Op.getNumOperands() == (5U + HasRounding)) {
20007 SDValue Rnd = Op.getOperand(5);
20008 if (!isRoundModeCurDirection(Rnd))
20009 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20010 dl, VT, Src1, Src2, Rnd),
20011 Mask, passThru, Subtarget, DAG);
20013 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20014 Src2),
20015 Mask, passThru, Subtarget, DAG);
20016 }
20018 assert(Op.getNumOperands() == (6U + HasRounding) &&
20019 "Unexpected intrinsic form");
20020 SDValue RoundingMode = Op.getOperand(5);
20022 SDValue Sae = Op.getOperand(6);
20023 if (!isRoundModeCurDirection(Sae))
20024 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20025 dl, VT, Src1, Src2,
20026 RoundingMode, Sae),
20027 Mask, passThru, Subtarget, DAG);
20029 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20030 Src2, RoundingMode),
20031 Mask, passThru, Subtarget, DAG);
20032 }
20033 case INTR_TYPE_SCALAR_MASK_RM: {
20034 SDValue Src1 = Op.getOperand(1);
20035 SDValue Src2 = Op.getOperand(2);
20036 SDValue Src0 = Op.getOperand(3);
20037 SDValue Mask = Op.getOperand(4);
20038 // There are 2 kinds of intrinsics in this group:
20039 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20040 // (2) With rounding mode and sae - 7 operands.
20041 if (Op.getNumOperands() == 6) {
20042 SDValue Sae = Op.getOperand(5);
20043 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20044 Sae),
20045 Mask, Src0, Subtarget, DAG);
20046 }
20047 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20048 SDValue RoundingMode = Op.getOperand(5);
20049 SDValue Sae = Op.getOperand(6);
20050 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20051 RoundingMode, Sae),
20052 Mask, Src0, Subtarget, DAG);
20053 }
20054 case INTR_TYPE_2OP_MASK:
20055 case INTR_TYPE_2OP_IMM8_MASK: {
20056 SDValue Src1 = Op.getOperand(1);
20057 SDValue Src2 = Op.getOperand(2);
20058 SDValue PassThru = Op.getOperand(3);
20059 SDValue Mask = Op.getOperand(4);
20061 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
20062 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20064 // We specify 2 possible opcodes for intrinsics with rounding modes.
20065 // First, we check if the intrinsic may have non-default rounding mode,
20066 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20067 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20068 if (IntrWithRoundingModeOpcode != 0) {
20069 SDValue Rnd = Op.getOperand(5);
20070 if (!isRoundModeCurDirection(Rnd)) {
20071 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20072 dl, Op.getValueType(),
20073 Src1, Src2, Rnd),
20074 Mask, PassThru, Subtarget, DAG);
20075 }
20076 }
20077 // TODO: Intrinsics should have fast-math-flags to propagate.
20078 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
20079 Mask, PassThru, Subtarget, DAG);
20080 }
20081 case INTR_TYPE_2OP_MASK_RM: {
20082 SDValue Src1 = Op.getOperand(1);
20083 SDValue Src2 = Op.getOperand(2);
20084 SDValue PassThru = Op.getOperand(3);
20085 SDValue Mask = Op.getOperand(4);
20086 // We specify 2 possible modes for intrinsics, with/without rounding
20087 // mode.
20088 // First, we check if the intrinsic has a rounding mode (6 operands),
20089 // if not, we set the rounding mode to "current".
20090 SDValue Rnd;
20091 if (Op.getNumOperands() == 6)
20092 Rnd = Op.getOperand(5);
20093 else
20094 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20095 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20096 Src1, Src2, Rnd),
20097 Mask, PassThru, Subtarget, DAG);
20098 }
20099 case INTR_TYPE_3OP_SCALAR_MASK: {
20100 SDValue Src1 = Op.getOperand(1);
20101 SDValue Src2 = Op.getOperand(2);
20102 SDValue Src3 = Op.getOperand(3);
20103 SDValue PassThru = Op.getOperand(4);
20104 SDValue Mask = Op.getOperand(5);
20106 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20107 if (IntrWithRoundingModeOpcode != 0) {
20108 SDValue Rnd = Op.getOperand(6);
20109 if (!isRoundModeCurDirection(Rnd))
20110 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20111 dl, VT, Src1, Src2, Src3, Rnd),
20112 Mask, PassThru, Subtarget, DAG);
20113 }
20114 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20115 Src2, Src3),
20116 Mask, PassThru, Subtarget, DAG);
20117 }
20118 case INTR_TYPE_3OP_IMM8_MASK:
20119 case INTR_TYPE_3OP_MASK: {
20120 SDValue Src1 = Op.getOperand(1);
20121 SDValue Src2 = Op.getOperand(2);
20122 SDValue Src3 = Op.getOperand(3);
20123 SDValue PassThru = Op.getOperand(4);
20124 SDValue Mask = Op.getOperand(5);
20126 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
20127 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20129 // We specify 2 possible opcodes for intrinsics with rounding modes.
20130 // First, we check if the intrinsic may have non-default rounding mode,
20131 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20132 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20133 if (IntrWithRoundingModeOpcode != 0) {
20134 SDValue Rnd = Op.getOperand(6);
20135 if (!isRoundModeCurDirection(Rnd)) {
20136 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20137 dl, Op.getValueType(),
20138 Src1, Src2, Src3, Rnd),
20139 Mask, PassThru, Subtarget, DAG);
20140 }
20141 }
20142 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20143 Src1, Src2, Src3),
20144 Mask, PassThru, Subtarget, DAG);
20145 }
20146 case VPERM_2OP_MASK : {
20147 SDValue Src1 = Op.getOperand(1);
20148 SDValue Src2 = Op.getOperand(2);
20149 SDValue PassThru = Op.getOperand(3);
20150 SDValue Mask = Op.getOperand(4);
20152 // Swap Src1 and Src2 in the node creation
20153 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
20154 Mask, PassThru, Subtarget, DAG);
20155 }
20156 case VPERM_3OP_MASKZ:
20157 case VPERM_3OP_MASK:{
20158 MVT VT = Op.getSimpleValueType();
20159 // Src2 is the PassThru
20160 SDValue Src1 = Op.getOperand(1);
20161 // PassThru needs to be the same type as the destination in order
20162 // to pattern match correctly.
20163 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
20164 SDValue Src3 = Op.getOperand(3);
20165 SDValue Mask = Op.getOperand(4);
20166 SDValue PassThru = SDValue();
20168 // set PassThru element
20169 if (IntrData->Type == VPERM_3OP_MASKZ)
20170 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20171 else
20172 PassThru = Src2;
20174 // Swap Src1 and Src2 in the node creation
20175 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20176 dl, Op.getValueType(),
20177 Src2, Src1, Src3),
20178 Mask, PassThru, Subtarget, DAG);
20179 }
20180 case FMA_OP_MASKZ:
20181 case FMA_OP_MASK3:
20182 case FMA_OP_MASK: {
20183 SDValue Src1 = Op.getOperand(1);
20184 SDValue Src2 = Op.getOperand(2);
20185 SDValue Src3 = Op.getOperand(3);
20186 SDValue Mask = Op.getOperand(4);
20187 MVT VT = Op.getSimpleValueType();
20188 SDValue PassThru = SDValue();
20190 // set PassThru element
20191 if (IntrData->Type == FMA_OP_MASKZ)
20192 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20193 else if (IntrData->Type == FMA_OP_MASK3)
20194 PassThru = Src3;
20195 else
20196 PassThru = Src1;
20198 // We specify 2 possible opcodes for intrinsics with rounding modes.
20199 // First, we check if the intrinsic may have non-default rounding mode,
20200 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20201 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20202 if (IntrWithRoundingModeOpcode != 0) {
20203 SDValue Rnd = Op.getOperand(5);
20204 if (!isRoundModeCurDirection(Rnd))
20205 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20206 dl, Op.getValueType(),
20207 Src1, Src2, Src3, Rnd),
20208 Mask, PassThru, Subtarget, DAG);
20209 }
20210 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20211 dl, Op.getValueType(),
20212 Src1, Src2, Src3),
20213 Mask, PassThru, Subtarget, DAG);
20214 }
20215 case FMA_OP_SCALAR_MASK:
20216 case FMA_OP_SCALAR_MASK3:
20217 case FMA_OP_SCALAR_MASKZ: {
20218 SDValue Src1 = Op.getOperand(1);
20219 SDValue Src2 = Op.getOperand(2);
20220 SDValue Src3 = Op.getOperand(3);
20221 SDValue Mask = Op.getOperand(4);
20222 MVT VT = Op.getSimpleValueType();
20223 SDValue PassThru = SDValue();
20225 // set PassThru element
20226 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20227 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20228 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20229 PassThru = Src3;
20230 else
20231 PassThru = Src1;
20233 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20234 if (IntrWithRoundingModeOpcode != 0) {
20235 SDValue Rnd = Op.getOperand(5);
20236 if (!isRoundModeCurDirection(Rnd))
20237 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20238 Op.getValueType(), Src1, Src2,
20239 Src3, Rnd),
20240 Mask, PassThru, Subtarget, DAG);
20241 }
20243 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20244 Op.getValueType(), Src1, Src2,
20245 Src3),
20246 Mask, PassThru, Subtarget, DAG);
20247 }
20248 case IFMA_OP_MASKZ:
20249 case IFMA_OP_MASK: {
20250 SDValue Src1 = Op.getOperand(1);
20251 SDValue Src2 = Op.getOperand(2);
20252 SDValue Src3 = Op.getOperand(3);
20253 SDValue Mask = Op.getOperand(4);
20254 MVT VT = Op.getSimpleValueType();
20255 SDValue PassThru = Src1;
20257 // set PassThru element
20258 if (IntrData->Type == IFMA_OP_MASKZ)
20259 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20261 // Note: we need to swizzle the operands to pass the multiply operands
20262 // first.
20263 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20264 dl, Op.getValueType(),
20265 Src2, Src3, Src1),
20266 Mask, PassThru, Subtarget, DAG);
20267 }
20268 case TERLOG_OP_MASK:
20269 case TERLOG_OP_MASKZ: {
20270 SDValue Src1 = Op.getOperand(1);
20271 SDValue Src2 = Op.getOperand(2);
20272 SDValue Src3 = Op.getOperand(3);
20273 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
20274 SDValue Mask = Op.getOperand(5);
20275 MVT VT = Op.getSimpleValueType();
20276 SDValue PassThru = Src1;
20277 // Set PassThru element.
20278 if (IntrData->Type == TERLOG_OP_MASKZ)
20279 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20281 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20282 Src1, Src2, Src3, Src4),
20283 Mask, PassThru, Subtarget, DAG);
20284 }
20285 case CVTPD2PS:
20286 // ISD::FP_ROUND has a second argument that indicates if the truncation
20287 // does not change the value. Set it to 0 since it can change.
20288 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20289 DAG.getIntPtrConstant(0, dl));
20290 case CVTPD2PS_MASK: {
20291 SDValue Src = Op.getOperand(1);
20292 SDValue PassThru = Op.getOperand(2);
20293 SDValue Mask = Op.getOperand(3);
20294 // We add rounding mode to the Node when
20295 // - RM Opcode is specified and
20296 // - RM is not "current direction".
20297 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20298 if (IntrWithRoundingModeOpcode != 0) {
20299 SDValue Rnd = Op.getOperand(4);
20300 if (!isRoundModeCurDirection(Rnd)) {
20301 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20302 dl, Op.getValueType(),
20303 Src, Rnd),
20304 Mask, PassThru, Subtarget, DAG);
20305 }
20306 }
20307 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20308 // ISD::FP_ROUND has a second argument that indicates if the truncation
20309 // does not change the value. Set it to 0 since it can change.
20310 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20311 DAG.getIntPtrConstant(0, dl)),
20312 Mask, PassThru, Subtarget, DAG);
20313 }
20314 case FPCLASS: {
20315 // FPclass intrinsics with mask
20316 SDValue Src1 = Op.getOperand(1);
20317 MVT VT = Src1.getSimpleValueType();
20318 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20319 SDValue Imm = Op.getOperand(2);
20320 SDValue Mask = Op.getOperand(3);
20321 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20322 Mask.getSimpleValueType().getSizeInBits());
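// The vXi1 compare result is widened into BitcastVT (an i1 vector whose bit
// width matches the intrinsic's integer mask type) by inserting it into an
// undef vector, and then bitcast to the scalar return type below.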
20323 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20324 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
20325 Subtarget, DAG);
20326 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20327 DAG.getUNDEF(BitcastVT), FPclassMask,
20328 DAG.getIntPtrConstant(0, dl));
20329 return DAG.getBitcast(Op.getValueType(), Res);
20330 }
20331 case FPCLASSS: {
20332 SDValue Src1 = Op.getOperand(1);
20333 SDValue Imm = Op.getOperand(2);
20334 SDValue Mask = Op.getOperand(3);
20335 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20336 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20337 Subtarget, DAG);
20338 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
20339 DAG.getIntPtrConstant(0, dl));
20340 }
20341 case CMP_MASK:
20342 case CMP_MASK_CC: {
20343 // Comparison intrinsics with masks.
20344 // Example of transformation:
20345 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20346 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20347 // (i8 (bitcast
20348 // (v8i1 (insert_subvector undef,
20349 // (v2i1 (and (PCMPEQM %a, %b),
20350 // (extract_subvector
20351 // (v8i1 (bitcast %mask)), 0))), 0))))
20352 MVT VT = Op.getOperand(1).getSimpleValueType();
20353 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20354 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20355 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20356 Mask.getSimpleValueType().getSizeInBits());
20357 SDValue Cmp;
20358 if (IntrData->Type == CMP_MASK_CC) {
20359 SDValue CC = Op.getOperand(3);
20360 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20361 // We specify 2 possible opcodes for intrinsics with rounding modes.
20362 // First, we check if the intrinsic may have non-default rounding mode,
20363 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20364 if (IntrData->Opc1 != 0) {
20365 SDValue Rnd = Op.getOperand(5);
20366 if (!isRoundModeCurDirection(Rnd))
20367 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20368 Op.getOperand(2), CC, Rnd);
20369 }
20370 // default rounding mode
20371 if (!Cmp.getNode())
20372 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20373 Op.getOperand(2), CC);
20374 } else {
20376 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
20377 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20378 Op.getOperand(2));
20379 }
20380 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20381 Subtarget, DAG);
20382 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20383 DAG.getUNDEF(BitcastVT), CmpMask,
20384 DAG.getIntPtrConstant(0, dl));
20385 return DAG.getBitcast(Op.getValueType(), Res);
20387 case CMP_MASK_SCALAR_CC: {
20388 SDValue Src1 = Op.getOperand(1);
20389 SDValue Src2 = Op.getOperand(2);
20390 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20391 SDValue Mask = Op.getOperand(4);
20393 SDValue Cmp;
20394 if (IntrData->Opc1 != 0) {
20395 SDValue Rnd = Op.getOperand(5);
20396 if (!isRoundModeCurDirection(Rnd))
20397 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20398 }
20399 // default rounding mode
20400 if (!Cmp.getNode())
20401 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20403 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20404 Subtarget, DAG);
20405 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
20406 DAG.getIntPtrConstant(0, dl));
20408 case COMI: { // Comparison intrinsics
20409 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20410 SDValue LHS = Op.getOperand(1);
20411 SDValue RHS = Op.getOperand(2);
20412 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20413 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20414 SDValue SetCC;
20415 switch (CC) {
20416 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20417 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20418 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20419 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20420 break;
20421 }
20422 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20423 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20424 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20425 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20426 break;
20427 }
20428 case ISD::SETGT: // (CF = 0 and ZF = 0)
20429 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20430 break;
20431 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20432 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20433 break;
20434 }
20435 case ISD::SETGE: // CF = 0
20436 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20437 break;
20438 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20439 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20440 break;
20441 default:
20442 llvm_unreachable("Unexpected illegal condition!");
20443 }
20444 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20446 case COMI_RM: { // Comparison intrinsics with Sae
20447 SDValue LHS = Op.getOperand(1);
20448 SDValue RHS = Op.getOperand(2);
20449 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20450 SDValue Sae = Op.getOperand(4);
20452 SDValue FCmp;
20453 if (isRoundModeCurDirection(Sae))
20454 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20455 DAG.getConstant(CondVal, dl, MVT::i8));
20456 else
20457 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20458 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20459 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
20460 DAG.getIntPtrConstant(0, dl));
20461 }
20462 case VSHIFT:
20463 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20464 Op.getOperand(1), Op.getOperand(2), Subtarget,
20465 DAG);
20466 case COMPRESS_EXPAND_IN_REG: {
20467 SDValue Mask = Op.getOperand(3);
20468 SDValue DataToCompress = Op.getOperand(1);
20469 SDValue PassThru = Op.getOperand(2);
20470 if (isAllOnesConstant(Mask)) // return data as is
20471 return Op.getOperand(1);
20473 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20474 DataToCompress, PassThru),
20475 Mask, PassThru, Subtarget, DAG);
20478 MVT VT = Op.getSimpleValueType();
20479 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
20481 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
20482 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
20483 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
20484 return DAG.getBitcast(VT, Res);
20485 }
20486 case FIXUPIMM:
20487 case FIXUPIMMS_MASKZ:
20488 case FIXUPIMMS:
20489 case FIXUPIMM_MASKZ:{
20490 SDValue Src1 = Op.getOperand(1);
20491 SDValue Src2 = Op.getOperand(2);
20492 SDValue Src3 = Op.getOperand(3);
20493 SDValue Imm = Op.getOperand(4);
20494 SDValue Mask = Op.getOperand(5);
20495 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20496 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20497 // We specify 2 possible modes for intrinsics, with/without rounding
20498 // mode.
20499 // First, we check if the intrinsic has a rounding mode (7 operands),
20500 // if not, we set the rounding mode to "current".
20501 SDValue Rnd;
20502 if (Op.getNumOperands() == 7)
20503 Rnd = Op.getOperand(6);
20504 else
20505 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20506 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20507 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20508 Src1, Src2, Src3, Imm, Rnd),
20509 Mask, Passthru, Subtarget, DAG);
20510 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20511 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20512 Src1, Src2, Src3, Imm, Rnd),
20513 Mask, Passthru, Subtarget, DAG);
20514 }
20515 case ROUNDP: {
20516 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20517 // Clear the upper bits of the rounding immediate so that the legacy
20518 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20519 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20520 Op.getOperand(2),
20521 DAG.getConstant(0xf, dl, MVT::i32));
20522 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20523 Op.getOperand(1), RoundingMode);
20526 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20527 // Clear the upper bits of the rounding immediate so that the legacy
20528 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20529 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20530 Op.getOperand(3),
20531 DAG.getConstant(0xf, dl, MVT::i32));
20532 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20533 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20534 }
20535 default:
20536 break;
20537 }
20538 }
20540 switch (IntNo) {
20541 default: return SDValue(); // Don't custom lower most intrinsics.
20543 case Intrinsic::x86_avx2_permd:
20544 case Intrinsic::x86_avx2_permps:
20545 // Operands intentionally swapped. Mask is last operand to intrinsic,
20546 // but second operand for node/instruction.
20547 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20548 Op.getOperand(2), Op.getOperand(1));
20550 // ptest and testp intrinsics. The intrinsics these come from are designed to
20551 // return an integer value, not just an instruction, so lower them to the
20552 // ptest or testp pattern and a setcc for the result.
20553 case Intrinsic::x86_sse41_ptestz:
20554 case Intrinsic::x86_sse41_ptestc:
20555 case Intrinsic::x86_sse41_ptestnzc:
20556 case Intrinsic::x86_avx_ptestz_256:
20557 case Intrinsic::x86_avx_ptestc_256:
20558 case Intrinsic::x86_avx_ptestnzc_256:
20559 case Intrinsic::x86_avx_vtestz_ps:
20560 case Intrinsic::x86_avx_vtestc_ps:
20561 case Intrinsic::x86_avx_vtestnzc_ps:
20562 case Intrinsic::x86_avx_vtestz_pd:
20563 case Intrinsic::x86_avx_vtestc_pd:
20564 case Intrinsic::x86_avx_vtestnzc_pd:
20565 case Intrinsic::x86_avx_vtestz_ps_256:
20566 case Intrinsic::x86_avx_vtestc_ps_256:
20567 case Intrinsic::x86_avx_vtestnzc_ps_256:
20568 case Intrinsic::x86_avx_vtestz_pd_256:
20569 case Intrinsic::x86_avx_vtestc_pd_256:
20570 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20571 bool IsTestPacked = false;
20572 X86::CondCode X86CC;
20574 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20575 case Intrinsic::x86_avx_vtestz_ps:
20576 case Intrinsic::x86_avx_vtestz_pd:
20577 case Intrinsic::x86_avx_vtestz_ps_256:
20578 case Intrinsic::x86_avx_vtestz_pd_256:
20579 IsTestPacked = true;
20581 case Intrinsic::x86_sse41_ptestz:
20582 case Intrinsic::x86_avx_ptestz_256:
20584 X86CC = X86::COND_E;
20586 case Intrinsic::x86_avx_vtestc_ps:
20587 case Intrinsic::x86_avx_vtestc_pd:
20588 case Intrinsic::x86_avx_vtestc_ps_256:
20589 case Intrinsic::x86_avx_vtestc_pd_256:
20590 IsTestPacked = true;
20592 case Intrinsic::x86_sse41_ptestc:
20593 case Intrinsic::x86_avx_ptestc_256:
20595 X86CC = X86::COND_B;
20597 case Intrinsic::x86_avx_vtestnzc_ps:
20598 case Intrinsic::x86_avx_vtestnzc_pd:
20599 case Intrinsic::x86_avx_vtestnzc_ps_256:
20600 case Intrinsic::x86_avx_vtestnzc_pd_256:
20601 IsTestPacked = true;
20603 case Intrinsic::x86_sse41_ptestnzc:
20604 case Intrinsic::x86_avx_ptestnzc_256:
20606 X86CC = X86::COND_A;
20610 SDValue LHS = Op.getOperand(1);
20611 SDValue RHS = Op.getOperand(2);
20612 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20613 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20614 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20615 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20618 case Intrinsic::x86_sse42_pcmpistria128:
20619 case Intrinsic::x86_sse42_pcmpestria128:
20620 case Intrinsic::x86_sse42_pcmpistric128:
20621 case Intrinsic::x86_sse42_pcmpestric128:
20622 case Intrinsic::x86_sse42_pcmpistrio128:
20623 case Intrinsic::x86_sse42_pcmpestrio128:
20624 case Intrinsic::x86_sse42_pcmpistris128:
20625 case Intrinsic::x86_sse42_pcmpestris128:
20626 case Intrinsic::x86_sse42_pcmpistriz128:
20627 case Intrinsic::x86_sse42_pcmpestriz128: {
20628 unsigned Opcode;
20629 X86::CondCode X86CC;
20630 switch (IntNo) {
20631 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20632 case Intrinsic::x86_sse42_pcmpistria128:
20633 Opcode = X86ISD::PCMPISTRI;
20634 X86CC = X86::COND_A;
20635 break;
20636 case Intrinsic::x86_sse42_pcmpestria128:
20637 Opcode = X86ISD::PCMPESTRI;
20638 X86CC = X86::COND_A;
20639 break;
20640 case Intrinsic::x86_sse42_pcmpistric128:
20641 Opcode = X86ISD::PCMPISTRI;
20642 X86CC = X86::COND_B;
20643 break;
20644 case Intrinsic::x86_sse42_pcmpestric128:
20645 Opcode = X86ISD::PCMPESTRI;
20646 X86CC = X86::COND_B;
20647 break;
20648 case Intrinsic::x86_sse42_pcmpistrio128:
20649 Opcode = X86ISD::PCMPISTRI;
20650 X86CC = X86::COND_O;
20651 break;
20652 case Intrinsic::x86_sse42_pcmpestrio128:
20653 Opcode = X86ISD::PCMPESTRI;
20654 X86CC = X86::COND_O;
20655 break;
20656 case Intrinsic::x86_sse42_pcmpistris128:
20657 Opcode = X86ISD::PCMPISTRI;
20658 X86CC = X86::COND_S;
20659 break;
20660 case Intrinsic::x86_sse42_pcmpestris128:
20661 Opcode = X86ISD::PCMPESTRI;
20662 X86CC = X86::COND_S;
20663 break;
20664 case Intrinsic::x86_sse42_pcmpistriz128:
20665 Opcode = X86ISD::PCMPISTRI;
20666 X86CC = X86::COND_E;
20667 break;
20668 case Intrinsic::x86_sse42_pcmpestriz128:
20669 Opcode = X86ISD::PCMPESTRI;
20670 X86CC = X86::COND_E;
20671 break;
20672 }
20673 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20674 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20675 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20676 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20677 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20680 case Intrinsic::x86_sse42_pcmpistri128:
20681 case Intrinsic::x86_sse42_pcmpestri128: {
20682 unsigned Opcode;
20683 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20684 Opcode = X86ISD::PCMPISTRI;
20685 else
20686 Opcode = X86ISD::PCMPESTRI;
20688 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20689 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20690 return DAG.getNode(Opcode, dl, VTs, NewOps);
20693 case Intrinsic::eh_sjlj_lsda: {
20694 MachineFunction &MF = DAG.getMachineFunction();
20695 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20696 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20697 auto &Context = MF.getMMI().getContext();
20698 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20699 Twine(MF.getFunctionNumber()));
20700 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
20701 DAG.getMCSymbol(S, PtrVT));
20704 case Intrinsic::x86_seh_lsda: {
20705 // Compute the symbol for the LSDA. We know it'll get emitted later.
20706 MachineFunction &MF = DAG.getMachineFunction();
20707 SDValue Op1 = Op.getOperand(1);
20708 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20709 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20710 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20712 // Generate a simple absolute symbol reference. This intrinsic is only
20713 // supported on 32-bit Windows, which isn't PIC.
20714 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20715 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20718 case Intrinsic::x86_seh_recoverfp: {
20719 SDValue FnOp = Op.getOperand(1);
20720 SDValue IncomingFPOp = Op.getOperand(2);
20721 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
20722 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
20723 if (!Fn)
20724 report_fatal_error(
20725 "llvm.x86.seh.recoverfp must take a function as the first argument");
20726 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20729 case Intrinsic::localaddress: {
20730 // Returns one of the stack, base, or frame pointer registers, depending on
20731 // which is used to reference local variables.
20732 MachineFunction &MF = DAG.getMachineFunction();
20733 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20734 unsigned Reg;
20735 if (RegInfo->hasBasePointer(MF))
20736 Reg = RegInfo->getBaseRegister();
20737 else // This function handles the SP or FP case.
20738 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20739 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
20740 }
20741 }
20742 }
20744 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20745 SDValue Src, SDValue Mask, SDValue Base,
20746 SDValue Index, SDValue ScaleOp, SDValue Chain,
20747 const X86Subtarget &Subtarget) {
20748 SDLoc dl(Op);
20749 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20750 // Scale must be constant.
20751 if (!C)
20752 return SDValue();
20753 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20754 EVT MaskVT = Mask.getValueType();
20755 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20756 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20757 SDValue Segment = DAG.getRegister(0, MVT::i32);
20758 // If source is undef or we know it won't be used, use a zero vector
20759 // to break register dependency.
20760 // TODO: use undef instead and let BreakFalseDeps deal with it?
20761 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20762 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20763 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20764 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20765 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20766 return DAG.getMergeValues(RetOps, dl);
20769 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20770 SDValue Src, SDValue Mask, SDValue Base,
20771 SDValue Index, SDValue ScaleOp, SDValue Chain,
20772 const X86Subtarget &Subtarget) {
20773 SDLoc dl(Op);
20774 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20775 // Scale must be constant.
20776 if (!C)
20777 return SDValue();
20778 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20779 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20780 Index.getSimpleValueType().getVectorNumElements());
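// Unlike the AVX2 gather above, which consumes the mask as a plain vector
// operand, the AVX-512 form wants a vXi1 mask register, so the incoming
// integer mask is converted via getMaskNode first.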
20782 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20783 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20784 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20785 SDValue Segment = DAG.getRegister(0, MVT::i32);
20786 // If source is undef or we know it won't be used, use a zero vector
20787 // to break register dependency.
20788 // TODO: use undef instead and let BreakFalseDeps deal with it?
20789 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20790 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20791 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20792 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20793 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20794 return DAG.getMergeValues(RetOps, dl);
20797 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20798 SDValue Src, SDValue Mask, SDValue Base,
20799 SDValue Index, SDValue ScaleOp, SDValue Chain,
20800 const X86Subtarget &Subtarget) {
20801 SDLoc dl(Op);
20802 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20803 // Scale must be constant.
20804 if (!C)
20805 return SDValue();
20806 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20807 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20808 SDValue Segment = DAG.getRegister(0, MVT::i32);
20809 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20810 Index.getSimpleValueType().getVectorNumElements());
20812 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20813 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20814 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20815 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
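// The scatter machine node only defines the updated mask and the chain; the
// data being stored travels as an operand, so value #1 (the chain) is all the
// caller needs.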
20816 return SDValue(Res, 1);
20819 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20820 SDValue Mask, SDValue Base, SDValue Index,
20821 SDValue ScaleOp, SDValue Chain,
20822 const X86Subtarget &Subtarget) {
20824 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
20825 // Scale must be constant.
20828 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20829 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20830 SDValue Segment = DAG.getRegister(0, MVT::i32);
20832 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20833 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20834 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20835 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20836 return SDValue(Res, 0);
20839 /// Handles the lowering of builtin intrinsics that return the value
20840 /// of the extended control register.
20841 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
20842 SelectionDAG &DAG,
20843 const X86Subtarget &Subtarget,
20844 SmallVectorImpl<SDValue> &Results) {
20845 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20846 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20849 // The ECX register is used to select the index of the XCR register to
20850 // return.
20851 SDValue Chain =
20852 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20853 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20854 Chain = SDValue(N1, 0);
20855 SDValue LO, HI;
20856 // Reads the content of XCR and returns it in registers EDX:EAX.
20857 if (Subtarget.is64Bit()) {
20858 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20859 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20860 LO.getValue(2));
20861 } else {
20862 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20863 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20864 LO.getValue(2));
20865 }
20866 Chain = HI.getValue(1);
20868 if (Subtarget.is64Bit()) {
20869 // Merge the two 32-bit values into a 64-bit one.
20870 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20871 DAG.getConstant(32, DL, MVT::i8));
20872 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20873 Results.push_back(Chain);
20874 return;
20875 }
20877 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20878 SDValue Ops[] = { LO, HI };
20879 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20880 Results.push_back(Pair);
20881 Results.push_back(Chain);
20884 /// Handles the lowering of builtin intrinsics that read performance monitor
20885 /// counters (x86_rdpmc).
20886 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
20887 SelectionDAG &DAG,
20888 const X86Subtarget &Subtarget,
20889 SmallVectorImpl<SDValue> &Results) {
20890 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20891 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20894 // The ECX register is used to select the index of the performance counter
20896 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
20897 N->getOperand(2));
20898 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20900 // Reads the content of a 64-bit performance counter and returns it in the
20901 // registers EDX:EAX.
20902 if (Subtarget.is64Bit()) {
20903 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20904 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20905 LO.getValue(2));
20906 } else {
20907 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20908 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20909 LO.getValue(2));
20910 }
20911 Chain = HI.getValue(1);
20913 if (Subtarget.is64Bit()) {
20914 // The EAX register is loaded with the low-order 32 bits. The EDX register
20915 // is loaded with the supported high-order bits of the counter.
20916 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20917 DAG.getConstant(32, DL, MVT::i8));
20918 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20919 Results.push_back(Chain);
20923 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20924 SDValue Ops[] = { LO, HI };
20925 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20926 Results.push_back(Pair);
20927 Results.push_back(Chain);
20930 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20931 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20932 /// READCYCLECOUNTER nodes.
20933 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
20934 SelectionDAG &DAG,
20935 const X86Subtarget &Subtarget,
20936 SmallVectorImpl<SDValue> &Results) {
20937 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20938 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
20939 SDValue LO, HI;
20941 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
20942 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20943 // and the EAX register is loaded with the low-order 32 bits.
20944 if (Subtarget.is64Bit()) {
20945 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20946 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20947 LO.getValue(2));
20948 } else {
20949 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20950 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20951 LO.getValue(2));
20952 }
20953 SDValue Chain = HI.getValue(1);
20955 if (Opcode == X86ISD::RDTSCP_DAG) {
20956 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20958 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20959 // the ECX register. Add 'ecx' explicitly to the chain.
20960 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
20961 HI.getValue(2));
20962 // Explicitly store the content of ECX at the location passed in input
20963 // to the 'rdtscp' intrinsic.
20964 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20965 MachinePointerInfo());
20966 }
20968 if (Subtarget.is64Bit()) {
20969 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20970 // the EAX register is loaded with the low-order 32 bits.
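// In other words, the 64-bit result is (HI << 32) | LO, as computed below.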
20971 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20972 DAG.getConstant(32, DL, MVT::i8));
20973 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20974 Results.push_back(Chain);
20978 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20979 SDValue Ops[] = { LO, HI };
20980 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20981 Results.push_back(Pair);
20982 Results.push_back(Chain);
20985 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20986 SelectionDAG &DAG) {
20987 SmallVector<SDValue, 2> Results;
20989 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
20991 return DAG.getMergeValues(Results, DL);
20994 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20995 MachineFunction &MF = DAG.getMachineFunction();
20996 SDValue Chain = Op.getOperand(0);
20997 SDValue RegNode = Op.getOperand(2);
20998 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20999 if (!EHInfo)
21000 report_fatal_error("EH registrations only live in functions using WinEH");
21002 // Cast the operand to an alloca, and remember the frame index.
21003 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21004 if (!FINode)
21005 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21006 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21008 // Return the chain operand without making any DAG nodes.
21012 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21013 MachineFunction &MF = DAG.getMachineFunction();
21014 SDValue Chain = Op.getOperand(0);
21015 SDValue EHGuard = Op.getOperand(2);
21016 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21017 if (!EHInfo)
21018 report_fatal_error("EHGuard only live in functions using WinEH");
21020 // Cast the operand to an alloca, and remember the frame index.
21021 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21022 if (!FINode)
21023 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21024 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21026 // Return the chain operand without making any DAG nodes.
21030 /// Emit Truncating Store with signed or unsigned saturation.
21031 static SDValue
21032 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21033 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21034 SelectionDAG &DAG) {
21036 SDVTList VTs = DAG.getVTList(MVT::Other);
21037 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21038 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21039 return SignedSat ?
21040 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21041 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21044 /// Emit Masked Truncating Store with signed or unsigned saturation.
21045 static SDValue
21046 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21047 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21048 MachineMemOperand *MMO, SelectionDAG &DAG) {
21050 SDVTList VTs = DAG.getVTList(MVT::Other);
21051 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21052 return SignedSat ?
21053 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21054 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21057 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21058 SelectionDAG &DAG) {
21059 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21060 SDLoc dl(Op);
21061 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21062 if (!IntrData) {
21063 switch (IntNo) {
21064 case llvm::Intrinsic::x86_seh_ehregnode:
21065 return MarkEHRegistrationNode(Op, DAG);
21066 case llvm::Intrinsic::x86_seh_ehguard:
21067 return MarkEHGuard(Op, DAG);
21068 case llvm::Intrinsic::x86_flags_read_u32:
21069 case llvm::Intrinsic::x86_flags_read_u64:
21070 case llvm::Intrinsic::x86_flags_write_u32:
21071 case llvm::Intrinsic::x86_flags_write_u64: {
21072 // We need a frame pointer because this will get lowered to a PUSH/POP
21073 // sequence.
21074 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21075 MFI.setHasCopyImplyingStackAdjustment(true);
21076 // Don't do anything here, we will expand these intrinsics out later
21077 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21078 return SDValue();
21079 }
21080 case Intrinsic::x86_lwpins32:
21081 case Intrinsic::x86_lwpins64: {
21083 SDValue Chain = Op->getOperand(0);
21084 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21085 SDValue LwpIns =
21086 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
21087 Op->getOperand(3), Op->getOperand(4));
21088 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
21089 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21090 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21091 LwpIns.getValue(1));
21098 switch(IntrData->Type) {
21099 default: llvm_unreachable("Unknown Intrinsic Type");
21102 // Emit the node with the right value type.
21103 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21104 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21106 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21107 // Otherwise return the value from Rand, which is always 0, casted to i32.
21108 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21109 DAG.getConstant(1, dl, Op->getValueType(1)),
21110 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21111 SDValue(Result.getNode(), 1) };
21112 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21114 // Return { result, isValid, chain }.
21115 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21116 SDValue(Result.getNode(), 2));
21118 case GATHER_AVX2: {
21119 SDValue Chain = Op.getOperand(0);
21120 SDValue Src = Op.getOperand(2);
21121 SDValue Base = Op.getOperand(3);
21122 SDValue Index = Op.getOperand(4);
21123 SDValue Mask = Op.getOperand(5);
21124 SDValue Scale = Op.getOperand(6);
21125 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21126 Scale, Chain, Subtarget);
21127 }
21128 case GATHER: {
21129 //gather(v1, mask, index, base, scale);
21130 SDValue Chain = Op.getOperand(0);
21131 SDValue Src = Op.getOperand(2);
21132 SDValue Base = Op.getOperand(3);
21133 SDValue Index = Op.getOperand(4);
21134 SDValue Mask = Op.getOperand(5);
21135 SDValue Scale = Op.getOperand(6);
21136 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21137 Chain, Subtarget);
21138 }
21139 case SCATTER: {
21140 //scatter(base, mask, index, v1, scale);
21141 SDValue Chain = Op.getOperand(0);
21142 SDValue Base = Op.getOperand(2);
21143 SDValue Mask = Op.getOperand(3);
21144 SDValue Index = Op.getOperand(4);
21145 SDValue Src = Op.getOperand(5);
21146 SDValue Scale = Op.getOperand(6);
21147 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21148 Scale, Chain, Subtarget);
21149 }
21150 case PREFETCH: {
21151 SDValue Hint = Op.getOperand(6);
21152 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21153 assert((HintVal == 2 || HintVal == 3) &&
21154 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21155 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21156 SDValue Chain = Op.getOperand(0);
21157 SDValue Mask = Op.getOperand(2);
21158 SDValue Index = Op.getOperand(3);
21159 SDValue Base = Op.getOperand(4);
21160 SDValue Scale = Op.getOperand(5);
21161 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21162 Subtarget);
21163 }
21164 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21166 SmallVector<SDValue, 2> Results;
21167 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21169 return DAG.getMergeValues(Results, dl);
21171 // Read Performance Monitoring Counters.
21173 SmallVector<SDValue, 2> Results;
21174 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21175 return DAG.getMergeValues(Results, dl);
21177 // Get Extended Control Register.
21179 SmallVector<SDValue, 2> Results;
21180 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21181 return DAG.getMergeValues(Results, dl);
21183 // XTEST intrinsics.
21185 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21186 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21188 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21189 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21190 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21191 Ret, SDValue(InTrans.getNode(), 1));
21192 }
21194 case ADX: {
21195 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21196 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
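// Re-materialize the incoming carry (operand 2) in EFLAGS: adding -1 to an
// i8 value sets CF exactly when that value is non-zero, which is what the
// ADC-style node below consumes via GenCF.getValue(1).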
21197 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21198 DAG.getConstant(-1, dl, MVT::i8));
21199 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21200 Op.getOperand(4), GenCF.getValue(1));
21201 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21202 Op.getOperand(5), MachinePointerInfo());
21203 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21204 SDValue Results[] = { SetCC, Store };
21205 return DAG.getMergeValues(Results, dl);
21207 case COMPRESS_TO_MEM: {
21208 SDValue Mask = Op.getOperand(4);
21209 SDValue DataToCompress = Op.getOperand(3);
21210 SDValue Addr = Op.getOperand(2);
21211 SDValue Chain = Op.getOperand(0);
21212 MVT VT = DataToCompress.getSimpleValueType();
21214 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21215 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21217 if (isAllOnesConstant(Mask)) // return just a store
21218 return DAG.getStore(Chain, dl, DataToCompress, Addr,
21219 MemIntr->getMemOperand());
21221 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21222 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21224 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
21225 MemIntr->getMemOperand(),
21226 false /* truncating */, true /* compressing */);
21228 case TRUNCATE_TO_MEM_VI8:
21229 case TRUNCATE_TO_MEM_VI16:
21230 case TRUNCATE_TO_MEM_VI32: {
21231 SDValue Mask = Op.getOperand(4);
21232 SDValue DataToTruncate = Op.getOperand(3);
21233 SDValue Addr = Op.getOperand(2);
21234 SDValue Chain = Op.getOperand(0);
21236 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21237 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21239 EVT MemVT = MemIntr->getMemoryVT();
21241 uint16_t TruncationOp = IntrData->Opc0;
21242 switch (TruncationOp) {
21243 case X86ISD::VTRUNC: {
21244 if (isAllOnesConstant(Mask)) // return just a truncate store
21245 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21246 MemIntr->getMemOperand());
21248 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21249 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21251 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21252 MemIntr->getMemOperand(), true /* truncating */);
21254 case X86ISD::VTRUNCUS:
21255 case X86ISD::VTRUNCS: {
21256 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21257 if (isAllOnesConstant(Mask))
21258 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21259 MemIntr->getMemOperand(), DAG);
21261 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21262 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21264 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21265 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21268 llvm_unreachable("Unsupported truncstore intrinsic");
21272 case EXPAND_FROM_MEM: {
21273 SDValue Mask = Op.getOperand(4);
21274 SDValue PassThru = Op.getOperand(3);
21275 SDValue Addr = Op.getOperand(2);
21276 SDValue Chain = Op.getOperand(0);
21277 MVT VT = Op.getSimpleValueType();
21279 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21280 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21282 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
21283 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
21284 if (X86::isZeroNode(Mask))
21285 return DAG.getUNDEF(VT);
21287 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21288 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21289 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
21290 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
21291 true /* expanding */);
21292 }
21293 }
21294 }
21296 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21297 SelectionDAG &DAG) const {
21298 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21299 MFI.setReturnAddressIsTaken(true);
21301 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21302 return SDValue();
21304 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21306 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21308 if (Depth > 0) {
21309 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
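// The return address for the selected frame sits one pointer slot above that
// frame's saved frame pointer, hence the SlotSize offset below.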
21310 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21311 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21312 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21313 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21314 MachinePointerInfo());
21317 // Just load the return address.
21318 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21319 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21320 MachinePointerInfo());
21323 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21324 SelectionDAG &DAG) const {
21325 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21326 return getReturnAddressFrameIndex(DAG);
21329 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21330 MachineFunction &MF = DAG.getMachineFunction();
21331 MachineFrameInfo &MFI = MF.getFrameInfo();
21332 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21333 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21334 EVT VT = Op.getValueType();
21336 MFI.setFrameAddressIsTaken(true);
21338 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21339 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21340 // is not possible to crawl up the stack without looking at the unwind codes
21341 // at the same time.
21342 int FrameAddrIndex = FuncInfo->getFAIndex();
21343 if (!FrameAddrIndex) {
21344 // Set up a frame object for the return address.
21345 unsigned SlotSize = RegInfo->getSlotSize();
21346 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21347 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21348 FuncInfo->setFAIndex(FrameAddrIndex);
21350 return DAG.getFrameIndex(FrameAddrIndex, VT);
21353 unsigned FrameReg =
21354 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21355 SDLoc dl(Op); // FIXME probably not meaningful
21356 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21357 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21358 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21359 "Invalid Frame Register!");
21360 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21361 while (Depth--)
21362 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21363 MachinePointerInfo());
21364 return FrameAddr;
21365 }
21367 // FIXME? Maybe this could be a TableGen attribute on some registers and
21368 // this table could be generated automatically from RegInfo.
21369 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21370 SelectionDAG &DAG) const {
21371 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21372 const MachineFunction &MF = DAG.getMachineFunction();
21374 unsigned Reg = StringSwitch<unsigned>(RegName)
21375 .Case("esp", X86::ESP)
21376 .Case("rsp", X86::RSP)
21377 .Case("ebp", X86::EBP)
21378 .Case("rbp", X86::RBP)
21381 if (Reg == X86::EBP || Reg == X86::RBP) {
21382 if (!TFI.hasFP(MF))
21383 report_fatal_error("register " + StringRef(RegName) +
21384 " is allocatable: function has no frame pointer");
21387 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21388 unsigned FrameReg =
21389 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21390 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21391 "Invalid Frame Register!");
21399 report_fatal_error("Invalid register name global variable");
21402 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21403 SelectionDAG &DAG) const {
21404 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21405 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21408 unsigned X86TargetLowering::getExceptionPointerRegister(
21409 const Constant *PersonalityFn) const {
21410 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21411 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21413 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21416 unsigned X86TargetLowering::getExceptionSelectorRegister(
21417 const Constant *PersonalityFn) const {
21418 // Funclet personalities don't use selectors (the runtime does the selection).
21419 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21420 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21423 bool X86TargetLowering::needsFixedCatchObjects() const {
21424 return Subtarget.isTargetWin64();
21427 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21428 SDValue Chain = Op.getOperand(0);
21429 SDValue Offset = Op.getOperand(1);
21430 SDValue Handler = Op.getOperand(2);
21432 SDLoc dl(Op);
21433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21434 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21435 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21436 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21437 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21438 "Invalid Frame Register!");
21439 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21440 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
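// The handler address is written over the slot one pointer above the frame
// pointer (normally the return-address slot), further adjusted by Offset, and
// that address is handed to EH_RETURN in ECX/RCX.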
21442 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21443 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21444 dl));
21445 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21446 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21447 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21449 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21450 DAG.getRegister(StoreAddrReg, PtrVT));
21453 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21454 SelectionDAG &DAG) const {
21456 // If the subtarget is not 64-bit, we may need the global base reg
21457 // after isel expands the pseudo, i.e., after the CGBR pass has run.
21458 // Therefore, ask for the GlobalBaseReg now, so that the pass
21459 // inserts the code for us in case we need it.
21460 // Otherwise, we would end up referencing a virtual register
21461 // that is not defined!
21462 if (!Subtarget.is64Bit()) {
21463 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21464 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21466 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21467 DAG.getVTList(MVT::i32, MVT::Other),
21468 Op.getOperand(0), Op.getOperand(1));
21471 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21472 SelectionDAG &DAG) const {
21474 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21475 Op.getOperand(0), Op.getOperand(1));
21478 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21479 SelectionDAG &DAG) const {
21481 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
21485 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21486 return Op.getOperand(0);
21489 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21490 SelectionDAG &DAG) const {
21491 SDValue Root = Op.getOperand(0);
21492 SDValue Trmp = Op.getOperand(1); // trampoline
21493 SDValue FPtr = Op.getOperand(2); // nested function
21494 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21497 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21498 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21500 if (Subtarget.is64Bit()) {
21501 SDValue OutChains[6];
21503 // Large code-model.
21504 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21505 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21507 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21508 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21510 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
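// For reference (derived from the constants above and the stores below), the
// little-endian i16 stores write the REX byte first, so the 23-byte trampoline
// is laid out as:
//   offset  0: 49 BB <imm64 FPtr>   movabsq $FPtr, %r11
//   offset 10: 49 BA <imm64 Nest>   movabsq $Nest, %r10
//   offset 20: 49 FF E3             jmpq    *%r11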
21512 // Load the pointer to the nested function into R11.
21513 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21514 SDValue Addr = Trmp;
21515 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21516 Addr, MachinePointerInfo(TrmpAddr));
21518 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21519 DAG.getConstant(2, dl, MVT::i64));
21521 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21522 /* Alignment = */ 2);
21524 // Load the 'nest' parameter value into R10.
21525 // R10 is specified in X86CallingConv.td
21526 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21527 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21528 DAG.getConstant(10, dl, MVT::i64));
21529 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21530 Addr, MachinePointerInfo(TrmpAddr, 10));
21532 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21533 DAG.getConstant(12, dl, MVT::i64));
21535 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21536 /* Alignment = */ 2);
21538 // Jump to the nested function.
21539 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21540 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21541 DAG.getConstant(20, dl, MVT::i64));
21542 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21543 Addr, MachinePointerInfo(TrmpAddr, 20));
21545 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21546 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21547 DAG.getConstant(22, dl, MVT::i64));
21548 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21549 Addr, MachinePointerInfo(TrmpAddr, 22));
21551 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21553 const Function *Func =
21554 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21555 CallingConv::ID CC = Func->getCallingConv();
21560 llvm_unreachable("Unsupported calling convention");
21561 case CallingConv::C:
21562 case CallingConv::X86_StdCall: {
21563 // Pass 'nest' parameter in ECX.
21564 // Must be kept in sync with X86CallingConv.td
21565 NestReg = X86::ECX;
21567 // Check that ECX wasn't needed by an 'inreg' parameter.
21568 FunctionType *FTy = Func->getFunctionType();
21569 const AttributeList &Attrs = Func->getAttributes();
21571 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21572 unsigned InRegCount = 0;
21575 for (FunctionType::param_iterator I = FTy->param_begin(),
21576 E = FTy->param_end(); I != E; ++I, ++Idx)
21577 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21578 auto &DL = DAG.getDataLayout();
21579 // FIXME: should only count parameters that are lowered to integers.
21580 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21583 if (InRegCount > 2) {
21584 report_fatal_error("Nest register in use - reduce number of inreg"
21590 case CallingConv::X86_FastCall:
21591 case CallingConv::X86_ThisCall:
21592 case CallingConv::Fast:
21593 // Pass 'nest' parameter in EAX.
21594 // Must be kept in sync with X86CallingConv.td
21595 NestReg = X86::EAX;
21599 SDValue OutChains[4];
21600 SDValue Addr, Disp;
21602 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21603 DAG.getConstant(10, dl, MVT::i32));
21604 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21606 // This is storing the opcode for MOV32ri.
21607 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21608 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21610 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21611 Trmp, MachinePointerInfo(TrmpAddr));
21613 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21614 DAG.getConstant(1, dl, MVT::i32));
21616 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21617 /* Alignment = */ 1);
21619 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21620 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21621 DAG.getConstant(5, dl, MVT::i32));
21622 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21623 Addr, MachinePointerInfo(TrmpAddr, 5),
21624 /* Alignment = */ 1);
21626 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21627 DAG.getConstant(6, dl, MVT::i32));
21629 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21630 /* Alignment = */ 1);
21632 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21636 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21637 SelectionDAG &DAG) const {
21639 /* The rounding mode is in bits 11:10 of FPSR, and has the following
21640 settings:
21641 00 Round to nearest
21642 01 Round to -inf
21643 10 Round to +inf
21644 11 Round to 0
21646 FLT_ROUNDS, on the other hand, expects the following:
21647 -1 Undefined
21648 0 Round to 0
21649 1 Round to nearest
21650 2 Round to +inf
21651 3 Round to -inf
21653 To perform the conversion, we do:
21654 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3) */
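// For example, with FPSR rounding control bits 11:10 = 01 (round toward -inf):
// ((0 >> 11) | (0x400 >> 9)) = 2, then (2 + 1) & 3 = 3, which is the
// FLT_ROUNDS encoding for round toward -inf.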
21657 MachineFunction &MF = DAG.getMachineFunction();
21658 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21659 unsigned StackAlignment = TFI.getStackAlignment();
21660 MVT VT = Op.getSimpleValueType();
21663 // Save FP Control Word to stack slot
21664 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21665 SDValue StackSlot =
21666 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21668 MachineMemOperand *MMO =
21669 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21670 MachineMemOperand::MOStore, 2, 2);
21672 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21673 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21674 DAG.getVTList(MVT::Other),
21675 Ops, MVT::i16, MMO);
21677 // Load FP Control Word from stack slot
21679 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21681 // Transform as necessary
21683 DAG.getNode(ISD::SRL, DL, MVT::i16,
21684 DAG.getNode(ISD::AND, DL, MVT::i16,
21685 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21686 DAG.getConstant(11, DL, MVT::i8));
21688 DAG.getNode(ISD::SRL, DL, MVT::i16,
21689 DAG.getNode(ISD::AND, DL, MVT::i16,
21690 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21691 DAG.getConstant(9, DL, MVT::i8));
21694 DAG.getNode(ISD::AND, DL, MVT::i16,
21695 DAG.getNode(ISD::ADD, DL, MVT::i16,
21696 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21697 DAG.getConstant(1, DL, MVT::i16)),
21698 DAG.getConstant(3, DL, MVT::i16));
21700 return DAG.getNode((VT.getSizeInBits() < 16 ?
21701 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
21704 // Split a unary integer op into 2 half-sized ops.
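// E.g. (v32i8 ctlz X) becomes
//   (v32i8 concat_vectors (v16i8 ctlz (lower half of X)),
//                         (v16i8 ctlz (upper half of X))).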
21705 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21706 MVT VT = Op.getSimpleValueType();
21707 unsigned NumElems = VT.getVectorNumElements();
21708 unsigned SizeInBits = VT.getSizeInBits();
21710 // Extract the Lo/Hi vectors
21712 SDValue Src = Op.getOperand(0);
21713 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21714 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21716 MVT EltVT = VT.getVectorElementType();
21717 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21718 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21719 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21720 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21723 // Decompose 256-bit ops into smaller 128-bit ops.
21724 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21725 assert(Op.getSimpleValueType().is256BitVector() &&
21726 Op.getSimpleValueType().isInteger() &&
21727 "Only handle AVX 256-bit vector integer operation");
21728 return LowerVectorIntUnary(Op, DAG);
21731 // Decompose 512-bit ops into smaller 256-bit ops.
21732 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21733 assert(Op.getSimpleValueType().is512BitVector() &&
21734 Op.getSimpleValueType().isInteger() &&
21735 "Only handle AVX 512-bit vector integer operation");
21736 return LowerVectorIntUnary(Op, DAG);
21739 /// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction.
21741 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
21742 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
21743 // split the vector, perform the operation on its Lo and Hi parts and
21744 // concatenate the results.
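// E.g. for an i8 element with value 0x10: zext32 gives 0x00000010, vplzcntd
// counts 27 leading zeros, and subtracting Delta = 32 - 8 = 24 yields 3, the
// correct i8 CTLZ result.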
21745 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
21746 const X86Subtarget &Subtarget) {
21747 assert(Op.getOpcode() == ISD::CTLZ);
21749 MVT VT = Op.getSimpleValueType();
21750 MVT EltVT = VT.getVectorElementType();
21751 unsigned NumElems = VT.getVectorNumElements();
21753 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21754 "Unsupported element type");
21756 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
21757 if (NumElems > 16 ||
21758 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
21759 return LowerVectorIntUnary(Op, DAG);
21761 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21762 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21763 "Unsupported value type for operation");
21765 // Use the natively supported vector instruction vplzcntd.
21766 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21767 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21768 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21769 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
21771 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21774 // Lower CTLZ using a PSHUFB lookup table implementation.
21775 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21776 const X86Subtarget &Subtarget,
21777 SelectionDAG &DAG) {
21778 MVT VT = Op.getSimpleValueType();
21779 int NumElts = VT.getVectorNumElements();
21780 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21781 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21783 // Per-nibble leading zero PSHUFB lookup table.
21784 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21785 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21786 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21787 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
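// E.g. for the byte 0x1A: the hi nibble is 0x1 (LUT -> 3) and is non-zero,
// so the lo result is masked away and CTLZ = 3. For the byte 0x05: the hi
// nibble is zero (LUT -> 4), so the lo nibble's count (LUT[5] = 1) is added,
// giving CTLZ = 4 + 1 = 5.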
21789 SmallVector<SDValue, 64> LUTVec;
21790 for (int i = 0; i < NumBytes; ++i)
21791 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21792 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
21794 // Begin by bitcasting the input to a byte vector, then split those bytes
21795 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
21796 // If the hi input nibble is zero then we add both results together, otherwise
21797 // we just take the hi result (by masking the lo result to zero before the add).
21799 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21800 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21802 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21803 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21804 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21805 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21807 if (CurrVT.is512BitVector()) {
21808 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21809 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
21810 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21812 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21815 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21816 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21817 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21818 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21820 // Merge the result back from vXi8 to VT, working on the lo/hi halves
21821 // of the current vector width in the same way we did for the nibbles.
21822 // If the upper half of the input element is zero then add the halves'
21823 // leading zero counts together, otherwise just use the upper half's.
21824 // Double the width of the result until we are at target width.
21825 while (CurrVT != VT) {
21826 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21827 int CurrNumElts = CurrVT.getVectorNumElements();
21828 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21829 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21830 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21832 // Check if the upper half of the input element is zero.
21833 if (CurrVT.is512BitVector()) {
21834 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21835 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
21836 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21837 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
21839 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21840 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21842 HiZ = DAG.getBitcast(NextVT, HiZ);
21844 // Move the upper/lower halves to the lower bits as we'll be extending to
21845 // NextVT. Mask the lower result to zero if HiZ is true and add the results together.
21847 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21848 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21849 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21850 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
21851 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
21858 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21859 const X86Subtarget &Subtarget,
21860 SelectionDAG &DAG) {
21861 MVT VT = Op.getSimpleValueType();
21863 if (Subtarget.hasCDI() &&
21864 // vXi8 vectors need to be promoted to 512-bits for vXi32.
21865 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
21866 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
21868 // Decompose 256-bit ops into smaller 128-bit ops.
21869 if (VT.is256BitVector() && !Subtarget.hasInt256())
21870 return Lower256IntUnary(Op, DAG);
21872 // Decompose 512-bit ops into smaller 256-bit ops.
21873 if (VT.is512BitVector() && !Subtarget.hasBWI())
21874 return Lower512IntUnary(Op, DAG);
21876 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21877 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21880 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21881 SelectionDAG &DAG) {
21882 MVT VT = Op.getSimpleValueType();
21884 unsigned NumBits = VT.getSizeInBits();
21886 unsigned Opc = Op.getOpcode();
21889 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21891 Op = Op.getOperand(0);
21892 if (VT == MVT::i8) {
21893 // Zero extend to i32 since there is not an i8 bsr.
21895 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
21898 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21899 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21900 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21902 if (Opc == ISD::CTLZ) {
21903 // If src is zero (i.e. bsr sets ZF), returns NumBits.
21906 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
21907 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21910 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
21913 // Finally xor with NumBits-1.
21914 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21915 DAG.getConstant(NumBits - 1, dl, OpVT));
21918 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
21922 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21923 MVT VT = Op.getSimpleValueType();
21924 unsigned NumBits = VT.getScalarSizeInBits();
21927 if (VT.isVector()) {
21928 SDValue N0 = Op.getOperand(0);
21929 SDValue Zero = DAG.getConstant(0, dl, VT);
21931 // lsb(x) = (x & -x)
21932 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21933 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21935 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21936 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21937 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21938 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21939 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21942 // cttz(x) = ctpop(lsb - 1)
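// E.g. for x = 0b00001100 in an i8 lane: lsb = x & -x = 0b00000100,
// cttz_undef: 7 - ctlz(0b00000100) = 7 - 5 = 2, and
// cttz:       ctpop(0b00000100 - 1) = ctpop(0b00000011) = 2.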
21943 SDValue One = DAG.getConstant(1, dl, VT);
21944 return DAG.getNode(ISD::CTPOP, dl, VT,
21945 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21948 assert(Op.getOpcode() == ISD::CTTZ &&
21949 "Only scalar CTTZ requires custom lowering");
21951 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21952 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21953 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
21955 // If src is zero (i.e. bsf sets ZF), returns NumBits.
21958 DAG.getConstant(NumBits, dl, VT),
21959 DAG.getConstant(X86::COND_E, dl, MVT::i8),
21962 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21965 /// Break a 256-bit integer operation into two new 128-bit ones and then
21966 /// concatenate the result back.
21967 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21968 MVT VT = Op.getSimpleValueType();
21970 assert(VT.is256BitVector() && VT.isInteger() &&
21971 "Unsupported value type for operation");
21973 unsigned NumElems = VT.getVectorNumElements();
21976 // Extract the LHS vectors
21977 SDValue LHS = Op.getOperand(0);
21978 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21979 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21981 // Extract the RHS vectors
21982 SDValue RHS = Op.getOperand(1);
21983 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21984 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21986 MVT EltVT = VT.getVectorElementType();
21987 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21989 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21990 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21991 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21994 /// Break a 512-bit integer operation into two new 256-bit ones and then
21995 /// concatenate the result back.
21996 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21997 MVT VT = Op.getSimpleValueType();
21999 assert(VT.is512BitVector() && VT.isInteger() &&
22000 "Unsupported value type for operation");
22002 unsigned NumElems = VT.getVectorNumElements();
22005 // Extract the LHS vectors
22006 SDValue LHS = Op.getOperand(0);
22007 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22008 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22010 // Extract the RHS vectors
22011 SDValue RHS = Op.getOperand(1);
22012 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22013 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22015 MVT EltVT = VT.getVectorElementType();
22016 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22018 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22019 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22020 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22023 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22024 MVT VT = Op.getSimpleValueType();
22025 if (VT.getScalarType() == MVT::i1)
22026 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22027 Op.getOperand(0), Op.getOperand(1));
22028 assert(Op.getSimpleValueType().is256BitVector() &&
22029 Op.getSimpleValueType().isInteger() &&
22030 "Only handle AVX 256-bit vector integer operation");
22031 return Lower256IntArith(Op, DAG);
22034 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22035 MVT VT = Op.getSimpleValueType();
22036 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22037 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22038 // 8-bit integer abs to NEG and CMOV.
22040 SDValue N0 = Op.getOperand(0);
22041 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22042 DAG.getConstant(0, DL, VT), N0);
22043 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22044 SDValue(Neg.getNode(), 1)};
22045 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22048 assert(Op.getSimpleValueType().is256BitVector() &&
22049 Op.getSimpleValueType().isInteger() &&
22050 "Only handle AVX 256-bit vector integer operation");
22051 return Lower256IntUnary(Op, DAG);
22054 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22055 assert(Op.getSimpleValueType().is256BitVector() &&
22056 Op.getSimpleValueType().isInteger() &&
22057 "Only handle AVX 256-bit vector integer operation");
22058 return Lower256IntArith(Op, DAG);
22061 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22062 SelectionDAG &DAG) {
22064 MVT VT = Op.getSimpleValueType();
22066 if (VT.getScalarType() == MVT::i1)
22067 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22069 // Decompose 256-bit ops into smaller 128-bit ops.
22070 if (VT.is256BitVector() && !Subtarget.hasInt256())
22071 return Lower256IntArith(Op, DAG);
22073 SDValue A = Op.getOperand(0);
22074 SDValue B = Op.getOperand(1);
22076 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22077 // vector pairs, multiply and truncate.
22078 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22079 if (Subtarget.hasInt256()) {
22080 // For 512-bit vectors, split into 256-bit vectors to allow the
22081 // sign-extension to occur.
22082 if (VT == MVT::v64i8)
22083 return Lower512IntArith(Op, DAG);
22085 // For 256-bit vectors, split into 128-bit vectors to allow the
22086 // sign-extension to occur. We don't need this on AVX512BW as we can
22087 // safely sign-extend to v32i16.
22088 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22089 return Lower256IntArith(Op, DAG);
22091 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22092 return DAG.getNode(
22093 ISD::TRUNCATE, dl, VT,
22094 DAG.getNode(ISD::MUL, dl, ExVT,
22095 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22096 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22099 assert(VT == MVT::v16i8 &&
22100 "Pre-AVX2 support only supports v16i8 multiplication");
22101 MVT ExVT = MVT::v8i16;
22103 // Extract the lo parts and sign extend to i16
22105 if (Subtarget.hasSSE41()) {
22106 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
22107 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
22109 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22110 -1, 4, -1, 5, -1, 6, -1, 7};
22111 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22112 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22113 ALo = DAG.getBitcast(ExVT, ALo);
22114 BLo = DAG.getBitcast(ExVT, BLo);
22115 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22116 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
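// Interleaving each byte into the high half of an i16 lane and then
// arithmetic-shifting right by 8 sign-extends it: the i16 lane holds
// (a0 << 8) | junk, and ((a0 << 8) | junk) >>s 8 == sext(a0).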
22119 // Extract the hi parts and sign extend to i16
22121 if (Subtarget.hasSSE41()) {
22122 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22123 -1, -1, -1, -1, -1, -1, -1, -1};
22124 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22125 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22126 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
22127 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
22129 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22130 -1, 12, -1, 13, -1, 14, -1, 15};
22131 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22132 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22133 AHi = DAG.getBitcast(ExVT, AHi);
22134 BHi = DAG.getBitcast(ExVT, BHi);
22135 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22136 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22139 // Multiply, mask the lower 8 bits of the lo/hi results and pack
22140 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22141 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22142 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22143 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22144 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22147 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22148 if (VT == MVT::v4i32) {
22149 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22150 "Should not custom lower when pmulld is available!");
22152 // Extract the odd parts.
22153 static const int UnpackMask[] = { 1, -1, 3, -1 };
22154 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22155 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22157 // Multiply the even parts.
22158 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
22159 // Now multiply odd parts.
22160 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
22162 Evens = DAG.getBitcast(VT, Evens);
22163 Odds = DAG.getBitcast(VT, Odds);
22165 // Merge the two vectors back together with a shuffle. This expands into 2 shuffles.
22167 static const int ShufMask[] = { 0, 4, 2, 6 };
22168 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22171 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22172 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22174 // 32-bit vector types used for PMULDQ/PMULUDQ.
22175 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
22177 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
22178 // 32 bits. We can lower with this if the sign bits stretch that far.
22179 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22180 DAG.ComputeNumSignBits(B) > 32) {
22181 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
22182 DAG.getBitcast(MulVT, B));
22185 // Ahi = psrlqi(a, 32);
22186 // Bhi = psrlqi(b, 32);
22188 // AloBlo = pmuludq(a, b);
22189 // AloBhi = pmuludq(a, Bhi);
22190 // AhiBlo = pmuludq(Ahi, b);
22192 // Hi = psllqi(AloBhi + AhiBlo, 32);
22193 // return AloBlo + Hi;
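// This follows from writing A = Alo + 2^32*Ahi and B = Blo + 2^32*Bhi:
//   A*B = AloBlo + 2^32*(AloBhi + AhiBlo) + 2^64*AhiBhi
// and the 2^64 term vanishes modulo 2^64.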
22194 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22195 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
22196 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
22198 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22199 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
22200 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
22202 // If DQI is supported we can use PMULLQ, but PMULUDQ is still better if
22203 // the high bits are known to be zero.
22204 if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
22207 // Bit cast to 32-bit vectors for MULUDQ.
22208 SDValue Alo = DAG.getBitcast(MulVT, A);
22209 SDValue Blo = DAG.getBitcast(MulVT, B);
22211 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22213 // Only multiply lo/hi halves that aren't known to be zero.
22214 SDValue AloBlo = Zero;
22215 if (!ALoIsZero && !BLoIsZero)
22216 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
22218 SDValue AloBhi = Zero;
22219 if (!ALoIsZero && !BHiIsZero) {
22220 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22221 Bhi = DAG.getBitcast(MulVT, Bhi);
22222 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
22225 SDValue AhiBlo = Zero;
22226 if (!AHiIsZero && !BLoIsZero) {
22227 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22228 Ahi = DAG.getBitcast(MulVT, Ahi);
22229 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
22232 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22233 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22235 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22238 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22239 SelectionDAG &DAG) {
22241 MVT VT = Op.getSimpleValueType();
22243 // Decompose 256-bit ops into smaller 128-bit ops.
22244 if (VT.is256BitVector() && !Subtarget.hasInt256())
22245 return Lower256IntArith(Op, DAG);
22247 // Only i8 vectors should need custom lowering after this.
22248 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22249 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22250 "Unsupported vector type");
22252 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22253 // logical shift down the upper half and pack back to i8.
22254 SDValue A = Op.getOperand(0);
22255 SDValue B = Op.getOperand(1);
22257 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22258 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22259 unsigned Opcode = Op.getOpcode();
22260 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22261 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22263 // For 512-bit vectors, split into 256-bit vectors to allow the
22264 // sign-extension to occur.
22265 if (VT == MVT::v64i8)
22266 return Lower512IntArith(Op, DAG);
22268 // AVX2 implementations - extend xmm subvectors to ymm.
22269 if (Subtarget.hasInt256()) {
22270 unsigned NumElems = VT.getVectorNumElements();
22271 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22272 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22274 if (VT == MVT::v32i8) {
22275 if (Subtarget.canExtendTo512BW()) {
22276 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22277 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22278 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22279 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22280 DAG.getConstant(8, dl, MVT::v32i16));
22281 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22283 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22284 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22285 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22286 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22287 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22288 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22289 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22290 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22291 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22292 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22293 DAG.getConstant(8, dl, MVT::v16i16));
22294 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22295 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22296 DAG.getConstant(8, dl, MVT::v16i16));
22297 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22298 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22299 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22300 16, 17, 18, 19, 20, 21, 22, 23};
22301 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22302 24, 25, 26, 27, 28, 29, 30, 31};
22303 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22304 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22305 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22308 assert(VT == MVT::v16i8 && "Unexpected VT");
22310 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22311 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22312 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22313 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22314 DAG.getConstant(8, dl, MVT::v16i16));
22315 // If we have BWI we can use truncate instruction.
22316 if (Subtarget.hasBWI())
22317 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22318 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22319 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22320 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22323 assert(VT == MVT::v16i8 &&
22324 "Pre-AVX2 support only supports v16i8 multiplication");
22325 MVT ExVT = MVT::v8i16;
22326 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
22328 // Extract the lo parts and zero/sign extend to i16.
22330 if (Subtarget.hasSSE41()) {
22331 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
22332 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
22334 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22335 -1, 4, -1, 5, -1, 6, -1, 7};
22336 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22337 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22338 ALo = DAG.getBitcast(ExVT, ALo);
22339 BLo = DAG.getBitcast(ExVT, BLo);
22340 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22341 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22344 // Extract the hi parts and zero/sign extend to i16.
22346 if (Subtarget.hasSSE41()) {
22347 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22348 -1, -1, -1, -1, -1, -1, -1, -1};
22349 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22350 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22351 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
22352 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
22354 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22355 -1, 12, -1, 13, -1, 14, -1, 15};
22356 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22357 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22358 AHi = DAG.getBitcast(ExVT, AHi);
22359 BHi = DAG.getBitcast(ExVT, BHi);
22360 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22361 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22364 // Multiply, lshr the upper 8 bits down to the lower 8 bits of the lo/hi
22365 // results and pack back to v16i8.
22366 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22367 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22368 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22369 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22370 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22373 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22374 assert(Subtarget.isTargetWin64() && "Unexpected target");
22375 EVT VT = Op.getValueType();
22376 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22377 "Unexpected return type for lowering");
22381 switch (Op->getOpcode()) {
22382 default: llvm_unreachable("Unexpected request for libcall!");
22383 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22384 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22385 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22386 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22387 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22388 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22392 SDValue InChain = DAG.getEntryNode();
22394 TargetLowering::ArgListTy Args;
22395 TargetLowering::ArgListEntry Entry;
22396 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22397 EVT ArgVT = Op->getOperand(i).getValueType();
22398 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22399 "Unexpected argument type for lowering");
22400 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22401 Entry.Node = StackPtr;
22402 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22403 MachinePointerInfo(), /* Alignment = */ 16);
22404 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22405 Entry.Ty = PointerType::get(ArgTy,0);
22406 Entry.IsSExt = false;
22407 Entry.IsZExt = false;
22408 Args.push_back(Entry);
22411 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22412 getPointerTy(DAG.getDataLayout()));
22414 TargetLowering::CallLoweringInfo CLI(DAG);
22415 CLI.setDebugLoc(dl)
22418 getLibcallCallingConv(LC),
22419 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22422 .setSExtResult(isSigned)
22423 .setZExtResult(!isSigned);
22425 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22426 return DAG.getBitcast(VT, CallInfo.first);
22429 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22430 SelectionDAG &DAG) {
22431 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22432 MVT VT = Op0.getSimpleValueType();
22435 // Decompose 256-bit ops into smaller 128-bit ops.
22436 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22437 unsigned Opcode = Op.getOpcode();
22438 unsigned NumElems = VT.getVectorNumElements();
22439 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22440 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22441 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22442 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22443 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22444 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22445 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22447 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22448 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22450 return DAG.getMergeValues(Ops, dl);
22453 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22454 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22455 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22457 int NumElts = VT.getVectorNumElements();
22459 // PMULxD operations multiply each even value (starting at 0) of LHS with
22460 // the related value of RHS and produce a widened result.
22461 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22462 // => <2 x i64> <ae|cg>
22464 // In other words, to have all the results, we need to perform two PMULxD:
22465 // 1. one with the even values.
22466 // 2. one with the odd values.
22467 // To achieve #2, we need to place the odd values at an even position.
22469 // Place the odd value at an even position (basically, shift all values 1
22470 // step to the left):
22471 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22472 // <a|b|c|d> => <b|undef|d|undef>
22473 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22474 makeArrayRef(&Mask[0], NumElts));
22475 // <e|f|g|h> => <f|undef|h|undef>
22476 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22477 makeArrayRef(&Mask[0], NumElts));
22479 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22481 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22482 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22484 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22485 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22486 // => <2 x i64> <ae|cg>
22487 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
22488 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22489 // => <2 x i64> <bf|dh>
22490 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
22492 // Shuffle it back into the right order.
22493 SmallVector<int, 16> HighMask(NumElts);
22494 SmallVector<int, 16> LowMask(NumElts);
22495 for (int i = 0; i != NumElts; ++i) {
22496 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22497 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22500 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22501 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
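// For NumElts == 4 the loop above produces LowMask = <0,4,2,6> and
// HighMask = <1,5,3,7>: the low halves of each 64-bit product sit in the even
// i32 lanes of Mul1/Mul2 and the high halves in the odd lanes.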
22503 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
22504 // unsigned multiply.
22505 if (IsSigned && !Subtarget.hasSSE41()) {
22506 SDValue ShAmt = DAG.getConstant(
22508 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22509 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22510 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22511 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22512 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22514 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22515 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
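// This uses the identity mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
//                                                  - (b < 0 ? a : 0),
// where each correction term is formed with an arithmetic shift right by 31
// (all-ones or zero per lane) ANDed with the other operand.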
22518 // The first result of MUL_LOHI is actually the low value, followed by the high value.
22520 SDValue Ops[] = {Lows, Highs};
22521 return DAG.getMergeValues(Ops, dl);
22524 // Return true if the required (according to Opcode) shift-imm form is natively
22525 // supported by the Subtarget
22526 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22528 if (VT.getScalarSizeInBits() < 16)
22531 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22532 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22535 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22536 (VT.is256BitVector() && Subtarget.hasInt256());
22538 bool AShift = LShift && (Subtarget.hasAVX512() ||
22539 (VT != MVT::v2i64 && VT != MVT::v4i64));
22540 return (Opcode == ISD::SRA) ? AShift : LShift;
22543 // The shift amount is a variable, but it is the same for all vector lanes.
22544 // These instructions are defined together with shift-immediate.
22546 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22548 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22551 // Return true if the required (according to Opcode) variable-shift form is
22552 // natively supported by the Subtarget
22553 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22556 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22559 // vXi16 supported only on AVX-512, BWI
22560 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22563 if (Subtarget.hasAVX512())
22566 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22567 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22568 return (Opcode == ISD::SRA) ? AShift : LShift;
22571 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22572 const X86Subtarget &Subtarget) {
22573 MVT VT = Op.getSimpleValueType();
22575 SDValue R = Op.getOperand(0);
22576 SDValue Amt = Op.getOperand(1);
22578 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22579 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22581 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22582 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22583 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22584 SDValue Ex = DAG.getBitcast(ExVT, R);
22586 // ashr(R, 63) === cmp_slt(R, 0)
22587 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22588 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22589 "Unsupported PCMPGT op");
22590 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22591 getZeroVector(VT, Subtarget, DAG, dl), R);
22594 if (ShiftAmt >= 32) {
22595 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22597 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22598 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22599 ShiftAmt - 32, DAG);
22600 if (VT == MVT::v2i64)
22601 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22602 if (VT == MVT::v4i64)
22603 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22604 {9, 1, 11, 3, 13, 5, 15, 7});
22606 // SRA upper i32, SHL whole i64 and select lower i32.
22607 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22610 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22611 Lower = DAG.getBitcast(ExVT, Lower);
22612 if (VT == MVT::v2i64)
22613 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22614 if (VT == MVT::v4i64)
22615 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22616 {8, 1, 10, 3, 12, 5, 14, 7});
22618 return DAG.getBitcast(VT, Ex);
22621 // Optimize shl/srl/sra with constant shift amount.
22622 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22623 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22624 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22626 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22627 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22629 // i64 SRA needs to be performed as partial shifts.
22630 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22631 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22632 Op.getOpcode() == ISD::SRA)
22633 return ArithmeticShiftRight64(ShiftAmt);
22635 if (VT == MVT::v16i8 ||
22636 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22637 VT == MVT::v64i8) {
22638 unsigned NumElts = VT.getVectorNumElements();
22639 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22641 // Simple i8 add case
22642 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22643 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22645 // ashr(R, 7) === cmp_slt(R, 0)
22646 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22647 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22648 if (VT.is512BitVector()) {
22649 assert(VT == MVT::v64i8 && "Unexpected element type!");
22650 SDValue CMP = DAG.getNode(X86ISD::CMPM, dl, MVT::v64i1, Zeros, R,
22651 DAG.getConstant(6, dl, MVT::i8));
22652 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22654 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22657 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22658 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22661 if (Op.getOpcode() == ISD::SHL) {
22662 // Make a large shift.
22663 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22665 SHL = DAG.getBitcast(VT, SHL);
22666 // Zero out the rightmost bits.
22667 return DAG.getNode(ISD::AND, dl, VT, SHL,
22668 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22670 if (Op.getOpcode() == ISD::SRL) {
22671 // Make a large shift.
22672 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22674 SRL = DAG.getBitcast(VT, SRL);
22675 // Zero out the leftmost bits.
22676 return DAG.getNode(ISD::AND, dl, VT, SRL,
22677 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22679 if (Op.getOpcode() == ISD::SRA) {
22680 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22681 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22683 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22684 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22685 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22688 llvm_unreachable("Unknown shift opcode.");
22693 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22694 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22695 if (!Subtarget.hasXOP() &&
22696 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22697 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22699 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22700 unsigned SubVectorScale = 1;
22701 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22703 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22704 Amt = Amt.getOperand(0);
22707 // Peek through any splat that was introduced for i64 shift vectorization.
22708 int SplatIndex = -1;
22709 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22710 if (SVN->isSplat()) {
22711 SplatIndex = SVN->getSplatIndex();
22712 Amt = Amt.getOperand(0);
22713 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22714 "Splat shuffle referencing second operand");
22717 if (Amt.getOpcode() != ISD::BITCAST ||
22718 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22721 Amt = Amt.getOperand(0);
22722 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22723 (SubVectorScale * VT.getVectorNumElements());
22724 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22725 uint64_t ShiftAmt = 0;
22726 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22727 for (unsigned i = 0; i != Ratio; ++i) {
22728 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22732 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22735 // Check remaining shift amounts (if not a splat).
22736 if (SplatIndex < 0) {
22737 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22738 uint64_t ShAmt = 0;
22739 for (unsigned j = 0; j != Ratio; ++j) {
22740 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22744 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22746 if (ShAmt != ShiftAmt)
22751 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22752 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22754 if (Op.getOpcode() == ISD::SRA)
22755 return ArithmeticShiftRight64(ShiftAmt);
22761 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22762 const X86Subtarget &Subtarget) {
22763 MVT VT = Op.getSimpleValueType();
22765 SDValue R = Op.getOperand(0);
22766 SDValue Amt = Op.getOperand(1);
22768 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22769 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22771 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22772 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22774 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22776 MVT EltVT = VT.getVectorElementType();
22778 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22779 // Check if this build_vector node is doing a splat.
22780 // If so, then set BaseShAmt equal to the splat value.
22781 BaseShAmt = BV->getSplatValue();
22782 if (BaseShAmt && BaseShAmt.isUndef())
22783 BaseShAmt = SDValue();
22785 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22786 Amt = Amt.getOperand(0);
22788 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22789 if (SVN && SVN->isSplat()) {
22790 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22791 SDValue InVec = Amt.getOperand(0);
22792 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22793 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22794 "Unexpected shuffle index found!");
22795 BaseShAmt = InVec.getOperand(SplatIdx);
22796 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22797 if (ConstantSDNode *C =
22798 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22799 if (C->getZExtValue() == SplatIdx)
22800 BaseShAmt = InVec.getOperand(1);
22805 // Avoid introducing an extract element from a shuffle.
22806 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22807 DAG.getIntPtrConstant(SplatIdx, dl));
22811 if (BaseShAmt.getNode()) {
22812 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22813 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22814 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22815 else if (EltVT.bitsLT(MVT::i32))
22816 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22818 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22822 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22823 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
22824 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22825 Amt = Amt.getOperand(0);
22826 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22827 VT.getVectorNumElements();
22828 std::vector<SDValue> Vals(Ratio);
22829 for (unsigned i = 0; i != Ratio; ++i)
22830 Vals[i] = Amt.getOperand(i);
22831 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22832 for (unsigned j = 0; j != Ratio; ++j)
22833 if (Vals[j] != Amt.getOperand(i + j))
22837 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22838 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22843 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22844 SelectionDAG &DAG) {
22845 MVT VT = Op.getSimpleValueType();
22847 SDValue R = Op.getOperand(0);
22848 SDValue Amt = Op.getOperand(1);
22849 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22851 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22852 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22854 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22857 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22860 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22863 // XOP has 128-bit variable logical/arithmetic shifts.
22864 // +ve/-ve Amt = shift left/right.
22865 if (Subtarget.hasXOP() &&
22866 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22867 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22868 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22869 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22870 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22872 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22873 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22874 if (Op.getOpcode() == ISD::SRA)
22875 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22878 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
22879 // shifts per-lane and then shuffle the partial results back together.
22880 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22881 // Splat the shift amounts so the scalar shifts above will catch it.
22882 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22883 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22884 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22885 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22886 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22889 // i64 vector arithmetic shift can be emulated with the transform:
22890 // M = lshr(SIGN_MASK, Amt)
22891 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22892 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22893 Op.getOpcode() == ISD::SRA) {
22894 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22895 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22896 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22897 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22898 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
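// E.g. in an 8-bit analogue with R = 0xF8 (-8) and Amt = 2:
//   lshr gives 0x3E, M = 0x80 >> 2 = 0x20, 0x3E ^ 0x20 = 0x1E, and
//   0x1E - 0x20 = 0xFE (-2), the arithmetic-shift result.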
22902 // If possible, lower this packed shift into a vector multiply instead of
22903 // expanding it into a sequence of scalar shifts.
22904 // Do this only if the vector shift count is a constant build_vector.
22905 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22906 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22907 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22908 SmallVector<SDValue, 8> Elts;
22909 MVT SVT = VT.getVectorElementType();
22910 unsigned SVTBits = SVT.getSizeInBits();
22911 APInt One(SVTBits, 1);
22912 unsigned NumElems = VT.getVectorNumElements();
22914 for (unsigned i=0; i !=NumElems; ++i) {
22915 SDValue Op = Amt->getOperand(i);
22916 if (Op->isUndef()) {
22917 Elts.push_back(Op);
22921 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22922 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22923 uint64_t ShAmt = C.getZExtValue();
22924 if (ShAmt >= SVTBits) {
22925 Elts.push_back(DAG.getUNDEF(SVT));
22928 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22930 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22931 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22934 // Lower SHL with variable shift amount.
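// The v4i32 trick below: (Amt << 23) + 0x3f800000 builds an IEEE-754 float
// whose exponent field is 127 + Amt, i.e. the value 2^Amt (1.0 is encoded as
// 0x3f800000). Converting back to integer and multiplying by R therefore
// computes R << Amt per lane.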
22935 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22936 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22938 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22939 DAG.getConstant(0x3f800000U, dl, VT));
22940 Op = DAG.getBitcast(MVT::v4f32, Op);
22941 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22942 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22945 // If possible, lower this shift as a sequence of two shifts by
22946 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22948 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22950 // Could be rewritten as:
22951 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
  // The advantage is that the two shifts from the example would be
  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
22957 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22958 bool UseMOVSD = false;
22959 bool CanBeSimplified;
22960 // The splat value for the first packed shift (the 'X' from the example).
22961 SDValue Amt1 = Amt->getOperand(0);
22962 // The splat value for the second packed shift (the 'Y' from the example).
22963 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22965 // See if it is possible to replace this node with a sequence of
22966 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22967 if (VT == MVT::v4i32) {
22968 // Check if it is legal to use a MOVSS.
22969 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22970 Amt2 == Amt->getOperand(3);
22971 if (!CanBeSimplified) {
22972 // Otherwise, check if we can still simplify this node using a MOVSD.
22973 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22974 Amt->getOperand(2) == Amt->getOperand(3);
22976 Amt2 = Amt->getOperand(2);
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
22981 CanBeSimplified = Amt1 == Amt->getOperand(1);
22982 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22983 CanBeSimplified = Amt2 == Amt->getOperand(i);
      if (!CanBeSimplified) {
        UseMOVSD = true;
        CanBeSimplified = true;
22988 Amt2 = Amt->getOperand(4);
22989 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22990 CanBeSimplified = Amt1 == Amt->getOperand(i);
22991 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22992 CanBeSimplified = Amt2 == Amt->getOperand(j);
22996 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22997 isa<ConstantSDNode>(Amt2)) {
22998 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
23000 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23001 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23003 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23004 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23005 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23006 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
      if (UseMOVSD)
        return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
                                                       BitCast2, {0, 1, 6, 7}));
23010 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23011 BitCast2, {0, 5, 6, 7}));
  // v4i32 non-uniform shifts.
  // If the shift amount is constant, we can shift each lane using the SSE2
23017 // immediate shifts, else we need to zero-extend each lane to the lower i64
23018 // and shift using the SSE2 variable shifts.
23019 // The separate results can then be blended together.
23020 if (VT == MVT::v4i32) {
23021 unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
      Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23025 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23026 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23027 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default: llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL: Opc = X86ISD::VSHL; break;
      case ISD::SRL: Opc = X86ISD::VSRL; break;
      case ISD::SRA: Opc = X86ISD::VSRA; break;
      }
      // The SSE2 shifts use the lower i64 as the same shift amount for
      // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
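      // e.g. the {0, 4, -1, -1} mask puts Amt[0] in the low i32 and a zero
      // from Z in the next i32, so the low i64 of the result is zext(Amt[0]).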
23046 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23047 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23048 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23049 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }
23053 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
23054 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
23055 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
23056 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
23057 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23058 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23059 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23062 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23063 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23064 // make the existing SSE solution better.
  // NOTE: We honor preferred vector width before promoting to 512-bits.
23066 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23067 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23068 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23069 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23070 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23071 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23072 "Unexpected vector type");
23073 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23074 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23076 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23077 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23078 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23079 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23080 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23083 if (VT == MVT::v16i8 ||
23084 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23085 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23086 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23087 unsigned ShiftOpcode = Op->getOpcode();
23089 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23090 if (VT.is512BitVector()) {
23091 // On AVX512BW targets we make use of the fact that VSELECT lowers
23092 // to a masked blend which selects bytes based just on the sign bit
23093 // extracted to a mask.
23094 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23095 V0 = DAG.getBitcast(VT, V0);
23096 V1 = DAG.getBitcast(VT, V1);
23097 Sel = DAG.getBitcast(VT, Sel);
23098 Sel = DAG.getNode(X86ISD::CMPM, dl, MaskVT,
23099 DAG.getConstant(0, dl, VT), Sel,
23100 DAG.getConstant(6, dl, MVT::i8));
23101 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23102 } else if (Subtarget.hasSSE41()) {
23103 // On SSE41 targets we make use of the fact that VSELECT lowers
23104 // to PBLENDVB which selects bytes based just on the sign bit.
23105 V0 = DAG.getBitcast(VT, V0);
23106 V1 = DAG.getBitcast(VT, V1);
23107 Sel = DAG.getBitcast(VT, Sel);
23108 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23110 // On pre-SSE41 targets we test for the sign bit by comparing to
23111 // zero - a negative value will set all bits of the lanes to true
23112 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23113 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23114 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23115 return DAG.getSelect(dl, SelVT, C, V0, V1);
23118 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23119 // We can safely do this using i16 shifts as we're only interested in
23120 // the 3 lower bits of each byte.
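    // After the shift, bit 7 of each byte holds bit 2 of the original amount;
    // each SignBitSelect step below tests that sign bit, and the Amt += Amt
    // steps move bit 1 and then bit 0 up into the sign-bit position.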
23121 Amt = DAG.getBitcast(ExtVT, Amt);
23122 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23123 Amt = DAG.getBitcast(VT, Amt);
23125 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23126 // r = VSELECT(r, shift(r, 4), a);
23128 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23129 R = SignBitSelect(VT, Amt, M, R);
23132 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23134 // r = VSELECT(r, shift(r, 2), a);
23135 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23136 R = SignBitSelect(VT, Amt, M, R);
23139 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23141 // return VSELECT(r, shift(r, 1), a);
23142 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23143 R = SignBitSelect(VT, Amt, M, R);
23147 if (Op->getOpcode() == ISD::SRA) {
    // For SRA we need to unpack each byte to the higher byte of an i16 vector
    // so we can correctly sign extend. We don't care what happens to the
    // lower byte.
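    // Placing each source byte in the high byte of an i16 lane means the i16
    // arithmetic shift replicates that byte's sign bit, so the 8-bit SRA
    // result is correct; the stale low byte is discarded when packing.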
23151 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23152 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23153 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23154 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23155 ALo = DAG.getBitcast(ExtVT, ALo);
23156 AHi = DAG.getBitcast(ExtVT, AHi);
23157 RLo = DAG.getBitcast(ExtVT, RLo);
23158 RHi = DAG.getBitcast(ExtVT, RHi);
23160 // r = VSELECT(r, shift(r, 4), a);
23161 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23162 DAG.getConstant(4, dl, ExtVT));
23163 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23164 DAG.getConstant(4, dl, ExtVT));
23165 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23166 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23169 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23170 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23172 // r = VSELECT(r, shift(r, 2), a);
23173 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23174 DAG.getConstant(2, dl, ExtVT));
23175 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23176 DAG.getConstant(2, dl, ExtVT));
23177 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23178 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23181 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23182 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23184 // r = VSELECT(r, shift(r, 1), a);
23185 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23186 DAG.getConstant(1, dl, ExtVT));
23187 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23188 DAG.getConstant(1, dl, ExtVT));
23189 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23190 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
    // Logical shift the result back to the lower byte, leaving a zero upper
    // byte, meaning that we can safely pack with PACKUSWB.
23196 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23198 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23199 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23203 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
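    // Widen each i16 lane to i32 by interleaving: the data goes into the high
    // half of each i32 lane (so an i32 SRA still sees the original sign bit)
    // and the amount into the low half. After shifting at i32 granularity the
    // result is moved back down with a logical shift right by 16 and the two
    // halves are re-packed with PACKUS.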
23204 MVT ExtVT = MVT::v8i32;
23205 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23206 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23207 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23208 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23209 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23210 ALo = DAG.getBitcast(ExtVT, ALo);
23211 AHi = DAG.getBitcast(ExtVT, AHi);
23212 RLo = DAG.getBitcast(ExtVT, RLo);
23213 RHi = DAG.getBitcast(ExtVT, RHi);
23214 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23215 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23216 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23217 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23218 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23221 if (VT == MVT::v8i16) {
23222 unsigned ShiftOpcode = Op->getOpcode();
    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23226 bool UseSSE41 = Subtarget.hasSSE41() &&
23227 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23229 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23230 // On SSE41 targets we make use of the fact that VSELECT lowers
23231 // to PBLENDVB which selects bytes based just on the sign bit.
23233 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23234 V0 = DAG.getBitcast(ExtVT, V0);
23235 V1 = DAG.getBitcast(ExtVT, V1);
23236 Sel = DAG.getBitcast(ExtVT, Sel);
23237 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23239 // On pre-SSE41 targets we splat the sign bit - a negative value will
23240 // set all bits of the lanes to true and VSELECT uses that in
23241 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23243 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23244 return DAG.getSelect(dl, VT, C, V0, V1);
23247 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
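    // Amounts are at most 15 (4 bits), so after the shift bit 15 of each i16
    // holds bit 3 of the original amount; successive Amt += Amt steps below
    // expose bits 2, 1 and 0 in turn.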
23249 // On SSE41 targets we need to replicate the shift mask in both
23250 // bytes for PBLENDVB.
23253 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23254 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23256 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23259 // r = VSELECT(r, shift(r, 8), a);
23260 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23261 R = SignBitSelect(Amt, M, R);
23264 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23266 // r = VSELECT(r, shift(r, 4), a);
23267 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23268 R = SignBitSelect(Amt, M, R);
23271 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23273 // r = VSELECT(r, shift(r, 2), a);
23274 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23275 R = SignBitSelect(Amt, M, R);
23278 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23280 // return VSELECT(r, shift(r, 1), a);
23281 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23282 R = SignBitSelect(Amt, M, R);
23286 // Decompose 256-bit shifts into smaller 128-bit shifts.
23287 if (VT.is256BitVector())
23288 return Lower256IntArith(Op, DAG);
23293 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23294 SelectionDAG &DAG) {
23295 MVT VT = Op.getSimpleValueType();
23297 SDValue R = Op.getOperand(0);
23298 SDValue Amt = Op.getOperand(1);
23299 unsigned Opcode = Op.getOpcode();
23300 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23302 if (Subtarget.hasAVX512()) {
23303 // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
23306 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23307 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23308 return EltBits[0] == V;
23310 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23311 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23312 return DAG.getNode(Op, DL, VT, R,
23313 DAG.getConstant(RotateAmt, DL, MVT::i8));
    // Else, fall-back on VPROLV/VPRORV.
    return Op;
23321 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23322 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
23323 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23325 // XOP has 128-bit vector variable + immediate rotates.
23326 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23328 // Split 256-bit integers.
23329 if (VT.is256BitVector())
23330 return Lower256IntArith(Op, DAG);
23332 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23334 // Attempt to rotate by immediate.
23335 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23336 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23337 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23338 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23339 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23340 DAG.getConstant(RotateAmt, DL, MVT::i8));
23344 // Use general rotate by variable (per-element).
23348 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23349 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23350 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23351 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23352 // has only one use.
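  // For example, (i32, i1) = uaddo(a, b) is lowered to an X86ISD::ADD that
  // also produces EFLAGS, followed by a setcc of X86::COND_B on those flags.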
23353 SDNode *N = Op.getNode();
23354 SDValue LHS = N->getOperand(0);
23355 SDValue RHS = N->getOperand(1);
23356 unsigned BaseOp = 0;
23357 X86::CondCode Cond;
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
23395 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23396 if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
23403 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23405 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
23407 if (N->getValueType(1) == MVT::i1)
23408 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23410 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23414 // Also sets EFLAGS.
23415 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23416 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23418 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23420 if (N->getValueType(1) == MVT::i1)
23421 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23423 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23426 /// Returns true if the operand type is exactly twice the native width, and
23427 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23428 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23429 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
23430 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
23431 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
23435 else if (OpWidth == 128)
23436 return Subtarget.hasCmpxchg16b();
23441 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
23442 return needsCmpXchgNb(SI->getValueOperand()->getType());
23445 // Note: this turns large loads into lock cmpxchg8b/16b.
23446 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
23447 TargetLowering::AtomicExpansionKind
23448 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
23449 auto PTy = cast<PointerType>(LI->getPointerOperandType());
23450 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
23451 : AtomicExpansionKind::None;
23454 TargetLowering::AtomicExpansionKind
23455 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
23456 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23457 Type *MemType = AI->getType();
23459 // If the operand is too big, we must see if cmpxchg8/16b is available
23460 // and default to library calls otherwise.
23461 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
23462 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
23463 : AtomicExpansionKind::None;
23466 AtomicRMWInst::BinOp Op = AI->getOperation();
23469 llvm_unreachable("Unknown atomic operation");
23470 case AtomicRMWInst::Xchg:
23471 case AtomicRMWInst::Add:
23472 case AtomicRMWInst::Sub:
23473 // It's better to use xadd, xsub or xchg for these in all cases.
23474 return AtomicExpansionKind::None;
23475 case AtomicRMWInst::Or:
23476 case AtomicRMWInst::And:
23477 case AtomicRMWInst::Xor:
23478 // If the atomicrmw's result isn't actually used, we can just add a "lock"
23479 // prefix to a normal instruction for these operations.
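  // For example, an 'atomicrmw or' whose result is unused can be emitted as a
  // single 'lock or' on the memory operand instead of a cmpxchg loop.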
23480 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
23481 : AtomicExpansionKind::None;
23482 case AtomicRMWInst::Nand:
23483 case AtomicRMWInst::Max:
23484 case AtomicRMWInst::Min:
23485 case AtomicRMWInst::UMax:
23486 case AtomicRMWInst::UMin:
23487 // These always require a non-trivial set of data operations on x86. We must
23488 // use a cmpxchg loop.
23489 return AtomicExpansionKind::CmpXChg;
23494 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
23495 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23496 Type *MemType = AI->getType();
23497 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
23498 // there is no benefit in turning such RMWs into loads, and it is actually
23499 // harmful as it introduces a mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;
23503 auto Builder = IRBuilder<>(AI);
23504 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
23505 auto SSID = AI->getSyncScopeID();
23506 // We must restrict the ordering to avoid generating loads with Release or
23507 // ReleaseAcquire orderings.
23508 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
23509 auto Ptr = AI->getPointerOperand();
  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
23520 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
23521 // lowered to just a load without a fence. A mfence flushes the store buffer,
23522 // making the optimization clearly correct.
23523 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
23524 // otherwise, we might be able to be more aggressive on relaxed idempotent
23525 // rmw. In practice, they do not look useful, so we don't try to be
23526 // especially clever.
23527 if (SSID == SyncScope::SingleThread)
23528 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;
23532 if (!Subtarget.hasMFence())
23533 // FIXME: it might make sense to use a locked operation here but on a
23534 // different cache-line to prevent cache-line bouncing. In practice it
23535 // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;
  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
23541 Builder.CreateCall(MFence, {});
23543 // Finally we can emit the atomic load.
23544 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
23545 AI->getType()->getPrimitiveSizeInBits());
23546 Loaded->setAtomic(Order, SSID);
23547 AI->replaceAllUsesWith(Loaded);
23548 AI->eraseFromParent();
23552 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
23553 SelectionDAG &DAG) {
23555 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
23556 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
23557 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
23558 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
23560 // The only fence that needs an instruction is a sequentially-consistent
23561 // cross-thread fence.
23562 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
23563 FenceSSID == SyncScope::System) {
23564 if (Subtarget.hasMFence())
23565 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
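  // Without MFENCE, fall back to a LOCK-prefixed OR of 0 into the top of the
  // stack, which acts as a full memory barrier.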
23567 SDValue Chain = Op.getOperand(0);
  SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
  SDValue Ops[] = {
    DAG.getRegister(X86::ESP, MVT::i32),     // Base
    DAG.getTargetConstant(1, dl, MVT::i8),   // Scale
    DAG.getRegister(0, MVT::i32),            // Index
    DAG.getTargetConstant(0, dl, MVT::i32),  // Disp
    DAG.getRegister(0, MVT::i32),            // Segment.
    Zero,
    Chain
  };
23578 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23579 return SDValue(Res, 0);
23582 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23583 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23586 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23587 SelectionDAG &DAG) {
23588 MVT T = Op.getSimpleValueType();
23592 switch(T.SimpleTy) {
23593 default: llvm_unreachable("Invalid value type!");
23594 case MVT::i8: Reg = X86::AL; size = 1; break;
23595 case MVT::i16: Reg = X86::AX; size = 2; break;
23596 case MVT::i32: Reg = X86::EAX; size = 4; break;
23598 assert(Subtarget.is64Bit() && "Node not type legal!");
23599 Reg = X86::RAX; size = 8;
23602 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23603 Op.getOperand(2), SDValue());
23604 SDValue Ops[] = { cpIn.getValue(0),
23607 DAG.getTargetConstant(size, DL, MVT::i8),
23608 cpIn.getValue(1) };
23609 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23610 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23611 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23615 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23616 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23617 MVT::i32, cpOut.getValue(2));
23618 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23620 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23621 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23622 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23626 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23627 SelectionDAG &DAG) {
23628 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23629 MVT DstVT = Op.getSimpleValueType();
23631 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
23632 // half to v32i1 and concatenating the result.
23633 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
23634 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
23635 assert(Subtarget.hasBWI() && "Expected BWI target");
23636 SDValue Op0 = Op->getOperand(0);
23638 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23639 DAG.getIntPtrConstant(0, dl));
23640 Lo = DAG.getBitcast(MVT::v32i1, Lo);
23641 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23642 DAG.getIntPtrConstant(1, dl));
23643 Hi = DAG.getBitcast(MVT::v32i1, Hi);
23644 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
23647 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23648 SrcVT == MVT::i64) {
23649 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();
23654 SDValue Op0 = Op->getOperand(0);
23655 SmallVector<SDValue, 16> Elts;
23659 if (SrcVT.isVector()) {
23660 NumElts = SrcVT.getVectorNumElements();
23661 SVT = SrcVT.getVectorElementType();
      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
23665 for (unsigned i = 0, e = NumElts; i != e; ++i)
23666 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23667 DAG.getIntPtrConstant(i, dl)));
23669 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23670 "Unexpected source type in LowerBITCAST");
23671 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23672 DAG.getIntPtrConstant(0, dl)));
23673 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23674 DAG.getIntPtrConstant(1, dl)));
23678 // Explicitly mark the extra elements as Undef.
23679 Elts.append(NumElts, DAG.getUNDEF(SVT));
23681 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23682 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23683 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23684 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23685 DAG.getIntPtrConstant(0, dl));
23688 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23689 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23690 assert((DstVT == MVT::i64 ||
23691 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23692 "Unexpected custom BITCAST");
23693 // i64 <=> MMX conversions are Legal.
23694 if (SrcVT==MVT::i64 && DstVT.isVector())
23696 if (DstVT==MVT::i64 && SrcVT.isVector())
23698 // MMX <=> MMX conversions are Legal.
23699 if (SrcVT.isVector() && DstVT.isVector())
23701 // All other conversions need to be expanded.
23705 /// Compute the horizontal sum of bytes in V for the elements of VT.
23707 /// Requires V to be a byte vector and VT to be an integer vector type with
23708 /// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
23711 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23712 const X86Subtarget &Subtarget,
23713 SelectionDAG &DAG) {
23715 MVT ByteVecVT = V.getSimpleValueType();
23716 MVT EltVT = VT.getVectorElementType();
23717 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23718 "Expected value to have byte element type.");
23719 assert(EltVT != MVT::i8 &&
23720 "Horizontal byte sum only makes sense for wider elements!");
23721 unsigned VecSize = VT.getSizeInBits();
23722 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
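  // e.g. PSADBW(<16 x i8> V, zero) produces a v2i64 where each i64 holds the
  // sum of the corresponding eight bytes of V (the byte-wise popcounts here).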
23726 if (EltVT == MVT::i64) {
23727 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23728 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23729 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23730 return DAG.getBitcast(VT, V);
23733 if (EltVT == MVT::i32) {
23734 // We unpack the low half and high half into i32s interleaved with zeros so
23735 // that we can use PSADBW to horizontally sum them. The most useful part of
23736 // this is that it lines up the results of two PSADBW instructions to be
23737 // two v2i64 vectors which concatenated are the 4 population counts. We can
23738 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23739 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23740 SDValue V32 = DAG.getBitcast(VT, V);
23741 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23742 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23744 // Do the horizontal sums into two v2i64s.
23745 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23746 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23747 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23748 DAG.getBitcast(ByteVecVT, Low), Zeros);
23749 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23750 DAG.getBitcast(ByteVecVT, High), Zeros);
23752 // Merge them together.
23753 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23754 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23755 DAG.getBitcast(ShortVecVT, Low),
23756 DAG.getBitcast(ShortVecVT, High));
23758 return DAG.getBitcast(VT, V);
23761 // The only element type left is i16.
23762 assert(EltVT == MVT::i16 && "Unknown how to handle type");
23764 // To obtain pop count for each i16 element starting from the pop count for
23765 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
23766 // right by 8. It is important to shift as i16s as i8 vector shift isn't
23767 // directly supported.
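  // e.g. an i16 lane holding byte counts 0x02 and 0x03: shl 8 gives 0x0300,
  // the byte-wise add gives 0x0503, and the final i16 srl 8 gives 0x0005 = 5.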
23768 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23769 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23770 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23771 DAG.getBitcast(ByteVecVT, V));
23772 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23775 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23776 const X86Subtarget &Subtarget,
23777 SelectionDAG &DAG) {
23778 MVT VT = Op.getSimpleValueType();
23779 MVT EltVT = VT.getVectorElementType();
23780 unsigned VecSize = VT.getSizeInBits();
23782 // Implement a lookup table in register by using an algorithm based on:
23783 // http://wm.ite.pl/articles/sse-popcount.html
  // The general idea is that every lower byte nibble in the input vector is an
  // index into an in-register pre-computed pop count table. We then split up
  // the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
23793 // To obtain the pop count for elements != i8, we follow up with the same
23794 // approach and use additional tricks as described below.
23796 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23797 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23798 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23799 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
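  // e.g. for the byte 0xE4: LUT[0x4] = 1 for the low nibble and LUT[0xE] = 3
  // for the high nibble, and 1 + 3 = 4 is the pop count of 0xE4.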
23801 int NumByteElts = VecSize / 8;
23802 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23803 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23804 SmallVector<SDValue, 64> LUTVec;
23805 for (int i = 0; i < NumByteElts; ++i)
23806 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23807 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23808 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23811 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23812 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23815 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23817 // The input vector is used as the shuffle mask that index elements into the
23818 // LUT. After counting low and high nibbles, add the vector to obtain the
23819 // final pop count per i8 element.
23820 SDValue HighPopCnt =
23821 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23822 SDValue LowPopCnt =
23823 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23824 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
  if (EltVT == MVT::i8)
    return PopCnt;
23829 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23832 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23833 const X86Subtarget &Subtarget,
23834 SelectionDAG &DAG) {
23835 MVT VT = Op.getSimpleValueType();
23836 assert(VT.is128BitVector() &&
23837 "Only 128-bit vector bitmath lowering supported.");
23839 int VecSize = VT.getSizeInBits();
23840 MVT EltVT = VT.getVectorElementType();
23841 int Len = EltVT.getSizeInBits();
23843 // This is the vectorized version of the "best" algorithm from
23844 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23845 // with a minor tweak to use a series of adds + shifts instead of vector
23846 // multiplications. Implemented for all integer vector types. We only use
23847 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23848 // much faster, even faster than using native popcnt instructions.
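  // e.g. for the byte 0b11011010 (5 bits set): step 1 gives 0b10010101
  // (per-pair counts 2,1,1,1), step 2 gives 0b00110010 (per-nibble counts
  // 3,2) and step 3 gives 0b00000101 = 5.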
23850 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23851 MVT VT = V.getSimpleValueType();
23852 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23853 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23855 auto GetMask = [&](SDValue V, APInt Mask) {
23856 MVT VT = V.getSimpleValueType();
23857 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23858 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23861 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23862 // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyway
23864 // that handles any bits that sneak into the high bits of the byte elements.
23865 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23869 // v = v - ((v >> 1) & 0x55555555...)
23871 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23872 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23873 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23875 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23876 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23877 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23878 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23879 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23881 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23882 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23883 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23884 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23886 // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // size.
  if (EltVT == MVT::i8)
    return V;
23892 return LowerHorizontalByteSum(
23893 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23897 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23898 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23899 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23900 SelectionDAG &DAG) {
23901 MVT VT = Op.getSimpleValueType();
23902 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23903 "Unknown CTPOP type to handle");
23904 SDLoc DL(Op.getNode());
23905 SDValue Op0 = Op.getOperand(0);
23907 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23908 if (Subtarget.hasVPOPCNTDQ()) {
23909 unsigned NumElems = VT.getVectorNumElements();
23910 assert((VT.getVectorElementType() == MVT::i8 ||
23911 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
23912 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
23913 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
23914 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
23915 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
23916 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
23920 if (!Subtarget.hasSSSE3()) {
23921 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23922 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23923 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23926 // Decompose 256-bit ops into smaller 128-bit ops.
23927 if (VT.is256BitVector() && !Subtarget.hasInt256())
23928 return Lower256IntUnary(Op, DAG);
23930 // Decompose 512-bit ops into smaller 256-bit ops.
23931 if (VT.is512BitVector() && !Subtarget.hasBWI())
23932 return Lower512IntUnary(Op, DAG);
23934 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23937 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23938 SelectionDAG &DAG) {
23939 assert(Op.getSimpleValueType().isVector() &&
23940 "We only do custom lowering for vector population count.");
23941 return LowerVectorCTPOP(Op, Subtarget, DAG);
23944 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23945 MVT VT = Op.getSimpleValueType();
23946 SDValue In = Op.getOperand(0);
  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23950 // perform the BITREVERSE.
23951 if (!VT.isVector()) {
23952 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23953 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23954 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23955 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23956 DAG.getIntPtrConstant(0, DL));
23959 int NumElts = VT.getVectorNumElements();
23960 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23962 // Decompose 256-bit ops into smaller 128-bit ops.
23963 if (VT.is256BitVector())
23964 return Lower256IntUnary(Op, DAG);
23966 assert(VT.is128BitVector() &&
23967 "Only 128-bit vector bitreverse lowering supported.");
23969 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23970 // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
23973 SmallVector<SDValue, 16> MaskElts;
23974 for (int i = 0; i != NumElts; ++i) {
23975 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23976 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23977 int PermuteByte = SourceByte | (2 << 5);
23978 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23982 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23983 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23984 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23986 return DAG.getBitcast(VT, Res);
23989 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23990 SelectionDAG &DAG) {
23991 MVT VT = Op.getSimpleValueType();
23993 if (Subtarget.hasXOP() && !VT.is512BitVector())
23994 return LowerBITREVERSE_XOP(Op, DAG);
23996 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23998 SDValue In = Op.getOperand(0);
24001 unsigned NumElts = VT.getVectorNumElements();
24002 assert(VT.getScalarType() == MVT::i8 &&
24003 "Only byte vector BITREVERSE supported");
24005 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24006 if (VT.is256BitVector() && !Subtarget.hasInt256())
24007 return Lower256IntUnary(Op, DAG);
24009 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24010 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24011 // 0-15 value (moved to the other nibble).
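  // e.g. for the byte 0xA1 (0b10100001): LoLUT[0x1] = 0x80 reverses the low
  // nibble into the high nibble, HiLUT[0xA] = 0x05 reverses the high nibble
  // into the low nibble, and OR-ing them gives 0x85 = bitreverse(0xA1).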
24012 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24013 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24014 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24016 const int LoLUT[16] = {
24017 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24018 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24019 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24020 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24021 const int HiLUT[16] = {
24022 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24023 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24024 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24025 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
24027 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24028 for (unsigned i = 0; i < NumElts; ++i) {
24029 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24030 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24033 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24034 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24035 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24036 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24037 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24040 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24041 const X86Subtarget &Subtarget,
24042 bool AllowIncDec = true) {
24043 unsigned NewOpc = 0;
24044 switch (N->getOpcode()) {
24045 case ISD::ATOMIC_LOAD_ADD:
24046 NewOpc = X86ISD::LADD;
24048 case ISD::ATOMIC_LOAD_SUB:
24049 NewOpc = X86ISD::LSUB;
24051 case ISD::ATOMIC_LOAD_OR:
24052 NewOpc = X86ISD::LOR;
24054 case ISD::ATOMIC_LOAD_XOR:
24055 NewOpc = X86ISD::LXOR;
24057 case ISD::ATOMIC_LOAD_AND:
24058 NewOpc = X86ISD::LAND;
24061 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24064 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24066 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24067 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24068 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24069 DAG.getMachineFunction().getFunction().optForSize())) {
24070 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24071 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24072 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24073 DAG.getVTList(MVT::i32, MVT::Other),
24074 {N->getOperand(0), N->getOperand(1)},
24075 /*MemVT=*/N->getSimpleValueType(0), MMO);
24076 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24077 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24078 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24079 DAG.getVTList(MVT::i32, MVT::Other),
24080 {N->getOperand(0), N->getOperand(1)},
24081 /*MemVT=*/N->getSimpleValueType(0), MMO);
24085 return DAG.getMemIntrinsicNode(
24086 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24087 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24088 /*MemVT=*/N->getSimpleValueType(0), MMO);
24091 /// Lower atomic_load_ops into LOCK-prefixed operations.
24092 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24093 const X86Subtarget &Subtarget) {
24094 SDValue Chain = N->getOperand(0);
24095 SDValue LHS = N->getOperand(1);
24096 SDValue RHS = N->getOperand(2);
24097 unsigned Opc = N->getOpcode();
24098 MVT VT = N->getSimpleValueType(0);
24101 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24102 // can only be lowered when the result is unused. They should have already
24103 // been transformed into a cmpxchg loop in AtomicExpand.
24104 if (N->hasAnyUseOfValue(0)) {
24105 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24106 // select LXADD if LOCK_SUB can't be selected.
24107 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24108 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24109 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24110 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24111 RHS, AN->getMemOperand());
24113 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24114 "Used AtomicRMW ops other than Add should have been expanded!");
24118 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24119 // RAUW the chain, but don't worry about the result, as it's unused.
24120 assert(!N->hasAnyUseOfValue(0));
24121 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24125 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24126 SDNode *Node = Op.getNode();
24128 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24130 // Convert seq_cst store -> xchg
24131 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24132 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24133 // (The only way to get a 16-byte store is cmpxchg16b)
24134 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24135 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24136 AtomicOrdering::SequentiallyConsistent ||
24137 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24138 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24139 cast<AtomicSDNode>(Node)->getMemoryVT(),
24140 Node->getOperand(0),
24141 Node->getOperand(1), Node->getOperand(2),
24142 cast<AtomicSDNode>(Node)->getMemOperand());
24143 return Swap.getValue(1);
24145 // Other atomic stores have a simple pattern.
24149 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24150 SDNode *N = Op.getNode();
24151 MVT VT = N->getSimpleValueType(0);
24153 // Let legalize expand this if it isn't a legal type yet.
24154 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24157 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24160 // Set the carry flag.
24161 SDValue Carry = Op.getOperand(2);
24162 EVT CarryVT = Carry.getValueType();
24163 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24164 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24165 Carry, DAG.getConstant(NegOne, DL, CarryVT));
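  // Adding all-ones sets CF exactly when the incoming carry value is nonzero;
  // the flags result (value #1 of the node) then feeds the ADC/SBB below.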
24167 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24168 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24169 Op.getOperand(1), Carry.getValue(1));
24171 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24172 if (N->getValueType(1) == MVT::i1)
24173 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24175 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24178 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24179 SelectionDAG &DAG) {
24180 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24182 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24183 // which returns the values as { float, float } (in XMM0) or
24184 // { double, double } (which is returned in XMM0, XMM1).
24186 SDValue Arg = Op.getOperand(0);
24187 EVT ArgVT = Arg.getValueType();
24188 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24190 TargetLowering::ArgListTy Args;
24191 TargetLowering::ArgListEntry Entry;
24195 Entry.IsSExt = false;
24196 Entry.IsZExt = false;
24197 Args.push_back(Entry);
24199 bool isF64 = ArgVT == MVT::f64;
24200 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24201 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24202 // the results are returned via SRet in memory.
24203 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24204 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24205 const char *LibcallName = TLI.getLibcallName(LC);
24207 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24209 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24210 : (Type *)VectorType::get(ArgTy, 4);
24212 TargetLowering::CallLoweringInfo CLI(DAG);
24213 CLI.setDebugLoc(dl)
24214 .setChain(DAG.getEntryNode())
24215 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24217 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
24224 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24225 CallResult.first, DAG.getIntPtrConstant(0, dl));
24226 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24227 CallResult.first, DAG.getIntPtrConstant(1, dl));
24228 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24229 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24232 /// Widen a vector input to a vector of NVT. The
24233 /// input vector must have the same element type as NVT.
24234 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24235 bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;
24241 if (InOp.isUndef())
24242 return DAG.getUNDEF(NVT);
24244 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24245 "input and widen element type must match");
24247 unsigned InNumElts = InVT.getVectorNumElements();
24248 unsigned WidenNumElts = NVT.getVectorNumElements();
24249 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24250 "Unexpected request for vector widening");
24253 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24254 InOp.getNumOperands() == 2) {
24255 SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
24258 InOp = InOp.getOperand(0);
24259 InVT = InOp.getSimpleValueType();
24260 InNumElts = InVT.getVectorNumElements();
24263 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24264 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24265 SmallVector<SDValue, 16> Ops;
24266 for (unsigned i = 0; i < InNumElts; ++i)
24267 Ops.push_back(InOp.getOperand(i));
24269 EVT EltVT = InOp.getOperand(0).getValueType();
24271 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24272 DAG.getUNDEF(EltVT);
24273 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24274 Ops.push_back(FillVal);
24275 return DAG.getBuildVector(NVT, dl, Ops);
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
                                     DAG.getUNDEF(NVT);
24279 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24280 InOp, DAG.getIntPtrConstant(0, dl));
24283 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24284 SelectionDAG &DAG) {
24285 assert(Subtarget.hasAVX512() &&
24286 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24288 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24289 SDValue Src = N->getValue();
24290 MVT VT = Src.getSimpleValueType();
24291 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24294 SDValue Scale = N->getScale();
24295 SDValue Index = N->getIndex();
24296 SDValue Mask = N->getMask();
24297 SDValue Chain = N->getChain();
24298 SDValue BasePtr = N->getBasePtr();
24300 if (VT == MVT::v2f32) {
24301 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24302 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24303 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24304 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24305 DAG.getUNDEF(MVT::v2f32));
24306 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24307 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24308 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24309 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24310 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24311 return SDValue(NewScatter.getNode(), 1);
24316 if (VT == MVT::v2i32) {
24317 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24318 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
24319 DAG.getUNDEF(MVT::v2i32));
24320 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24321 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24322 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24323 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24324 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24325 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24326 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24327 return SDValue(NewScatter.getNode(), 1);
24329 // Custom widen all the operands to avoid promotion.
24330 EVT NewIndexVT = EVT::getVectorVT(
24331 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
24332 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
24333 DAG.getUNDEF(Index.getValueType()));
24334 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24335 DAG.getConstant(0, dl, MVT::v2i1));
24336 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24337 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
24338 Ops, N->getMemOperand());
24341 MVT IndexVT = Index.getSimpleValueType();
24342 MVT MaskVT = Mask.getSimpleValueType();
24344 // If the index is v2i32, we're being called by type legalization and we
24345 // should just let the default handling take care of it.
24346 if (IndexVT == MVT::v2i32)
  // If we don't have VLX and neither the source data nor the index is
  // 512 bits wide, we need to widen until one is.
24351 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24352 !Index.getSimpleValueType().is512BitVector()) {
24353 // Determine how much we need to widen by to get a 512-bit type.
24354 unsigned Factor = std::min(512/VT.getSizeInBits(),
24355 512/IndexVT.getSizeInBits());
24356 unsigned NumElts = VT.getVectorNumElements() * Factor;
24358 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
24359 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
24360 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24362 Src = ExtendToType(Src, VT, DAG);
24363 Index = ExtendToType(Index, IndexVT, DAG);
24364 Mask = ExtendToType(Mask, MaskVT, DAG, true);
24367 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
24368 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24369 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24370 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24371 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24372 return SDValue(NewScatter.getNode(), 1);
24375 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24376 SelectionDAG &DAG) {
24378 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24379 MVT VT = Op.getSimpleValueType();
24380 MVT ScalarVT = VT.getScalarType();
24381 SDValue Mask = N->getMask();
24384 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24385 "Expanding masked load is supported on AVX-512 target only!");
24387 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24388 "Expanding masked load is supported for 32 and 64-bit types only!");
24390 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24391 "Cannot lower masked load op.");
24393 assert((ScalarVT.getSizeInBits() >= 32 ||
24394 (Subtarget.hasBWI() &&
24395 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24396 "Unsupported masked load op.");
24398 // This operation is legal for targets with VLX, but without
24399 // VLX the vector should be widened to 512 bits.
24400 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
24401 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
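// For example, a v8i32 masked load widens to v16i32 with a v16i1 mask; the
// original vector is extracted back out of the wide load below.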
24402 SDValue Src0 = N->getSrc0();
24403 Src0 = ExtendToType(Src0, WideDataVT, DAG);
24405 // Mask element has to be i1.
24406 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
24407 "Unexpected mask type");
24409 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
24411 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24412 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
24413 N->getBasePtr(), Mask, Src0,
24414 N->getMemoryVT(), N->getMemOperand(),
24415 N->getExtensionType(),
24416 N->isExpandingLoad());
24418 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24419 NewLoad.getValue(0),
24420 DAG.getIntPtrConstant(0, dl));
24421 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
24422 return DAG.getMergeValues(RetOps, dl);
24425 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
24426 SelectionDAG &DAG) {
24427 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
24428 SDValue DataToStore = N->getValue();
24429 MVT VT = DataToStore.getSimpleValueType();
24430 MVT ScalarVT = VT.getScalarType();
24431 SDValue Mask = N->getMask();
24434 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
24435 "Expanding masked load is supported on AVX-512 target only!");
24437 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
24438 "Expanding masked load is supported for 32 and 64-bit types only!");
24440 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24441 "Cannot lower masked store op.");
24443 assert((ScalarVT.getSizeInBits() >= 32 ||
24444 (Subtarget.hasBWI() &&
24445 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24446 "Unsupported masked store op.");
24448 // This operation is legal for targets with VLX, but without
24449 // VLX the vector should be widened to 512 bits.
24450 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
24451 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
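// For example, a v16i16 masked store (which requires BWI per the assert
// above) widens to v32i16 with a v32i1 mask.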
24453 // Mask element has to be i1.
24454 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
24455 "Unexpected mask type");
24457 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
24459 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
24460 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24461 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
24462 Mask, N->getMemoryVT(), N->getMemOperand(),
24463 N->isTruncatingStore(), N->isCompressingStore());
24466 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24467 SelectionDAG &DAG) {
24468 assert(Subtarget.hasAVX2() &&
24469 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
24471 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24473 MVT VT = Op.getSimpleValueType();
24474 SDValue Index = N->getIndex();
24475 SDValue Mask = N->getMask();
24476 SDValue Src0 = N->getValue();
24477 MVT IndexVT = Index.getSimpleValueType();
24478 MVT MaskVT = Mask.getSimpleValueType();
24480 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24482 // If the index is v2i32, we're being called by type legalization.
24483 if (IndexVT == MVT::v2i32)
24486 // If we don't have VLX and neither the passthru nor the index is 512 bits, we
24487 // need to widen until one is.
24489 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24490 !IndexVT.is512BitVector()) {
24491 // Determine how much we need to widen by to get a 512-bit type.
24492 unsigned Factor = std::min(512/VT.getSizeInBits(),
24493 512/IndexVT.getSizeInBits());
24495 unsigned NumElts = VT.getVectorNumElements() * Factor;
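// For instance, a v2f64 gather with a v2i64 index widens by
// Factor = min(512/128, 512/128) = 4, giving v8f64 data, a v8i64 index and
// a v8i1 mask, with both data and index now 512 bits wide.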
24497 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
24498 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
24499 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24501 Src0 = ExtendToType(Src0, VT, DAG);
24502 Index = ExtendToType(Index, IndexVT, DAG);
24503 Mask = ExtendToType(Mask, MaskVT, DAG, true);
24506 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
24508 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24509 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24510 N->getMemOperand());
24511 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
24512 NewGather, DAG.getIntPtrConstant(0, dl));
24513 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
24516 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24517 SelectionDAG &DAG) const {
24518 // TODO: Eventually, the lowering of these nodes should be informed by or
24519 // deferred to the GC strategy for the function in which they appear. For
24520 // now, however, they must be lowered to something. Since they are logically
24521 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24522 // require special handling for these nodes), lower them as literal NOOPs for the time being.
24524 SmallVector<SDValue, 2> Ops;
24526 Ops.push_back(Op.getOperand(0));
24527 if (Op->getGluedNode())
24528 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24531 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24532 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24537 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24538 SelectionDAG &DAG) const {
24539 // TODO: Eventually, the lowering of these nodes should be informed by or
24540 // deferred to the GC strategy for the function in which they appear. For
24541 // now, however, they must be lowered to something. Since they are logically
24542 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24543 // require special handling for these nodes), lower them as literal NOOPs for the time being.
24545 SmallVector<SDValue, 2> Ops;
24547 Ops.push_back(Op.getOperand(0));
24548 if (Op->getGluedNode())
24549 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24552 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24553 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24558 /// Provide custom lowering hooks for some operations.
24559 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24560 switch (Op.getOpcode()) {
24561 default: llvm_unreachable("Should not custom lower this!");
24562 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24563 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24564 return LowerCMP_SWAP(Op, Subtarget, DAG);
24565 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24566 case ISD::ATOMIC_LOAD_ADD:
24567 case ISD::ATOMIC_LOAD_SUB:
24568 case ISD::ATOMIC_LOAD_OR:
24569 case ISD::ATOMIC_LOAD_XOR:
24570 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24571 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24572 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24573 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24574 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24575 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24576 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24577 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24578 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24579 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24580 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24581 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24582 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24583 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24584 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24585 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24586 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24587 case ISD::SHL_PARTS:
24588 case ISD::SRA_PARTS:
24589 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24590 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24591 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24592 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24593 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24594 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24595 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24596 case ISD::ZERO_EXTEND_VECTOR_INREG:
24597 case ISD::SIGN_EXTEND_VECTOR_INREG:
24598 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24599 case ISD::FP_TO_SINT:
24600 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24601 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24602 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24604 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24605 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24606 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24607 case ISD::SETCC: return LowerSETCC(Op, DAG);
24608 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24609 case ISD::SELECT: return LowerSELECT(Op, DAG);
24610 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24611 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24612 case ISD::VASTART: return LowerVASTART(Op, DAG);
24613 case ISD::VAARG: return LowerVAARG(Op, DAG);
24614 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24615 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24616 case ISD::INTRINSIC_VOID:
24617 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24618 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24619 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24620 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24621 case ISD::FRAME_TO_ARGS_OFFSET:
24622 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24623 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24624 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24625 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24626 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24627 case ISD::EH_SJLJ_SETUP_DISPATCH:
24628 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24629 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24630 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24631 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24633 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24635 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24636 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24638 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24639 case ISD::UMUL_LOHI:
24640 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24642 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24645 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24651 case ISD::UMULO: return LowerXALUO(Op, DAG);
24652 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24653 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24654 case ISD::ADDCARRY:
24655 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24657 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24661 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24662 case ISD::ABS: return LowerABS(Op, DAG);
24663 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24664 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24665 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24666 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24667 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24668 case ISD::GC_TRANSITION_START:
24669 return LowerGC_TRANSITION_START(Op, DAG);
24670 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24674 /// Places new result values for the node in Results (their number
24675 /// and types must exactly match those of the original return values of
24676 /// the node), or leaves Results empty, which indicates that the node is not
24677 /// to be custom lowered after all.
24678 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24679 SmallVectorImpl<SDValue> &Results,
24680 SelectionDAG &DAG) const {
24681 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24683 if (!Res.getNode())
24686 assert((N->getNumValues() <= Res->getNumValues()) &&
24687 "Lowering returned the wrong number of results!");
24689 // Place new result values based on the result number of N.
24690 // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
24691 // original node; the chain (the last value) should be dropped.
24692 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24693 Results.push_back(Res.getValue(I));
24696 /// Replace a node with an illegal result type with a new node built out of custom code.
24698 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24699 SmallVectorImpl<SDValue>&Results,
24700 SelectionDAG &DAG) const {
24702 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24703 switch (N->getOpcode()) {
24705 llvm_unreachable("Do not know how to custom type legalize this operation!");
24706 case X86ISD::AVG: {
24707 // Legalize types for X86ISD::AVG by expanding vectors.
24708 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24710 auto InVT = N->getValueType(0);
24711 auto InVTSize = InVT.getSizeInBits();
24712 const unsigned RegSize =
24713 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24714 assert((Subtarget.hasBWI() || RegSize < 512) &&
24715 "512-bit vector requires AVX512BW");
24716 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24717 "256-bit vector requires AVX2");
24719 auto ElemVT = InVT.getVectorElementType();
24720 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24721 RegSize / ElemVT.getSizeInBits());
24722 assert(RegSize % InVT.getSizeInBits() == 0);
24723 unsigned NumConcat = RegSize / InVT.getSizeInBits();
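// For example, a v8i8 AVG (64 bits) uses NumConcat = 2: each operand is
// concatenated with one undef v8i8 into a v16i8, the average is computed at
// that width, and the low v8i8 is extracted back out below (unless widening
// legalization already wants the wide result).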
24725 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24726 Ops[0] = N->getOperand(0);
24727 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24728 Ops[0] = N->getOperand(1);
24729 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24731 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24732 if (!ExperimentalVectorWideningLegalization)
24733 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24734 DAG.getIntPtrConstant(0, dl));
24735 Results.push_back(Res);
24738 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24739 case X86ISD::FMINC:
24741 case X86ISD::FMAXC:
24742 case X86ISD::FMAX: {
24743 EVT VT = N->getValueType(0);
24744 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24745 SDValue UNDEF = DAG.getUNDEF(VT);
24746 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24747 N->getOperand(0), UNDEF);
24748 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24749 N->getOperand(1), UNDEF);
24750 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24758 case ISD::UDIVREM: {
24759 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24760 Results.push_back(V);
24763 case ISD::FP_TO_SINT:
24764 case ISD::FP_TO_UINT: {
24765 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24767 if (N->getValueType(0) == MVT::v2i32) {
24768 assert((IsSigned || Subtarget.hasAVX512()) &&
24769 "Can only handle signed conversion without AVX512");
24770 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24771 SDValue Src = N->getOperand(0);
24772 if (Src.getValueType() == MVT::v2f64) {
24773 MVT ResVT = MVT::v4i32;
24774 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
24775 if (!IsSigned && !Subtarget.hasVLX()) {
24776 // Widen to 512-bits.
24777 ResVT = MVT::v8i32;
24778 Opc = ISD::FP_TO_UINT;
24779 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
24780 DAG.getUNDEF(MVT::v8f64),
24781 Src, DAG.getIntPtrConstant(0, dl));
24783 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
24784 ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
24786 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
24787 DAG.getIntPtrConstant(0, dl));
24788 Results.push_back(Res);
24791 if (Src.getValueType() == MVT::v2f32) {
24792 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24793 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24794 DAG.getUNDEF(MVT::v2f32));
24795 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24796 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24797 if (!ExperimentalVectorWideningLegalization)
24798 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24799 Results.push_back(Res);
24803 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24804 // so early out here.
24808 std::pair<SDValue,SDValue> Vals =
24809 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24810 SDValue FIST = Vals.first, StackSlot = Vals.second;
24811 if (FIST.getNode()) {
24812 EVT VT = N->getValueType(0);
24813 // Return a load from the stack slot.
24814 if (StackSlot.getNode())
24816 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24818 Results.push_back(FIST);
24822 case ISD::SINT_TO_FP: {
24823 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24824 SDValue Src = N->getOperand(0);
24825 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24827 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24830 case ISD::UINT_TO_FP: {
24831 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24832 EVT VT = N->getValueType(0);
24833 if (VT != MVT::v2f32)
24835 SDValue Src = N->getOperand(0);
24836 EVT SrcVT = Src.getValueType();
24837 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24838 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24841 if (SrcVT != MVT::v2i32)
24843 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24845 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
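// 0x4330000000000000 is the double 2^52. OR-ing a zero-extended 32-bit value
// into the low mantissa bits of 2^52 produces exactly 2^52 + x, so the
// FSUB of the bias below recovers x as an exact double before rounding
// down to f32.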
24846 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24847 DAG.getBitcast(MVT::v2i64, VBias));
24848 Or = DAG.getBitcast(MVT::v2f64, Or);
24849 // TODO: Are there any fast-math-flags to propagate here?
24850 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24851 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24854 case ISD::FP_ROUND: {
24855 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24857 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24858 Results.push_back(V);
24861 case ISD::FP_EXTEND: {
24862 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24863 // No other ValueType for FP_EXTEND should reach this point.
24864 assert(N->getValueType(0) == MVT::v2f32 &&
24865 "Do not know how to legalize this Node");
24868 case ISD::INTRINSIC_W_CHAIN: {
24869 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24871 default : llvm_unreachable("Do not know how to custom type "
24872 "legalize this intrinsic operation!");
24873 case Intrinsic::x86_rdtsc:
24874 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24876 case Intrinsic::x86_rdtscp:
24877 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24879 case Intrinsic::x86_rdpmc:
24880 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24882 case Intrinsic::x86_xgetbv:
24883 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24886 case ISD::INTRINSIC_WO_CHAIN: {
24887 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
24888 Results.push_back(V);
24891 case ISD::READCYCLECOUNTER: {
24892 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24895 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24896 EVT T = N->getValueType(0);
24897 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24898 bool Regs64bit = T == MVT::i128;
24899 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24900 SDValue cpInL, cpInH;
24901 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24902 DAG.getConstant(0, dl, HalfT));
24903 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24904 DAG.getConstant(1, dl, HalfT));
24905 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24906 Regs64bit ? X86::RAX : X86::EAX,
24908 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24909 Regs64bit ? X86::RDX : X86::EDX,
24910 cpInH, cpInL.getValue(1));
24911 SDValue swapInL, swapInH;
24912 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24913 DAG.getConstant(0, dl, HalfT));
24914 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24915 DAG.getConstant(1, dl, HalfT));
24917 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24918 swapInH, cpInH.getValue(1));
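// CMPXCHG8B/CMPXCHG16B implicitly compares EDX:EAX (RDX:RAX) with memory
// and, on a match, stores ECX:EBX (RCX:RBX), which is why the operand
// halves are threaded through those fixed registers.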
24919 // If the current function needs the base pointer, RBX,
24920 // we shouldn't use cmpxchg directly.
24921 // The lowering of that instruction clobbers RBX, and since RBX is then
24922 // a reserved register, the register allocator will not make sure its
24923 // value is properly saved and restored around this live range.
24924 // That is why we go through the LCMPXCHG*_SAVE_*BX pseudos below.
24925 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24927 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24928 unsigned BasePtr = TRI->getBaseRegister();
24929 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24930 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24931 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24932 // ISel prefers the LCMPXCHG64 variant.
24933 // If the assert below fires, that is no longer the case, and we need to
24934 // teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX. This is a
24935 // matter of accepting an i64 input for that pseudo and restoring it into
24936 // a register of the right width in the expand pseudo. Everything else
24937 // should just work.
24938 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24939 "Saving only half of the RBX");
24940 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24941 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24942 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24943 Regs64bit ? X86::RBX : X86::EBX,
24944 HalfT, swapInH.getValue(1));
24945 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24947 /*Glue*/ RBXSave.getValue(2)};
24948 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24951 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24952 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24953 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24954 swapInH.getValue(1));
24955 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24956 swapInL.getValue(1)};
24957 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24959 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24960 Regs64bit ? X86::RAX : X86::EAX,
24961 HalfT, Result.getValue(1));
24962 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24963 Regs64bit ? X86::RDX : X86::EDX,
24964 HalfT, cpOutL.getValue(2));
24965 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24967 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24968 MVT::i32, cpOutH.getValue(2));
24969 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24970 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24972 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24973 Results.push_back(Success);
24974 Results.push_back(EFLAGS.getValue(1));
24977 case ISD::ATOMIC_SWAP:
24978 case ISD::ATOMIC_LOAD_ADD:
24979 case ISD::ATOMIC_LOAD_SUB:
24980 case ISD::ATOMIC_LOAD_AND:
24981 case ISD::ATOMIC_LOAD_OR:
24982 case ISD::ATOMIC_LOAD_XOR:
24983 case ISD::ATOMIC_LOAD_NAND:
24984 case ISD::ATOMIC_LOAD_MIN:
24985 case ISD::ATOMIC_LOAD_MAX:
24986 case ISD::ATOMIC_LOAD_UMIN:
24987 case ISD::ATOMIC_LOAD_UMAX:
24988 case ISD::ATOMIC_LOAD: {
24989 // Delegate to generic TypeLegalization. Situations we can really handle
24990 // should have already been dealt with by AtomicExpandPass.cpp.
24993 case ISD::BITCAST: {
24994 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24995 EVT DstVT = N->getValueType(0);
24996 EVT SrcVT = N->getOperand(0).getValueType();
24998 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target
24999 // we can split using the k-register rather than memory.
25000 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25001 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25002 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v32i1,
25004 DAG.getIntPtrConstant(0, dl));
25005 Lo = DAG.getBitcast(MVT::i32, Lo);
25006 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v32i1,
25008 DAG.getIntPtrConstant(32, dl));
25009 Hi = DAG.getBitcast(MVT::i32, Hi);
25010 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25011 Results.push_back(Res);
25015 if (SrcVT != MVT::f64 ||
25016 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25019 unsigned NumElts = DstVT.getVectorNumElements();
25020 EVT SVT = DstVT.getVectorElementType();
25021 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25022 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25023 MVT::v2f64, N->getOperand(0));
25024 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25026 if (ExperimentalVectorWideningLegalization) {
25027 // If we are legalizing vectors by widening, we already have the desired
25028 // legal vector type, just return it.
25029 Results.push_back(ToVecInt);
25033 SmallVector<SDValue, 8> Elts;
25034 for (unsigned i = 0, e = NumElts; i != e; ++i)
25035 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25036 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25038 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25041 case ISD::MGATHER: {
25042 EVT VT = N->getValueType(0);
25043 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25044 auto *Gather = cast<MaskedGatherSDNode>(N);
25045 SDValue Index = Gather->getIndex();
25046 if (Index.getValueType() != MVT::v2i64)
25048 SDValue Mask = Gather->getMask();
25049 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25050 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25051 Gather->getValue(),
25052 DAG.getUNDEF(MVT::v2f32));
25053 if (!Subtarget.hasVLX()) {
25054 // We need to widen the mask, but the instruction will only use 2
25055 // of its elements. So we can use undef.
25056 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25057 DAG.getUNDEF(MVT::v2i1));
25058 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
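// Without AVX-512 the AVX2 gather instructions consume a vector mask whose
// element sign bits select the active lanes, hence the sign-extension to
// element-sized v4i32 above.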
25060 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25061 Index, Gather->getScale() };
25062 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25063 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25064 Gather->getMemoryVT(), Gather->getMemOperand());
25065 Results.push_back(Res);
25066 Results.push_back(Res.getValue(2));
25069 if (VT == MVT::v2i32) {
25070 auto *Gather = cast<MaskedGatherSDNode>(N);
25071 SDValue Index = Gather->getIndex();
25072 SDValue Mask = Gather->getMask();
25073 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25074 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25075 Gather->getValue(),
25076 DAG.getUNDEF(MVT::v2i32));
25077 // If the index is v2i64 we can use it directly.
25078 if (Index.getValueType() == MVT::v2i64 &&
25079 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25080 if (!Subtarget.hasVLX()) {
25081 // We need to widen the mask, but the instruction will only use 2
25082 // of its elements. So we can use undef.
25083 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25084 DAG.getUNDEF(MVT::v2i1));
25085 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25087 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25088 Index, Gather->getScale() };
25089 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25090 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25091 Gather->getMemoryVT(), Gather->getMemOperand());
25092 SDValue Chain = Res.getValue(2);
25093 if (!ExperimentalVectorWideningLegalization)
25094 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25095 DAG.getIntPtrConstant(0, dl));
25096 Results.push_back(Res);
25097 Results.push_back(Chain);
25100 EVT IndexVT = Index.getValueType();
25101 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25102 IndexVT.getScalarType(), 4);
25103 // Otherwise we need to custom widen everything to avoid promotion.
25104 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25105 DAG.getUNDEF(IndexVT));
25106 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25107 DAG.getConstant(0, dl, MVT::v2i1));
25108 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25109 Index, Gather->getScale() };
25110 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25111 Gather->getMemoryVT(), dl, Ops,
25112 Gather->getMemOperand());
25113 SDValue Chain = Res.getValue(1);
25114 if (!ExperimentalVectorWideningLegalization)
25115 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25116 DAG.getIntPtrConstant(0, dl));
25117 Results.push_back(Res);
25118 Results.push_back(Chain);
25126 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25127 switch ((X86ISD::NodeType)Opcode) {
25128 case X86ISD::FIRST_NUMBER: break;
25129 case X86ISD::BSF: return "X86ISD::BSF";
25130 case X86ISD::BSR: return "X86ISD::BSR";
25131 case X86ISD::SHLD: return "X86ISD::SHLD";
25132 case X86ISD::SHRD: return "X86ISD::SHRD";
25133 case X86ISD::FAND: return "X86ISD::FAND";
25134 case X86ISD::FANDN: return "X86ISD::FANDN";
25135 case X86ISD::FOR: return "X86ISD::FOR";
25136 case X86ISD::FXOR: return "X86ISD::FXOR";
25137 case X86ISD::FILD: return "X86ISD::FILD";
25138 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25139 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25140 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25141 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25142 case X86ISD::FLD: return "X86ISD::FLD";
25143 case X86ISD::FST: return "X86ISD::FST";
25144 case X86ISD::CALL: return "X86ISD::CALL";
25145 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25146 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25147 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25148 case X86ISD::BT: return "X86ISD::BT";
25149 case X86ISD::CMP: return "X86ISD::CMP";
25150 case X86ISD::COMI: return "X86ISD::COMI";
25151 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25152 case X86ISD::CMPM: return "X86ISD::CMPM";
25153 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25154 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25155 case X86ISD::SETCC: return "X86ISD::SETCC";
25156 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25157 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25158 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25159 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25160 case X86ISD::CMOV: return "X86ISD::CMOV";
25161 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25162 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25163 case X86ISD::IRET: return "X86ISD::IRET";
25164 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25165 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25166 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25167 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25168 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25169 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25170 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25171 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25172 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25173 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25174 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25175 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25176 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25177 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25178 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25179 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25180 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25181 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25182 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25183 case X86ISD::HADD: return "X86ISD::HADD";
25184 case X86ISD::HSUB: return "X86ISD::HSUB";
25185 case X86ISD::FHADD: return "X86ISD::FHADD";
25186 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25187 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25188 case X86ISD::FMAX: return "X86ISD::FMAX";
25189 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25190 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25191 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25192 case X86ISD::FMIN: return "X86ISD::FMIN";
25193 case X86ISD::FMINS: return "X86ISD::FMINS";
25194 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25195 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25196 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25197 case X86ISD::FMINC: return "X86ISD::FMINC";
25198 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25199 case X86ISD::FRCP: return "X86ISD::FRCP";
25200 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25201 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25202 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25203 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25204 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25205 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25206 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25207 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25208 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25209 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25210 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25211 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25212 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25213 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25214 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25215 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25216 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25217 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25218 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25219 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25220 case X86ISD::LADD: return "X86ISD::LADD";
25221 case X86ISD::LSUB: return "X86ISD::LSUB";
25222 case X86ISD::LOR: return "X86ISD::LOR";
25223 case X86ISD::LXOR: return "X86ISD::LXOR";
25224 case X86ISD::LAND: return "X86ISD::LAND";
25225 case X86ISD::LINC: return "X86ISD::LINC";
25226 case X86ISD::LDEC: return "X86ISD::LDEC";
25227 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25228 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25229 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25230 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25231 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25232 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25233 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25234 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25235 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25236 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25237 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25238 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25239 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25240 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25241 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25242 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25243 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25244 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25245 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25246 case X86ISD::VSHL: return "X86ISD::VSHL";
25247 case X86ISD::VSRL: return "X86ISD::VSRL";
25248 case X86ISD::VSRA: return "X86ISD::VSRA";
25249 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25250 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25251 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25252 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25253 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25254 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25255 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25256 case X86ISD::CMPP: return "X86ISD::CMPP";
25257 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25258 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25259 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25260 case X86ISD::ADD: return "X86ISD::ADD";
25261 case X86ISD::SUB: return "X86ISD::SUB";
25262 case X86ISD::ADC: return "X86ISD::ADC";
25263 case X86ISD::SBB: return "X86ISD::SBB";
25264 case X86ISD::SMUL: return "X86ISD::SMUL";
25265 case X86ISD::UMUL: return "X86ISD::UMUL";
25266 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25267 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25268 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25269 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25270 case X86ISD::INC: return "X86ISD::INC";
25271 case X86ISD::DEC: return "X86ISD::DEC";
25272 case X86ISD::OR: return "X86ISD::OR";
25273 case X86ISD::XOR: return "X86ISD::XOR";
25274 case X86ISD::AND: return "X86ISD::AND";
25275 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25276 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25277 case X86ISD::PTEST: return "X86ISD::PTEST";
25278 case X86ISD::TESTP: return "X86ISD::TESTP";
25279 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25280 case X86ISD::KTEST: return "X86ISD::KTEST";
25281 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25282 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25283 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25284 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25285 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25286 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25287 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25288 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25289 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25290 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25291 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25292 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25293 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25294 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25295 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25296 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25297 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25298 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25299 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25300 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25301 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25302 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25303 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25304 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25305 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25306 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25307 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25308 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25309 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25310 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25311 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25312 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25313 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25314 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25315 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25316 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25317 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25318 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25319 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25320 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25321 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25322 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25323 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25324 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25325 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25326 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25327 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25328 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25329 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25330 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25331 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25332 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25333 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25334 case X86ISD::SAHF: return "X86ISD::SAHF";
25335 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25336 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25337 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25338 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25339 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25340 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25341 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25342 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25343 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25344 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25345 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25346 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25347 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25348 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25349 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25350 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25351 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25352 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25353 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25354 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25355 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25356 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25357 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25358 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25359 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25360 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25361 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25362 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25363 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25364 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25365 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25366 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25367 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25368 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25369 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25370 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25371 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25372 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25373 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25374 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25375 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25376 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25377 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25378 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25379 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25380 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25381 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25382 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25383 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25384 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25385 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25386 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25387 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25388 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25389 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25390 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25391 case X86ISD::XTEST: return "X86ISD::XTEST";
25392 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25393 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25394 case X86ISD::SELECT: return "X86ISD::SELECT";
25395 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25396 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25397 case X86ISD::RCP14: return "X86ISD::RCP14";
25398 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25399 case X86ISD::RCP28: return "X86ISD::RCP28";
25400 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25401 case X86ISD::EXP2: return "X86ISD::EXP2";
25402 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25403 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25404 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25405 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25406 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25407 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25408 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25409 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25410 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25411 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25412 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25413 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25414 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25415 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25416 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25417 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25418 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25419 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25420 case X86ISD::ADDS: return "X86ISD::ADDS";
25421 case X86ISD::SUBS: return "X86ISD::SUBS";
25422 case X86ISD::AVG: return "X86ISD::AVG";
25423 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25424 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25425 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25426 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25427 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25428 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25429 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25430 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25431 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25432 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25433 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25434 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25435 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25436 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25437 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25438 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25439 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25440 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25441 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25442 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25443 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25444 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25445 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25446 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25447 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25448 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25449 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25450 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25451 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25452 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25453 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25454 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25455 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25456 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25457 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25458 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25463 /// Return true if the addressing mode represented by AM is legal for this
25464 /// target, for a load/store of the specified type.
25465 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25466 const AddrMode &AM, Type *Ty,
25468 Instruction *I) const {
25469 // X86 supports extremely general addressing modes.
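// The general form is base + index*scale + disp, with scale in {1, 2, 4, 8}
// and a sign-extended 32-bit displacement.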
25470 CodeModel::Model M = getTargetMachine().getCodeModel();
25472 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25473 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25477 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25479 // If a reference to this global requires an extra load, we can't fold it.
25480 if (isGlobalStubReference(GVFlags))
25483 // If BaseGV requires a register for the PIC base, we cannot also have a
25484 // BaseReg specified.
25485 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25488 // If lower 4G is not available, then we must use rip-relative addressing.
25489 if ((M != CodeModel::Small || isPositionIndependent()) &&
25490 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25494 switch (AM.Scale) {
25500 // These scales always work.
25505 // These scales are formed with basereg+scalereg. Only accept if there is no base register yet.
25510 default: // Other stuff never works.
25517 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25518 unsigned Bits = Ty->getScalarSizeInBits();
25520 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25521 // particularly cheaper than those without.
25525 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
25526 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
25527 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
25530 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25531 // shifts just as cheap as scalar ones.
25532 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25535 // AVX512BW has shifts such as vpsllvw.
25536 if (Subtarget.hasBWI() && Bits == 16)
25539 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25540 // fully general vector.
25544 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25545 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25547 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25548 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25549 return NumBits1 > NumBits2;
25552 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25553 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25556 if (!isTypeLegal(EVT::getEVT(Ty1)))
25559 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25561 // Assuming the caller doesn't have a zeroext or signext return parameter,
25562 // truncation all the way down to i1 is valid.
25566 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25567 return isInt<32>(Imm);
25570 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25571 // Can also use sub to handle negated immediates.
25572 return isInt<32>(Imm);
25575 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25576 if (!VT1.isInteger() || !VT2.isInteger())
25578 unsigned NumBits1 = VT1.getSizeInBits();
25579 unsigned NumBits2 = VT2.getSizeInBits();
25580 return NumBits1 > NumBits2;
25583 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25584 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25585 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25588 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25589 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25590 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25593 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25594 EVT VT1 = Val.getValueType();
25595 if (isZExtFree(VT1, VT2))
25598 if (Val.getOpcode() != ISD::LOAD)
25601 if (!VT1.isSimple() || !VT1.isInteger() ||
25602 !VT2.isSimple() || !VT2.isInteger())
25605 switch (VT1.getSimpleVT().SimpleTy) {
25610 // X86 has 8, 16, and 32-bit zero-extending loads.
25617 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
25618 EVT SrcVT = ExtVal.getOperand(0).getValueType();
25620 // There is no extending load for vXi1.
25621 if (SrcVT.getScalarType() == MVT::i1)
25628 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
25629 if (!Subtarget.hasAnyFMA())
25632 VT = VT.getScalarType();
25634 if (!VT.isSimple())
25637 switch (VT.getSimpleVT().SimpleTy) {
25648 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25649 // i16 instructions are longer (0x66 prefix) and potentially slower.
25650 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25653 /// Targets can use this to indicate that they only support *some*
25654 /// VECTOR_SHUFFLE operations, those with specific masks.
25655 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25656 /// are assumed to be legal.
25657 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25658 if (!VT.isSimple())
25661 // Not for i1 vectors
25662 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25665 // Very little shuffling can be done for 64-bit vectors right now.
25666 if (VT.getSimpleVT().getSizeInBits() == 64)
25669 // We only care that the types being shuffled are legal. The lowering can
25670 // handle any possible shuffle mask that results.
25671 return isTypeLegal(VT.getSimpleVT());
25675 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
25677 // Just delegate to the generic legality, clear masks aren't special.
25678 return isShuffleMaskLegal(Mask, VT);
25681 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
25682 // If the subtarget is using retpolines, we must not emit jump tables.
25683 if (Subtarget.useRetpoline())
25686 // Otherwise, fall back on the generic logic.
25687 return TargetLowering::areJTsAllowed(Fn);
25690 //===----------------------------------------------------------------------===//
25691 // X86 Scheduler Hooks
25692 //===----------------------------------------------------------------------===//
25694 /// Utility function to emit xbegin specifying the start of an RTM region.
25695 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25696 const TargetInstrInfo *TII) {
25697 DebugLoc DL = MI.getDebugLoc();
25699 const BasicBlock *BB = MBB->getBasicBlock();
25700 MachineFunction::iterator I = ++MBB->getIterator();
25702 // For the v = xbegin(), we generate
25711 // eax = # XABORT_DEF
25715 // v = phi(s0/mainBB, s1/fallBB)
25717 MachineBasicBlock *thisMBB = MBB;
25718 MachineFunction *MF = MBB->getParent();
25719 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25720 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25721 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25722 MF->insert(I, mainMBB);
25723 MF->insert(I, fallMBB);
25724 MF->insert(I, sinkMBB);
25726 // Transfer the remainder of BB and its successor edges to sinkMBB.
25727 sinkMBB->splice(sinkMBB->begin(), MBB,
25728 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25729 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25731 MachineRegisterInfo &MRI = MF->getRegInfo();
25732 unsigned DstReg = MI.getOperand(0).getReg();
25733 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25734 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25735 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25739 // # fallthrough to mainMBB
25740 // # on abort, branch to fallMBB
25741 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25742 thisMBB->addSuccessor(mainMBB);
25743 thisMBB->addSuccessor(fallMBB);
25746 // mainDstReg := -1
25747 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25748 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25749 mainMBB->addSuccessor(sinkMBB);
25752 // ; pseudo instruction to model hardware's definition from XABORT
25753 // EAX := XABORT_DEF
25754 // fallDstReg := EAX
25755 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25756 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25758 fallMBB->addSuccessor(sinkMBB);
25761 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25762 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25763 .addReg(mainDstReg).addMBB(mainMBB)
25764 .addReg(fallDstReg).addMBB(fallMBB);
  MI.eraseFromParent();
  return sinkMBB;
}
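// Illustrative sketch only (not part of the lowering): the diamond built above is
// what a use of the RTM intrinsics ultimately relies on. Assuming <immintrin.h>:
//
//   unsigned Status = _xbegin();          // -1 (_XBEGIN_STARTED) when the region starts
//   if (Status == _XBEGIN_STARTED) {
//     /* transactional work */
//     _xend();
//   } else {
//     /* fallback path; Status carries the abort code materialized into EAX */
//   }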
25770 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25771 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25772 // in the .td file.
25773 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25774 const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
25777 default: llvm_unreachable("illegal opcode!");
25778 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25779 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25780 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25781 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25782 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25783 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25784 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
  }
25788 DebugLoc dl = MI.getDebugLoc();
25789 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25791 unsigned NumArgs = MI.getNumOperands();
25792 for (unsigned i = 1; i < NumArgs; ++i) {
25793 MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
25797 if (MI.hasOneMemOperand())
25798 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25800 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25801 .addReg(X86::XMM0);
  MI.eraseFromParent();
  return BB;
}
25807 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25808 // defs in an instruction pattern
25809 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25810 const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
25813 default: llvm_unreachable("illegal opcode!");
25814 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25815 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25816 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25817 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25818 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25819 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25820 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }
25824 DebugLoc dl = MI.getDebugLoc();
25825 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25827 unsigned NumArgs = MI.getNumOperands(); // remove the results
25828 for (unsigned i = 1; i < NumArgs; ++i) {
25829 MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
25833 if (MI.hasOneMemOperand())
25834 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25836 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::ECX);

  MI.eraseFromParent();
  return BB;
}
25843 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25844 const X86Subtarget &Subtarget) {
25845 DebugLoc dl = MI.getDebugLoc();
25846 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25848 // insert input VAL into EAX
25849 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25850 .addReg(MI.getOperand(0).getReg());
25851 // insert zero to ECX
25852 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25854 // insert zero to EDX
25855 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25857 // insert WRPKRU instruction
25858 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
25864 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25865 const X86Subtarget &Subtarget) {
25866 DebugLoc dl = MI.getDebugLoc();
25867 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25869 // insert zero to ECX
25870 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25872 // insert RDPKRU instruction
25873 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::EAX);
  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
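// For reference (ISA-level convention mirrored by the two emitters above, stated
// here as background): WRPKRU consumes the new PKRU value in EAX and requires
// ECX == EDX == 0, while RDPKRU requires ECX == 0 and produces PKRU in EAX.
// A hypothetical use via <immintrin.h> would be:
//
//   _wrpkru(Val);                 // Val copied into EAX, ECX/EDX zeroed as above
//   unsigned Cur = _rdpkru_u32(); // ECX zeroed, result read back from EAX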
25881 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget,
                                      unsigned Opc) {
25884 DebugLoc dl = MI.getDebugLoc();
25885 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25886 // Address into RAX/EAX, other two args into ECX, EDX.
25887 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25888 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25889 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25890 for (int i = 0; i < X86::AddrNumOperands; ++i)
25891 MIB.add(MI.getOperand(i));
25893 unsigned ValOps = X86::AddrNumOperands;
25894 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25895 .addReg(MI.getOperand(ValOps).getReg());
25896 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25897 .addReg(MI.getOperand(ValOps + 1).getReg());
25899 // The instruction doesn't actually take any operands though.
25900 BuildMI(*BB, MI, dl, TII->get(Opc));
  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
25906 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25907 const X86Subtarget &Subtarget) {
25908 DebugLoc dl = MI->getDebugLoc();
25909 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25910 // Address into RAX/EAX
25911 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25912 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25913 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25914 for (int i = 0; i < X86::AddrNumOperands; ++i)
25915 MIB.add(MI->getOperand(i));
25917 // The instruction doesn't actually take any operands though.
25918 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
25926 MachineBasicBlock *
25927 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25928 MachineBasicBlock *MBB) const {
25929 // Emit va_arg instruction on X86-64.
25931 // Operands to this pseudo-instruction:
25932 // 0 ) Output : destination address (reg)
25933 // 1-5) Input : va_list address (addr, i64mem)
25934 // 6 ) ArgSize : Size (in bytes) of vararg type
25935 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25936 // 8 ) Align : Alignment of type
25937 // 9 ) EFLAGS (implicit-def)
25939 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25940 static_assert(X86::AddrNumOperands == 5,
25941 "VAARG_64 assumes 5 address operands");
25943 unsigned DestReg = MI.getOperand(0).getReg();
25944 MachineOperand &Base = MI.getOperand(1);
25945 MachineOperand &Scale = MI.getOperand(2);
25946 MachineOperand &Index = MI.getOperand(3);
25947 MachineOperand &Disp = MI.getOperand(4);
25948 MachineOperand &Segment = MI.getOperand(5);
25949 unsigned ArgSize = MI.getOperand(6).getImm();
25950 unsigned ArgMode = MI.getOperand(7).getImm();
25951 unsigned Align = MI.getOperand(8).getImm();
25953 // Memory Reference
25954 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25955 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25956 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25958 // Machine Information
25959 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25960 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25961 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25962 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25963 DebugLoc DL = MI.getDebugLoc();
  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8
25974 unsigned TotalNumIntRegs = 6;
25975 unsigned TotalNumXMMRegs = 8;
25976 bool UseGPOffset = (ArgMode == 1);
25977 bool UseFPOffset = (ArgMode == 2);
25978 unsigned MaxOffset = TotalNumIntRegs * 8 +
25979 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
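  // Worked example of the bound (illustration only): with 6 integer registers of
  // 8 bytes and 8 XMM registers of 16 bytes, MaxOffset is 6 * 8 = 48 when pulling
  // from gp_offset (ArgMode == 1) and 48 + 8 * 16 = 176 when pulling from
  // fp_offset (ArgMode == 2).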
25981 /* Align ArgSize to a multiple of 8 */
25982 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25983 bool NeedsAlign = (Align > 8);
25985 MachineBasicBlock *thisMBB = MBB;
25986 MachineBasicBlock *overflowMBB;
25987 MachineBasicBlock *offsetMBB;
25988 MachineBasicBlock *endMBB;
25990 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25991 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25992 unsigned OffsetReg = 0;
25994 if (!UseGPOffset && !UseFPOffset) {
25995 // If we only pull from the overflow region, we don't create a branch.
25996 // We don't need to alter control flow.
25997 OffsetDestReg = 0; // unused
25998 OverflowDestReg = DestReg;
26000 offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
26004 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26005 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26006 // If not, pull from overflow_area. (branch to overflowMBB)
    //
    //     thisMBB
    //      /   \
    // offsetMBB overflowMBB
    //      \   /
    //      endMBB
    //
26016 // Registers for the PHI in endMBB
26017 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26018 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26020 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26021 MachineFunction *MF = MBB->getParent();
26022 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26023 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26024 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26026 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26028 // Insert the new basic blocks
26029 MF->insert(MBBIter, offsetMBB);
26030 MF->insert(MBBIter, overflowMBB);
26031 MF->insert(MBBIter, endMBB);
26033 // Transfer the remainder of MBB and its successor edges to endMBB.
26034 endMBB->splice(endMBB->begin(), thisMBB,
26035 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26036 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26038 // Make offsetMBB and overflowMBB successors of thisMBB
26039 thisMBB->addSuccessor(offsetMBB);
26040 thisMBB->addSuccessor(overflowMBB);
26042 // endMBB is a successor of both offsetMBB and overflowMBB
26043 offsetMBB->addSuccessor(endMBB);
26044 overflowMBB->addSuccessor(endMBB);
26046 // Load the offset value into a register
26047 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);
26056 // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);
26061 // Branch to "overflowMBB" if offset >= max
26062 // Fall through to "offsetMBB" otherwise
26063 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);
26071 // Read the reg_save_area address.
26072 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);
26081 // Zero-extend the offset
26082 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);
26088 // Add the offset to the reg_save_area to get the final address.
26089 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26090 .addReg(OffsetReg64)
26091 .addReg(RegSaveReg);
26093 // Compute the offset for the next argument
26094 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);
26099 // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);
    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }
26115 // Emit code to use overflow area
26118 // Load the overflow_area address into a register.
26119 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MMOBegin, MMOEnd);
26128 // If we need to align it, do so. Otherwise, just copy the address
26129 // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
26132 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26133 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26135 // aligned_addr = (addr + (align-1)) & ~(align-1)
26136 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }
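  // Worked example of the rounding above (illustration only): with Align == 16 and
  // an overflow address ending in 0x38, adding 15 gives ...0x47 and masking with
  // ~15 yields ...0x40, the next 16-byte-aligned address at or above the original.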
26148 // Compute the next overflow address after this argument.
26149 // (the overflow address should be kept 8-byte aligned)
26150 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26151 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26152 .addReg(OverflowDestReg)
26153 .addImm(ArgSizeA8);
26155 // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);
  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
26169 .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }
26173 // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
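// Illustrative only: C-level source that exercises this inserter on the SysV
// x86-64 ABI (the function name is hypothetical):
//
//   int sum(int n, ...) {
//     va_list ap;
//     va_start(ap, n);
//     int s = 0;
//     for (int i = 0; i < n; ++i)
//       s += va_arg(ap, int);   // integer class: ArgMode == 1, gp_offset path
//     va_end(ap);
//     return s;
//   }
//
// A va_arg of double would instead take the ArgMode == 2 path and advance
// fp_offset by 16.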
26179 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26180 MachineInstr &MI, MachineBasicBlock *MBB) const {
26181 // Emit code to save XMM registers to the stack. The ABI says that the
26182 // number of registers to save is given in %al, so it's theoretically
26183 // possible to do an indirect jump trick to avoid saving all of them,
26184 // however this code takes a simpler approach and just executes all
26185 // of the stores if %al is non-zero. It's less code, and it's probably
26186 // easier on the hardware branch predictor, and stores aren't all that
26187 // expensive anyway.
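  //
  // Background on the %al convention assumed here (SysV x86-64): the caller of a
  // varargs function sets %al to an upper bound on the number of vector registers
  // actually used by the call, e.g. "movl $1, %eax" before calling printf with one
  // double, so a zero %al lets this prologue skip every XMM store.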
26189 // Create the new basic blocks. One block contains all the XMM stores,
26190 // and one block is the final destination regardless of whether any
26191 // stores were performed.
26192 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26193 MachineFunction *F = MBB->getParent();
26194 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26195 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26196 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26197 F->insert(MBBIter, XMMSaveMBB);
26198 F->insert(MBBIter, EndMBB);
26200 // Transfer the remainder of MBB and its successor edges to EndMBB.
26201 EndMBB->splice(EndMBB->begin(), MBB,
26202 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26203 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26205 // The original block will now fall through to the XMM save block.
26206 MBB->addSuccessor(XMMSaveMBB);
26207 // The XMMSaveMBB will fall through to the end block.
26208 XMMSaveMBB->addSuccessor(EndMBB);
26210 // Now add the instructions.
26211 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26212 DebugLoc DL = MI.getDebugLoc();
26214 unsigned CountReg = MI.getOperand(0).getReg();
26215 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26216 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26218 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26219 // If %al is 0, branch around the XMM save block.
26220 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26221 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }
26225 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26226 // that was just emitted, but clearly shouldn't be "saved".
26227 assert((MI.getNumOperands() <= 3 ||
26228 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26229 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26230 "Expected last argument to be EFLAGS");
26231 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26232 // In the XMM save block, save all the XMM argument registers.
26233 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26234 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26235 MachineMemOperand *MMO = F->getMachineMemOperand(
26236 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26237 MachineMemOperand::MOStore,
26238 /*Size=*/16, /*Align=*/16);
26239 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26240 .addFrameIndex(RegSaveFrameIndex)
26241 .addImm(/*Scale=*/1)
26242 .addReg(/*IndexReg=*/0)
26243 .addImm(/*Disp=*/Offset)
26244 .addReg(/*Segment=*/0)
26245 .addReg(MI.getOperand(i).getReg())
26246 .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return EndMBB;
}
26254 // The EFLAGS operand of SelectItr might be missing a kill marker
26255 // because there were multiple uses of EFLAGS, and ISel didn't know
26256 // which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill marker flag.
26259 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26260 MachineBasicBlock* BB,
26261 const TargetRegisterInfo* TRI) {
26262 // Scan forward through BB for a use/def of EFLAGS.
26263 MachineBasicBlock::iterator miI(std::next(SelectItr));
26264 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26265 const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
26268 if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }
  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
26274 if (miI == BB->end()) {
26275 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26276 sEnd = BB->succ_end();
26277 sItr != sEnd; ++sItr) {
26278 MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }
26284 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26285 // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
26290 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26291 // together with other CMOV pseudo-opcodes into a single basic-block with
26292 // conditional jump around it.
26293 static bool isCMOVPseudo(MachineInstr &MI) {
26294 switch (MI.getOpcode()) {
26295 case X86::CMOV_FR32:
26296 case X86::CMOV_FR64:
26297 case X86::CMOV_GR8:
26298 case X86::CMOV_GR16:
26299 case X86::CMOV_GR32:
26300 case X86::CMOV_RFP32:
26301 case X86::CMOV_RFP64:
26302 case X86::CMOV_RFP80:
26303 case X86::CMOV_V2F64:
26304 case X86::CMOV_V2I64:
26305 case X86::CMOV_V4F32:
26306 case X86::CMOV_V4F64:
26307 case X86::CMOV_V4I64:
26308 case X86::CMOV_V16F32:
26309 case X86::CMOV_V8F32:
26310 case X86::CMOV_V8F64:
26311 case X86::CMOV_V8I64:
26312 case X86::CMOV_V8I1:
26313 case X86::CMOV_V16I1:
26314 case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}
26323 // Helper function, which inserts PHI functions into SinkMBB:
26324 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26325 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
26326 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
26327 // the last PHI function inserted.
26328 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26329 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26330 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26331 MachineBasicBlock *SinkMBB) {
26332 MachineFunction *MF = TrueMBB->getParent();
26333 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26334 DebugLoc DL = MIItBegin->getDebugLoc();
26336 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26337 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26339 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26341 // As we are creating the PHIs, we have to be careful if there is more than
26342 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26343 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26344 // That also means that PHI construction must work forward from earlier to
26345 // later, and that the code must maintain a mapping from earlier PHI's
26346 // destination registers, and the registers that went into the PHI.
26347 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26348 MachineInstrBuilder MIB;
26350 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26351 unsigned DestReg = MIIt->getOperand(0).getReg();
26352 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26353 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26355 // If this CMOV we are generating is the opposite condition from
26356 // the jump we generated, then we have to swap the operands for the
26357 // PHI that is going to be generated.
26358 if (MIIt->getOperand(3).getImm() == OppCC)
26359 std::swap(Op1Reg, Op2Reg);
26361 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26362 Op1Reg = RegRewriteTable[Op1Reg].first;
26364 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26365 Op2Reg = RegRewriteTable[Op2Reg].second;
    MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
              .addReg(Op1Reg)
              .addMBB(FalseMBB)
              .addReg(Op2Reg)
              .addMBB(TrueMBB);
26373 // Add this PHI to the rewrite table.
    RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
  }

  return MIB;
}
26380 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
26381 MachineBasicBlock *
26382 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26383 MachineInstr &SecondCascadedCMOV,
26384 MachineBasicBlock *ThisMBB) const {
26385 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26386 DebugLoc DL = FirstCMOV.getDebugLoc();
26388 // We lower cascaded CMOVs such as
26390 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26392 // to two successive branches.
26394 // Without this, we would add a PHI between the two jumps, which ends up
26395 // creating a few copies all around. For instance, for
26397 // (sitofp (zext (fcmp une)))
26399 // we would generate:
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   movaps  %xmm0, %xmm1
  //   jne .LBB5_2
  //   xorps   %xmm1, %xmm1
  // .LBB5_2:
  //   jp .LBB5_4
  //   movaps  %xmm1, %xmm0
  // .LBB5_4:
  //   retq
  // because this custom-inserter would have generated:
  //
  //   A
  //   | \
  //   |  B
  //   | /
  //   C
  //   | \
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // B: empty
  // C: Z = PHI [X, A], [Y, B]
  // D: empty
  // E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate:
  //
  //   A
  //   | \
  //   |  C
  //   | /|
  //   |/ |
  //   |  |
  //   |  D
  //   | /
  //   E
  //
  // A: X = ...; Y = ...
  // D: empty
  // E: PHI [X, A], [X, C], [Y, D]
26446 // Which, in our sitofp/fcmp example, gives us something like:
  //   ucomiss %xmm1, %xmm0
  //   movss  <1.0f>, %xmm0
  //   jne .LBB5_4
  //   xorps   %xmm0, %xmm0
  // .LBB5_4:
  //   retq
  //
26457 // We lower cascaded CMOV into two successive branches to the same block.
26458 // EFLAGS is used by both, so mark it as live in the second.
26459 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26460 MachineFunction *F = ThisMBB->getParent();
26461 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26462 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26463 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26465 MachineFunction::iterator It = ++ThisMBB->getIterator();
26466 F->insert(It, FirstInsertedMBB);
26467 F->insert(It, SecondInsertedMBB);
26468 F->insert(It, SinkMBB);
26470 // For a cascaded CMOV, we lower it to two successive branches to
26471 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26472 // the FirstInsertedMBB.
26473 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26475 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26476 // live into the sink and copy blocks.
26477 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26478 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26479 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26480 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26481 SinkMBB->addLiveIn(X86::EFLAGS);
26484 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26485 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26486 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26488 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26490 // Fallthrough block for ThisMBB.
26491 ThisMBB->addSuccessor(FirstInsertedMBB);
26492 // The true block target of the first branch is always SinkMBB.
26493 ThisMBB->addSuccessor(SinkMBB);
26494 // Fallthrough block for FirstInsertedMBB.
26495 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26496 // The true block for the branch of FirstInsertedMBB.
26497 FirstInsertedMBB->addSuccessor(SinkMBB);
26498 // This is fallthrough.
26499 SecondInsertedMBB->addSuccessor(SinkMBB);
26501 // Create the conditional branch instructions.
26502 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26503 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26504 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26506 X86::CondCode SecondCC =
26507 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26508 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26509 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26512 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26513 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26514 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26515 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26516 MachineInstrBuilder MIB =
26517 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
          .addReg(Op1Reg)
          .addMBB(SecondInsertedMBB)
          .addReg(Op2Reg)
          .addMBB(ThisMBB);
26523 // The second SecondInsertedMBB provides the same incoming value as the
26524 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
26525 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26526 // Copy the PHI result to the register defined by the second CMOV.
26527 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26528 TII->get(TargetOpcode::COPY),
26529 SecondCascadedCMOV.getOperand(0).getReg())
26530 .addReg(FirstCMOV.getOperand(0).getReg());
26532 // Now remove the CMOVs.
26533 FirstCMOV.eraseFromParent();
  SecondCascadedCMOV.eraseFromParent();

  return SinkMBB;
}
26539 MachineBasicBlock *
26540 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26541 MachineBasicBlock *ThisMBB) const {
26542 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26543 DebugLoc DL = MI.getDebugLoc();
26545 // To "insert" a SELECT_CC instruction, we actually have to insert the
26546 // diamond control-flow pattern. The incoming instruction knows the
26547 // destination vreg to set, the condition code register to branch on, the
26548 // true/false values to select between and a branch opcode to use.
  //  ThisMBB:
  //  ...
  //   TrueVal = ...
  //   cmpTY ccX, r1, r2
  //   bCC sinkMBB
  //   fallthrough --> FalseMBB
26557 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26558 // as described above, by inserting a BB, and then making a PHI at the join
26559 // point to select the true and false operands of the CMOV in the PHI.
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
26565 // In this case, there are multiple CMOVs in a row, all which are based on
26566 // the same condition setting (or the exact opposite condition setting).
26567 // In this case we can lower all the CMOVs using a single inserted BB, and
26568 // then make a number of PHIs at the join point to model the CMOVs. The only
26569 // trickiness here, is that in a case like:
26571 // t2 = CMOV cond1 t1, f1
26572 // t3 = CMOV cond1 t2, f2
26574 // when rewriting this into PHIs, we have to perform some renaming on the
26575 // temps since you cannot have a PHI operand refer to a PHI result earlier
26576 // in the same block. The "simple" but wrong lowering would be:
26578 // t2 = PHI t1(BB1), f1(BB2)
26579 // t3 = PHI t2(BB1), f2(BB2)
26581 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26582 // renaming is to note that on the path through BB1, t2 is really just a
26583 // copy of t1, and do that renaming, properly generating:
26585 // t2 = PHI t1(BB1), f1(BB2)
26586 // t3 = PHI t1(BB1), f2(BB2)
  // Case 2:
  // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
26590 // function - EmitLoweredCascadedSelect.
26592 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26593 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26594 MachineInstr *LastCMOV = &MI;
26595 MachineBasicBlock::iterator NextMIIt =
26596 std::next(MachineBasicBlock::iterator(MI));
26598 // Check for case 1, where there are multiple CMOVs with the same condition
26599 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26600 // number of jumps the most.
26602 if (isCMOVPseudo(MI)) {
26603 // See if we have a string of CMOVS with the same condition.
26604 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
26605 (NextMIIt->getOperand(3).getImm() == CC ||
26606 NextMIIt->getOperand(3).getImm() == OppCC)) {
      LastCMOV = &*NextMIIt;
      ++NextMIIt;
    }
  }
26612 // This checks for case 2, but only do this if we didn't already find
26613 // case 1, as indicated by LastCMOV == MI.
26614 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
26615 NextMIIt->getOpcode() == MI.getOpcode() &&
26616 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
26617 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
26618 NextMIIt->getOperand(1).isKill()) {
    return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
  }
26622 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26623 MachineFunction *F = ThisMBB->getParent();
26624 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
26625 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26627 MachineFunction::iterator It = ++ThisMBB->getIterator();
26628 F->insert(It, FalseMBB);
26629 F->insert(It, SinkMBB);
26631 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26632 // live into the sink and copy blocks.
26633 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26634 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
26635 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
26636 FalseMBB->addLiveIn(X86::EFLAGS);
26637 SinkMBB->addLiveIn(X86::EFLAGS);
26640 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26641 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26642 std::next(MachineBasicBlock::iterator(LastCMOV)),
26644 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26646 // Fallthrough block for ThisMBB.
26647 ThisMBB->addSuccessor(FalseMBB);
26648 // The true block target of the first (or only) branch is always a SinkMBB.
26649 ThisMBB->addSuccessor(SinkMBB);
26650 // Fallthrough block for FalseMBB.
26651 FalseMBB->addSuccessor(SinkMBB);
26653 // Create the conditional branch instruction.
26654 unsigned Opc = X86::GetCondBranchFromCond(CC);
26655 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26658 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
26660 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
26661 MachineBasicBlock::iterator MIItEnd =
26662 std::next(MachineBasicBlock::iterator(LastCMOV));
26663 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
26665 // Now remove the CMOV(s).
  ThisMBB->erase(MIItBegin, MIItEnd);

  return SinkMBB;
}
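// Illustrative only: the pseudos handled above originate from IR selects such as
//
//   %r = select i1 %c, float %a, float %b
//
// which are selected to CMOV pseudo-opcodes when no single native instruction can
// implement the select for that value type (for example 8-bit GPRs, x87/XMM
// scalars, or targets lacking the CMOV feature), and the inserter expands them
// into the compare/branch diamond described at the top of this function.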
26671 MachineBasicBlock *
26672 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
26673 MachineBasicBlock *BB) const {
26674 // Combine the following atomic floating-point modification pattern:
26675 // a.store(reg OP a.load(acquire), release)
26676 // Transform them into:
26677 // OPss (%gpr), %xmm
26678 // movss %xmm, (%gpr)
26679 // Or sd equivalent for 64-bit operations.
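  //
  // Illustrative only (assumed source-level pattern; on x86-TSO a plain load/store
  // pair already provides the acquire/release semantics needed here):
  //
  //   std::atomic<float> A;
  //   A.store(X + A.load(std::memory_order_acquire), std::memory_order_release);
  //
  // is the kind of code that reaches this inserter as RELEASE_FADD32mr.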
  unsigned MOp, FOp;
  switch (MI.getOpcode()) {
26682 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
26683 case X86::RELEASE_FADD32mr:
26684 FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
26687 case X86::RELEASE_FADD64mr:
26688 FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
26692 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26693 DebugLoc DL = MI.getDebugLoc();
26694 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
26695 unsigned ValOpIdx = X86::AddrNumOperands;
26696 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
26697 MachineInstrBuilder MIB =
26698 BuildMI(*BB, MI, DL, TII->get(FOp),
              MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
          .addReg(VSrc);
26701 for (int i = 0; i < X86::AddrNumOperands; ++i) {
26702 MachineOperand &Operand = MI.getOperand(i);
26703 // Clear any kill flags on register operands as we'll create a second
26704 // instruction using the same address operands.
26705 if (Operand.isReg())
      Operand.setIsKill(false);
    MIB.add(Operand);
  }
26709 MachineInstr *FOpMI = MIB;
26710 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
26711 for (int i = 0; i < X86::AddrNumOperands; ++i)
26712 MIB.add(MI.getOperand(i));
26713 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
26718 MachineBasicBlock *
26719 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
26720 MachineBasicBlock *BB) const {
26721 MachineFunction *MF = BB->getParent();
26722 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26723 DebugLoc DL = MI.getDebugLoc();
26724 const BasicBlock *LLVM_BB = BB->getBasicBlock();
26726 assert(MF->shouldSplitStack());
26728 const bool Is64Bit = Subtarget.is64Bit();
26729 const bool IsLP64 = Subtarget.isTarget64BitLP64();
26731 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
26732 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
  // BB:
  //  ... [Till the alloca]
  // If stacklet is not large enough, jump to mallocMBB
  //
  // bumpMBB:
  //  Allocate by subtracting from RSP
  //  Jump to continueMBB
  //
  // mallocMBB:
  //  Allocate by call to runtime
  //
  // continueMBB:
  //  ...
  //  [rest of original BB]
26750 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26751 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26752 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26754 MachineRegisterInfo &MRI = MF->getRegInfo();
26755 const TargetRegisterClass *AddrRegClass =
26756 getRegClassFor(getPointerTy(MF->getDataLayout()));
26758 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26759 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26760 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
26761 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
26762 sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26766 MachineFunction::iterator MBBIter = ++BB->getIterator();
26768 MF->insert(MBBIter, bumpMBB);
26769 MF->insert(MBBIter, mallocMBB);
26770 MF->insert(MBBIter, continueMBB);
26772 continueMBB->splice(continueMBB->begin(), BB,
26773 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26774 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26776 // Add code to the main basic block to check if the stack limit has been hit,
26777 // and if so, jump to mallocMBB otherwise to bumpMBB.
26778 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26779 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26780 .addReg(tmpSPVReg).addReg(sizeVReg);
26781 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26782 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26783 .addReg(SPLimitVReg);
26784 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26786 // bumpMBB simply decreases the stack pointer, since we know the current
26787 // stacklet has enough space.
26788 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26789 .addReg(SPLimitVReg);
26790 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26791 .addReg(SPLimitVReg);
26792 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26794 // Calls into a routine in libgcc to allocate more space from the heap.
26795 const uint32_t *RegMask =
26796 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
      .addReg(sizeVReg);
26800 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26801 .addExternalSymbol("__morestack_allocate_stack_space")
26802 .addRegMask(RegMask)
26803 .addReg(X86::RDI, RegState::Implicit)
26804 .addReg(X86::RAX, RegState::ImplicitDefine);
26805 } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
      .addReg(sizeVReg);
26808 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26809 .addExternalSymbol("__morestack_allocate_stack_space")
26810 .addRegMask(RegMask)
26811 .addReg(X86::EDI, RegState::Implicit)
26812 .addReg(X86::EAX, RegState::ImplicitDefine);
  } else {
    BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
      .addImm(16);
26816 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26817 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26818 .addExternalSymbol("__morestack_allocate_stack_space")
26819 .addRegMask(RegMask)
26820 .addReg(X86::EAX, RegState::ImplicitDefine);
  }

  if (!Is64Bit)
    BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
      .addImm(16);
26827 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26828 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26829 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26831 // Set up the CFG correctly.
26832 BB->addSuccessor(bumpMBB);
26833 BB->addSuccessor(mallocMBB);
26834 mallocMBB->addSuccessor(continueMBB);
26835 bumpMBB->addSuccessor(continueMBB);
26837 // Take care of the PHI nodes.
26838 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26839 MI.getOperand(0).getReg())
      .addReg(mallocPtrVReg)
      .addMBB(mallocMBB)
      .addReg(bumpSPPtrVReg)
      .addMBB(bumpMBB);
26845 // Delete the original pseudo instruction.
26846 MI.eraseFromParent();
  return continueMBB;
}
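// Illustrative sketch of the stacklet check emitted above (AT&T syntax; register
// choices are hypothetical and the allocation size is assumed to be in %rdi):
//
//   mov  %rsp, %r11            // tmpSPVReg
//   sub  %rdi, %r11            // SPLimitVReg = SP - size
//   cmp  %r11, %fs:0x70        // TlsOffset 0x70 on LP64 holds the stack limit
//   jg   .Lmalloc              // limit above the new SP: grow via
//                              // __morestack_allocate_stack_space
//   mov  %r11, %rsp            // otherwise just bump the stack pointer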
26852 MachineBasicBlock *
26853 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26854 MachineBasicBlock *BB) const {
26855 MachineFunction *MF = BB->getParent();
26856 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26857 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26858 DebugLoc DL = MI.getDebugLoc();
26860 assert(!isAsynchronousEHPersonality(
26861 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
26862 "SEH does not use catchret!");
26864 // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;
26868 // C++ EH creates a new target block to hold the restore code, and wires up
26869 // the new block to the return destination with a normal JMP_4.
26870 MachineBasicBlock *RestoreMBB =
26871 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26872 assert(BB->succ_size() == 1);
26873 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26874 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26875 BB->addSuccessor(RestoreMBB);
26876 MI.getOperand(0).setMBB(RestoreMBB);
26878 auto RestoreMBBI = RestoreMBB->begin();
26879 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
  BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
  return BB;
}
26884 MachineBasicBlock *
26885 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26886 MachineBasicBlock *BB) const {
26887 MachineFunction *MF = BB->getParent();
26888 const Constant *PerFn = MF->getFunction().getPersonalityFn();
26889 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26890 // Only 32-bit SEH requires special handling for catchpad.
26891 if (IsSEH && Subtarget.is32Bit()) {
26892 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26893 DebugLoc DL = MI.getDebugLoc();
    BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
  }
  MI.eraseFromParent();
  return BB;
}
26900 MachineBasicBlock *
26901 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26902 MachineBasicBlock *BB) const {
26903 // So, here we replace TLSADDR with the sequence:
26904 // adjust_stackdown -> TLSADDR -> adjust_stackup.
26905 // We need this because TLSADDR is lowered into calls
26906 // inside MC, therefore without the two markers shrink-wrapping
// may push the prologue/epilogue past them.
26908 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26909 DebugLoc DL = MI.getDebugLoc();
26910 MachineFunction &MF = *BB->getParent();
26912 // Emit CALLSEQ_START right before the instruction.
26913 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26914 MachineInstrBuilder CallseqStart =
26915 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26916 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26918 // Emit CALLSEQ_END right after the instruction.
26919 // We don't call erase from parent because we want to keep the
26920 // original instruction around.
26921 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26922 MachineInstrBuilder CallseqEnd =
26923 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
  BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
  return BB;
}
26929 MachineBasicBlock *
26930 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26931 MachineBasicBlock *BB) const {
26932 // This is pretty easy. We're taking the value that we received from
26933 // our load from the relocation, sticking it in either RDI (x86-64)
26934 // or EAX and doing an indirect call. The return value will then
26935 // be in the normal return register.
26936 MachineFunction *F = BB->getParent();
26937 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26938 DebugLoc DL = MI.getDebugLoc();
26940 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26941 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26943 // Get a register mask for the lowered call.
26944 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26945 // proper register mask.
26946 const uint32_t *RegMask =
26947 Subtarget.is64Bit() ?
26948 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26949 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26950 if (Subtarget.is64Bit()) {
26951 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
            .addReg(X86::RIP)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
26959 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26960 addDirectMem(MIB, X86::RDI);
26961 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26962 } else if (!isPositionIndependent()) {
26963 MachineInstrBuilder MIB =
        BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(0)
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
26971 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26972 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  } else {
26975 MachineInstrBuilder MIB =
26976 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
            .addReg(TII->getGlobalBaseReg(F))
            .addImm(1)
            .addReg(0)
            .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
                              MI.getOperand(3).getTargetFlags())
            .addReg(0);
26983 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26984 addDirectMem(MIB, X86::EAX);
    MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return BB;
}
26992 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
  switch (RPOpc) {
  case X86::RETPOLINE_CALL32:
26995 return X86::CALLpcrel32;
26996 case X86::RETPOLINE_CALL64:
26997 return X86::CALL64pcrel32;
26998 case X86::RETPOLINE_TCRETURN32:
26999 return X86::TCRETURNdi;
27000 case X86::RETPOLINE_TCRETURN64:
27001 return X86::TCRETURNdi64;
  }
  llvm_unreachable("not retpoline opcode");
}
27006 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
                                      unsigned Reg) {
  if (Subtarget.useRetpolineExternalThunk()) {
27009 // When using an external thunk for retpolines, we pick names that match the
27010 // names GCC happens to use as well. This helps simplify the implementation
27011 // of the thunks for kernels where they have no easy ability to create
27012 // aliases and are doing non-trivial configuration of the thunk's body. For
27013 // example, the Linux kernel will do boot-time hot patching of the thunk
27014 // bodies and cannot easily export aliases of these to loaded modules.
27016 // Note that at any point in the future, we may need to change the semantics
27017 // of how we implement retpolines and at that time will likely change the
27018 // name of the called thunk. Essentially, there is no hard guarantee that
27019 // LLVM will generate calls to specific thunks, we merely make a best-effort
27020 // attempt to help out kernels and other systems where duplicating the
27021 // thunks is costly.
    switch (Reg) {
    case 0:
      assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
      return "__x86_indirect_thunk";
    case X86::EAX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_eax";
    case X86::ECX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_ecx";
    case X86::EDX:
      assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
      return "__x86_indirect_thunk_edx";
    case X86::R11:
      assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
      return "__x86_indirect_thunk_r11";
    }
    llvm_unreachable("unexpected reg for retpoline");
  }
27042 // When targeting an internal COMDAT thunk use an LLVM-specific name.
  switch (Reg) {
  case 0:
    assert(!Subtarget.is64Bit() && "R11 should always be available on x64");
    return "__llvm_retpoline_push";
  case X86::EAX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_eax";
  case X86::ECX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_ecx";
  case X86::EDX:
    assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
    return "__llvm_retpoline_edx";
  case X86::R11:
    assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
    return "__llvm_retpoline_r11";
  }
  llvm_unreachable("unexpected reg for retpoline");
}
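// For reference only (the thunk bodies are emitted elsewhere, e.g. by the
// retpoline-thunk pass or an external runtime; this is the conventional shape,
// not a guarantee of this file): an r11-based retpoline thunk typically reads
//
//   __llvm_retpoline_r11:
//     callq .Lset_target
//   .Lcapture_spec:
//     pause
//     lfence
//     jmp   .Lcapture_spec
//   .Lset_target:
//     movq  %r11, (%rsp)
//     retq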
27063 MachineBasicBlock *
27064 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27065 MachineBasicBlock *BB) const {
27066 // Copy the virtual register into the R11 physical register and
27067 // call the retpoline thunk.
27068 DebugLoc DL = MI.getDebugLoc();
27069 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27070 unsigned CalleeVReg = MI.getOperand(0).getReg();
27071 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27073 // Find an available scratch register to hold the callee. On 64-bit, we can
27074 // just use R11, but we scan for uses anyway to ensure we don't generate
27075 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27076 // already a register use operand to the call to hold the callee. If none
27077 // are available, push the callee instead. This is less efficient, but is
27078 // necessary for functions using 3 regparms. Such function calls are
27079 // (currently) not eligible for tail call optimization, because there is no
27080 // scratch register available to hold the address of the callee.
27081 SmallVector<unsigned, 3> AvailableRegs;
27082 if (Subtarget.is64Bit())
27083 AvailableRegs.push_back(X86::R11);
  else
    AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX});
27087 // Zero out any registers that are already used.
27088 for (const auto &MO : MI.operands()) {
27089 if (MO.isReg() && MO.isUse())
27090 for (unsigned &Reg : AvailableRegs)
        if (Reg == MO.getReg())
          Reg = 0;
  }
27095 // Choose the first remaining non-zero available register.
27096 unsigned AvailableReg = 0;
27097 for (unsigned MaybeReg : AvailableRegs) {
    if (MaybeReg) {
      AvailableReg = MaybeReg;
      break;
    }
  }
27104 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27106 if (AvailableReg == 0) {
27107 // No register available. Use PUSH. This must not be a tailcall, and this
27108 // must not be x64.
27109 if (Subtarget.is64Bit())
27110 report_fatal_error(
27111 "Cannot make an indirect call on x86-64 using both retpoline and a "
27112 "calling convention that preservers r11");
27113 if (Opc != X86::CALLpcrel32)
27114 report_fatal_error("Cannot make an indirect tail call on x86 using "
27115 "retpoline without a preserved register");
27116 BuildMI(*BB, MI, DL, TII->get(X86::PUSH32r)).addReg(CalleeVReg);
27117 MI.getOperand(0).ChangeToES(Symbol);
    MI.setDesc(TII->get(Opc));
  } else {
27120 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27121 .addReg(CalleeVReg);
27122 MI.getOperand(0).ChangeToES(Symbol);
27123 MI.setDesc(TII->get(Opc));
27124 MachineInstrBuilder(*BB->getParent(), &MI)
        .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
  }
  return BB;
}
27130 MachineBasicBlock *
27131 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27132 MachineBasicBlock *MBB) const {
27133 DebugLoc DL = MI.getDebugLoc();
27134 MachineFunction *MF = MBB->getParent();
27135 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27136 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27137 MachineRegisterInfo &MRI = MF->getRegInfo();
27139 const BasicBlock *BB = MBB->getBasicBlock();
27140 MachineFunction::iterator I = ++MBB->getIterator();
27142 // Memory Reference
27143 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27144 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
  unsigned DstReg;
  unsigned MemOpndSlot = 0;
27149 unsigned CurOp = 0;
27151 DstReg = MI.getOperand(CurOp++).getReg();
27152 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27153 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27155 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27156 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27158 MemOpndSlot = CurOp;
27160 MVT PVT = getPointerTy(MF->getDataLayout());
27161 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27162 "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:
  //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //  SjLjSetup restoreMBB
  //
  // mainMBB:
  //  v_main = 0
  //
  // sinkMBB:
  //  v = phi(main, restore)
  //
  // restoreMBB:
  //  if base pointer being used, load it from frame
  //  v_restore = 1
27180 MachineBasicBlock *thisMBB = MBB;
27181 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27182 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27183 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27184 MF->insert(I, mainMBB);
27185 MF->insert(I, sinkMBB);
27186 MF->push_back(restoreMBB);
27187 restoreMBB->setHasAddressTaken();
27189 MachineInstrBuilder MIB;
27191 // Transfer the remainder of BB and its successor edges to sinkMBB.
27192 sinkMBB->splice(sinkMBB->begin(), MBB,
27193 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27194 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27197 unsigned PtrStoreOpc = 0;
27198 unsigned LabelReg = 0;
27199 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27200 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27201 !isPositionIndependent();
27203 // Prepare IP either in reg or imm.
27204 if (!UseImmLabel) {
27205 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27206 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27207 LabelReg = MRI.createVirtualRegister(PtrRC);
27208 if (Subtarget.is64Bit()) {
      MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
              .addReg(X86::RIP)
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB)
              .addReg(0);
    } else {
      const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27217 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
              .addReg(XII->getGlobalBaseReg(MF))
              .addImm(0)
              .addReg(0)
              .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
              .addReg(0);
    }
  } else
    PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27227 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27228 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27229 if (i == X86::AddrDisp)
27230 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
    else
      MIB.add(MI.getOperand(MemOpndSlot + i));
  }
  if (!UseImmLabel)
    MIB.addReg(LabelReg);
  else
    MIB.addMBB(restoreMBB);
27238 MIB.setMemRefs(MMOBegin, MMOEnd);
27240 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27241 .addMBB(restoreMBB);
27243 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27244 MIB.addRegMask(RegInfo->getNoPreservedMask());
27245 thisMBB->addSuccessor(mainMBB);
27246 thisMBB->addSuccessor(restoreMBB);
27250 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27251 mainMBB->addSuccessor(sinkMBB);
27254 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27255 TII->get(X86::PHI), DstReg)
27256 .addReg(mainDstReg).addMBB(mainMBB)
27257 .addReg(restoreDstReg).addMBB(restoreMBB);
27260 if (RegInfo->hasBasePointer(*MF)) {
27261 const bool Uses64BitFramePtr =
27262 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27263 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27264 X86FI->setRestoreBasePointer(MF);
27265 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27266 unsigned BasePtr = RegInfo->getBaseRegister();
27267 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27268 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27269 FramePtr, true, X86FI->getRestoreBasePointerOffset())
        .setMIFlag(MachineInstr::FrameSetup);
  }
27272 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27273 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27274 restoreMBB->addSuccessor(sinkMBB);
  MI.eraseFromParent();
  return sinkMBB;
}
27280 MachineBasicBlock *
27281 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
27282 MachineBasicBlock *MBB) const {
27283 DebugLoc DL = MI.getDebugLoc();
27284 MachineFunction *MF = MBB->getParent();
27285 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27286 MachineRegisterInfo &MRI = MF->getRegInfo();
27288 // Memory Reference
27289 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27290 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27292 MVT PVT = getPointerTy(MF->getDataLayout());
27293 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27294 "Invalid Pointer Size!");
27296 const TargetRegisterClass *RC =
27297 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27298 unsigned Tmp = MRI.createVirtualRegister(RC);
27299 // Since FP is only updated here but NOT referenced, it's treated as GPR.
27300 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27301 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
27302 unsigned SP = RegInfo->getStackRegister();
27304 MachineInstrBuilder MIB;
27306 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27307 const int64_t SPOffset = 2 * PVT.getStoreSize();
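  // Buffer layout assumed by these offsets (mirroring what the setjmp side above
  // stores): slot 0 holds the frame pointer, slot 1 (LabelOffset) the resume
  // address, and slot 2 (SPOffset) the stack pointer, each of pointer width.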
27309 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
27310 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
27313 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
27314 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
27315 MIB.add(MI.getOperand(i));
27316 MIB.setMemRefs(MMOBegin, MMOEnd);
27318 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
27319 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27320 if (i == X86::AddrDisp)
27321 MIB.addDisp(MI.getOperand(i), LabelOffset);
    else
      MIB.add(MI.getOperand(i));
  }
27325 MIB.setMemRefs(MMOBegin, MMOEnd);
27327 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
27328 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27329 if (i == X86::AddrDisp)
27330 MIB.addDisp(MI.getOperand(i), SPOffset);
    else
      MIB.add(MI.getOperand(i));
  }
27334 MIB.setMemRefs(MMOBegin, MMOEnd);
27336 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
  MI.eraseFromParent();
  return MBB;
}
27342 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
27343 MachineBasicBlock *MBB,
27344 MachineBasicBlock *DispatchBB,
                                               int FI) const {
  DebugLoc DL = MI.getDebugLoc();
27347 MachineFunction *MF = MBB->getParent();
27348 MachineRegisterInfo *MRI = &MF->getRegInfo();
27349 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27351 MVT PVT = getPointerTy(MF->getDataLayout());
  assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");

  unsigned Op = 0;
  unsigned VR = 0;

  bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
                     !isPositionIndependent();
  if (UseImmLabel) {
    Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
  } else {
27363 const TargetRegisterClass *TRC =
27364 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27365 VR = MRI->createVirtualRegister(TRC);
27366 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27368 if (Subtarget.is64Bit())
27369 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27373 .addMBB(DispatchBB)
27376 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27377 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27380 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
27384 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27385 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27387 MIB.addMBB(DispatchBB);
27392 MachineBasicBlock *
27393 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27394 MachineBasicBlock *BB) const {
27395 DebugLoc DL = MI.getDebugLoc();
27396 MachineFunction *MF = BB->getParent();
27397 MachineFrameInfo &MFI = MF->getFrameInfo();
27398 MachineRegisterInfo *MRI = &MF->getRegInfo();
27399 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27400 int FI = MFI.getFunctionContextIndex();
27402 // Get a mapping of the call site numbers to all of the landing pads they're
27403 // associated with.
27404 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27405 unsigned MaxCSNum = 0;
27406 for (auto &MBB : *MF) {
27407 if (!MBB.isEHPad())
27410 MCSymbol *Sym = nullptr;
27411 for (const auto &MI : MBB) {
27412 if (MI.isDebugValue())
27415 assert(MI.isEHLabel() && "expected EH_LABEL");
27416 Sym = MI.getOperand(0).getMCSymbol();
27420 if (!MF->hasCallSiteLandingPad(Sym))
27423 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27424 CallSiteNumToLPad[CSI].push_back(&MBB);
27425 MaxCSNum = std::max(MaxCSNum, CSI);
27429 // Get an ordered list of the machine basic blocks for the jump table.
27430 std::vector<MachineBasicBlock *> LPadList;
27431 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27432 LPadList.reserve(CallSiteNumToLPad.size());
27434 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27435 for (auto &LP : CallSiteNumToLPad[CSI]) {
27436 LPadList.push_back(LP);
27437 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27441 assert(!LPadList.empty() &&
27442 "No landing pad destinations for the dispatch jump table!");
27444 // Create the MBBs for the dispatch code.
27446 // Shove the dispatch's address into the return slot in the function context.
27447 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27448 DispatchBB->setIsEHPad(true);
27450 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27451 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27452 DispatchBB->addSuccessor(TrapBB);
27454 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27455 DispatchBB->addSuccessor(DispContBB);
27458 MF->push_back(DispatchBB);
27459 MF->push_back(DispContBB);
27460 MF->push_back(TrapBB);
27462 // Insert code into the entry block that creates and registers the function context.
27464 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27466 // Create the jump table and associated information
27467 unsigned JTE = getJumpTableEncoding();
27468 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27469 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27471 const X86RegisterInfo &RI = TII->getRegisterInfo();
27472 // Add a register mask with no preserved registers. This results in all
27473 // registers being marked as clobbered.
27474 if (RI.hasBasePointer(*MF)) {
27475 const bool FPIs64Bit =
27476 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27477 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27478 MFI->setRestoreBasePointer(MF);
27480 unsigned FP = RI.getFrameRegister(*MF);
27481 unsigned BP = RI.getBaseRegister();
27482 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27483 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27484 MFI->getRestoreBasePointerOffset())
27485 .addRegMask(RI.getNoPreservedMask());
27487 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27488 .addRegMask(RI.getNoPreservedMask());
27491 // IReg is used as an index in a memory operand and therefore can't be SP
27492 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
27493 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27494 Subtarget.is64Bit() ? 8 : 4);
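// The loaded value is the recorded call-site index; indices at or above the
// number of landing pads are out of range and branch to the trap block.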
27495 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27497 .addImm(LPadList.size());
27498 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
27500 if (Subtarget.is64Bit()) {
27501 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27502 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27504 // leaq .LJTI0_0(%rip), BReg
27505 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27509 .addJumpTableIndex(MJTI)
27511 // movzx IReg64, IReg
27512 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27515 .addImm(X86::sub_32bit);
27518 case MachineJumpTableInfo::EK_BlockAddress:
27519 // jmpq *(BReg,IReg64,8)
27520 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
27527 case MachineJumpTableInfo::EK_LabelDifference32: {
27528 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27529 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27530 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27532 // movl (BReg,IReg64,4), OReg
27533 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27539 // movsx OReg64, OReg
27540 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27541 // addq BReg, OReg64, TReg
27542 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27546 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27550 llvm_unreachable("Unexpected jump table encoding");
27553 // jmpl *.LJTI0_0(,IReg,4)
27554 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27558 .addJumpTableIndex(MJTI)
27562 // Add the jump table entries as successors to the MBB.
27563 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27564 for (auto &LP : LPadList)
27565 if (SeenMBBs.insert(LP).second)
27566 DispContBB->addSuccessor(LP);
27568 // N.B. the order the invoke BBs are processed in doesn't matter here.
27569 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27570 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27571 for (MachineBasicBlock *MBB : InvokeBBs) {
27572 // Remove the landing pad successor from the invoke block and replace it
27573 // with the new dispatch block.
27574 // Keep a copy of Successors since it's modified inside the loop.
27575 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27577 // FIXME: Avoid quadratic complexity.
27578 for (auto MBBS : Successors) {
27579 if (MBBS->isEHPad()) {
27580 MBB->removeSuccessor(MBBS);
27581 MBBLPads.push_back(MBBS);
27585 MBB->addSuccessor(DispatchBB);
27587 // Find the invoke call and mark all of the callee-saved registers as
27588 // 'implicit defined' so that they're spilled. This prevents code from
27589 // moving instructions to before the EH block, where they will never be executed.
27591 for (auto &II : reverse(*MBB)) {
27595 DenseMap<unsigned, bool> DefRegs;
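// Collect the registers this instruction already defines so the callee-saved
// registers added below are not duplicated.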
27596 for (auto &MOp : II.operands())
27598 DefRegs[MOp.getReg()] = true;
27600 MachineInstrBuilder MIB(*MF, &II);
27601 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27602 unsigned Reg = SavedRegs[RI];
27604 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27611 // Mark all former landing pads as non-landing pads. The dispatch is the only
27612 // landing pad now.
27613 for (auto &LP : MBBLPads)
27614 LP->setIsEHPad(false);
27616 // The instruction is gone now.
27617 MI.eraseFromParent();
27621 MachineBasicBlock *
27622 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27623 MachineBasicBlock *BB) const {
27624 MachineFunction *MF = BB->getParent();
27625 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27626 DebugLoc DL = MI.getDebugLoc();
27628 switch (MI.getOpcode()) {
27629 default: llvm_unreachable("Unexpected instr type to insert");
27630 case X86::TLS_addr32:
27631 case X86::TLS_addr64:
27632 case X86::TLS_base_addr32:
27633 case X86::TLS_base_addr64:
27634 return EmitLoweredTLSAddr(MI, BB);
27635 case X86::RETPOLINE_CALL32:
27636 case X86::RETPOLINE_CALL64:
27637 case X86::RETPOLINE_TCRETURN32:
27638 case X86::RETPOLINE_TCRETURN64:
27639 return EmitLoweredRetpoline(MI, BB);
27640 case X86::CATCHRET:
27641 return EmitLoweredCatchRet(MI, BB);
27642 case X86::CATCHPAD:
27643 return EmitLoweredCatchPad(MI, BB);
27644 case X86::SEG_ALLOCA_32:
27645 case X86::SEG_ALLOCA_64:
27646 return EmitLoweredSegAlloca(MI, BB);
27647 case X86::TLSCall_32:
27648 case X86::TLSCall_64:
27649 return EmitLoweredTLSCall(MI, BB);
27650 case X86::CMOV_FR32:
27651 case X86::CMOV_FR64:
27652 case X86::CMOV_FR128:
27653 case X86::CMOV_GR8:
27654 case X86::CMOV_GR16:
27655 case X86::CMOV_GR32:
27656 case X86::CMOV_RFP32:
27657 case X86::CMOV_RFP64:
27658 case X86::CMOV_RFP80:
27659 case X86::CMOV_V2F64:
27660 case X86::CMOV_V2I64:
27661 case X86::CMOV_V4F32:
27662 case X86::CMOV_V4F64:
27663 case X86::CMOV_V4I64:
27664 case X86::CMOV_V16F32:
27665 case X86::CMOV_V8F32:
27666 case X86::CMOV_V8F64:
27667 case X86::CMOV_V8I64:
27668 case X86::CMOV_V8I1:
27669 case X86::CMOV_V16I1:
27670 case X86::CMOV_V32I1:
27671 case X86::CMOV_V64I1:
27672 return EmitLoweredSelect(MI, BB);
27674 case X86::RDFLAGS32:
27675 case X86::RDFLAGS64: {
27677 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27678 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27679 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27680 // Permit reads of the FLAGS register without it being defined.
27681 // This intrinsic exists to read external processor state in flags, such as
27682 // the trap flag, interrupt flag, and direction flag, none of which are
27683 // modeled by the backend.
27684 Push->getOperand(2).setIsUndef();
27685 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
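// The resulting expansion pushes the flags register and pops it into the
// destination, e.g. for RDFLAGS64 (illustrative): pushfq ; popq <dst>.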
27687 MI.eraseFromParent(); // The pseudo is gone now.
27691 case X86::WRFLAGS32:
27692 case X86::WRFLAGS64: {
27694 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
27696 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27697 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27698 BuildMI(*BB, MI, DL, TII->get(PopF));
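// The expansion mirrors RDFLAGS: push the source register, then pop it into
// the flags register, e.g. for WRFLAGS64 (illustrative): pushq <src> ; popfq.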
27700 MI.eraseFromParent(); // The pseudo is gone now.
27704 case X86::RELEASE_FADD32mr:
27705 case X86::RELEASE_FADD64mr:
27706 return EmitLoweredAtomicFP(MI, BB);
27708 case X86::FP32_TO_INT16_IN_MEM:
27709 case X86::FP32_TO_INT32_IN_MEM:
27710 case X86::FP32_TO_INT64_IN_MEM:
27711 case X86::FP64_TO_INT16_IN_MEM:
27712 case X86::FP64_TO_INT32_IN_MEM:
27713 case X86::FP64_TO_INT64_IN_MEM:
27714 case X86::FP80_TO_INT16_IN_MEM:
27715 case X86::FP80_TO_INT32_IN_MEM:
27716 case X86::FP80_TO_INT64_IN_MEM: {
27717 // Change the floating point control register to use "round towards zero"
27718 // mode when truncating to an integer value.
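// Illustrative expansion (stack slot and register names are placeholders):
//   fnstcw <slot>         ; save the current control word
//   movw   <slot>, OldCW  ; keep a copy in a register
//   movw   $<rtz>, <slot> ; write a round-toward-zero control word
//   fldcw  <slot>         ; activate it
//   movw   OldCW, <slot>  ; restore the memory image for the final reload
//   fistp  <dst>          ; truncating store of the FP value
//   fldcw  <slot>         ; reload the original control word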
27719 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27720 addFrameReference(BuildMI(*BB, MI, DL,
27721 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27723 // Load the old value of the control word...
27725 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
27726 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
27729 // Set the high part to be round to zero...
27730 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
27733 // Reload the modified control word now...
27734 addFrameReference(BuildMI(*BB, MI, DL,
27735 TII->get(X86::FLDCW16m)), CWFrameIdx);
27737 // Restore the memory image of the control word to its original value
27738 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
27741 // Get the X86 opcode to use.
27743 switch (MI.getOpcode()) {
27744 default: llvm_unreachable("illegal opcode!");
27745 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27746 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27747 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27748 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27749 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27750 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27751 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27752 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
27753 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
27756 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27757 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27758 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27760 // Reload the original control word now.
27761 addFrameReference(BuildMI(*BB, MI, DL,
27762 TII->get(X86::FLDCW16m)), CWFrameIdx);
27764 MI.eraseFromParent(); // The pseudo instruction is gone now.
27767 // String/text processing lowering.
27768 case X86::PCMPISTRM128REG:
27769 case X86::VPCMPISTRM128REG:
27770 case X86::PCMPISTRM128MEM:
27771 case X86::VPCMPISTRM128MEM:
27772 case X86::PCMPESTRM128REG:
27773 case X86::VPCMPESTRM128REG:
27774 case X86::PCMPESTRM128MEM:
27775 case X86::VPCMPESTRM128MEM:
27776 assert(Subtarget.hasSSE42() &&
27777 "Target must have SSE4.2 or AVX features enabled");
27778 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27780 // String/text processing lowering.
27781 case X86::PCMPISTRIREG:
27782 case X86::VPCMPISTRIREG:
27783 case X86::PCMPISTRIMEM:
27784 case X86::VPCMPISTRIMEM:
27785 case X86::PCMPESTRIREG:
27786 case X86::VPCMPESTRIREG:
27787 case X86::PCMPESTRIMEM:
27788 case X86::VPCMPESTRIMEM:
27789 assert(Subtarget.hasSSE42() &&
27790 "Target must have SSE4.2 or AVX features enabled");
27791 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
27793 // Thread synchronization.
27795 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
27796 case X86::MONITORX:
27797 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
27801 return emitClzero(&MI, BB, Subtarget);
27805 return emitWRPKRU(MI, BB, Subtarget);
27807 return emitRDPKRU(MI, BB, Subtarget);
27810 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27812 case X86::VASTART_SAVE_XMM_REGS:
27813 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27815 case X86::VAARG_64:
27816 return EmitVAARG64WithCustomInserter(MI, BB);
27818 case X86::EH_SjLj_SetJmp32:
27819 case X86::EH_SjLj_SetJmp64:
27820 return emitEHSjLjSetJmp(MI, BB);
27822 case X86::EH_SjLj_LongJmp32:
27823 case X86::EH_SjLj_LongJmp64:
27824 return emitEHSjLjLongJmp(MI, BB);
27826 case X86::Int_eh_sjlj_setup_dispatch:
27827 return EmitSjLjDispatchBlock(MI, BB);
27829 case TargetOpcode::STATEPOINT:
27830 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27831 // this point in the process. We diverge later.
27832 return emitPatchPoint(MI, BB);
27834 case TargetOpcode::STACKMAP:
27835 case TargetOpcode::PATCHPOINT:
27836 return emitPatchPoint(MI, BB);
27838 case TargetOpcode::PATCHABLE_EVENT_CALL:
27839 return emitXRayCustomEvent(MI, BB);
27841 case X86::LCMPXCHG8B: {
27842 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
27843 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
27844 // requires a memory operand. If the target is i686 and the current function
27845 // needs a base pointer - which is ESI on i686 - the register allocator would
27846 // not be able to allocate registers for an address of the form
27847 // X(%reg, %reg, Y): there would never be enough unreserved registers during
27848 // regalloc (without the base pointer requirement, the only option would be
27849 // X(%edi, %esi, Y)).
27850 // We give the register allocator a hand by precomputing the address in a new
27851 // vreg using LEA.
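// Illustrative transformation (operands hypothetical): a memory operand such as
//   cmpxchg8b 8(%esi,%ecx,4)
// is rewritten as
//   leal 8(%esi,%ecx,4), <vreg>
//   cmpxchg8b (<vreg>)
// so the CMPXCHG8B memory operand no longer needs an index register.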
27853 // If this is not i686 or there is no base pointer, there is nothing to do here.
27854 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
27857 // Even though this code does not necessarily need the base pointer to
27858 // be ESI, we check for that. The reason: if this assert fails, something
27859 // has changed in the compiler's base pointer handling, and it most
27860 // probably has to be addressed here as well.
27861 assert(TRI->getBaseRegister() == X86::ESI &&
27862 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27863 "base pointer in mind");
27865 MachineRegisterInfo &MRI = MF->getRegInfo();
27866 MVT SPTy = getPointerTy(MF->getDataLayout());
27867 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27868 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27870 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27871 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27872 // does not use an index register.
27873 if (AM.IndexReg == X86::NoRegister)
27876 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27877 // four operand definitions that are E[ABCD] registers. We skip them and
27878 // then insert the LEA.
27879 MachineBasicBlock::iterator MBBI(MI);
27880 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27881 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
27884 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
27886 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
27890 case X86::LCMPXCHG16B:
27892 case X86::LCMPXCHG8B_SAVE_EBX:
27893 case X86::LCMPXCHG16B_SAVE_RBX: {
27895 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
27896 if (!BB->isLiveIn(BasePtr))
27897 BB->addLiveIn(BasePtr);
27903 //===----------------------------------------------------------------------===//
27904 // X86 Optimization Hooks
27905 //===----------------------------------------------------------------------===//
27908 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
27909 const APInt &Demanded,
27910 TargetLoweringOpt &TLO) const {
27911 // Only optimize Ands to prevent shrinking a constant that could be
27912 // matched by movzx.
27913 if (Op.getOpcode() != ISD::AND)
27916 EVT VT = Op.getValueType();
27922 unsigned Size = VT.getSizeInBits();
27924 // Make sure the RHS really is a constant.
27925 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
27929 const APInt &Mask = C->getAPIntValue();
27931 // Clear all non-demanded bits initially.
27932 APInt ShrunkMask = Mask & Demanded;
27934 // Find the width of the shrunk mask.
27935 unsigned Width = ShrunkMask.getActiveBits();
27937 // If the mask is all 0s there's nothing to do here.
27941 // Find the next power of 2 width, rounding up to a byte.
27942 Width = PowerOf2Ceil(std::max(Width, 8U));
27943 // Clamp the width to the type size to handle illegal types.
27944 Width = std::min(Width, Size);
27946 // Calculate a possible zero extend mask for this constant.
27947 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
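// Illustrative example: for a 32-bit 'and x, 0xFFFF00FF' where only the low 8
// bits are demanded, ShrunkMask is 0xFF, Width rounds to 8 and ZeroExtendMask
// is 0x000000FF, so the AND constant is replaced with 0xFF (a constant that
// movzx can match).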
27949 // If we aren't changing the mask, just return true to keep it and prevent
27950 // the caller from optimizing.
27951 if (ZeroExtendMask == Mask)
27954 // Make sure the bits in the ZeroExtendMask are also set in the original mask.
27955 // TODO: We should be able to set bits that aren't demanded too.
27956 if (!ZeroExtendMask.isSubsetOf(Mask))
27959 // Replace the constant with the zero extend mask.
27961 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
27962 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
27963 return TLO.CombineTo(Op, NewOp);
27966 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
27968 const APInt &DemandedElts,
27969 const SelectionDAG &DAG,
27970 unsigned Depth) const {
27971 unsigned BitWidth = Known.getBitWidth();
27972 unsigned Opc = Op.getOpcode();
27973 EVT VT = Op.getValueType();
27974 assert((Opc >= ISD::BUILTIN_OP_END ||
27975 Opc == ISD::INTRINSIC_WO_CHAIN ||
27976 Opc == ISD::INTRINSIC_W_CHAIN ||
27977 Opc == ISD::INTRINSIC_VOID) &&
27978 "Should use MaskedValueIsZero if you don't know whether Op"
27979 " is a target node!");
27984 case X86ISD::SETCC:
27985 Known.Zero.setBitsFrom(1);
27987 case X86ISD::MOVMSK: {
27988 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
27989 Known.Zero.setBitsFrom(NumLoBits);
27992 case X86ISD::PEXTRB:
27993 case X86ISD::PEXTRW: {
27994 SDValue Src = Op.getOperand(0);
27995 EVT SrcVT = Src.getValueType();
27996 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
27997 Op.getConstantOperandVal(1));
27998 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
27999 Known = Known.zextOrTrunc(BitWidth);
28000 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28003 case X86ISD::VSHLI:
28004 case X86ISD::VSRLI: {
28005 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28006 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28007 Known.setAllZero();
28011 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28012 unsigned ShAmt = ShiftImm->getZExtValue();
28013 if (Opc == X86ISD::VSHLI) {
28014 Known.Zero <<= ShAmt;
28015 Known.One <<= ShAmt;
28016 // Low bits are known zero.
28017 Known.Zero.setLowBits(ShAmt);
28019 Known.Zero.lshrInPlace(ShAmt);
28020 Known.One.lshrInPlace(ShAmt);
28021 // High bits are known zero.
28022 Known.Zero.setHighBits(ShAmt);
28027 case X86ISD::VZEXT: {
28028 // TODO: Add DemandedElts support.
28029 SDValue N0 = Op.getOperand(0);
28030 unsigned NumElts = VT.getVectorNumElements();
28032 EVT SrcVT = N0.getValueType();
28033 unsigned InNumElts = SrcVT.getVectorNumElements();
28034 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28035 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28037 Known = KnownBits(InBitWidth);
28038 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28039 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28040 Known = Known.zext(BitWidth);
28041 Known.Zero.setBitsFrom(InBitWidth);
28044 case X86ISD::CMOV: {
28045 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28046 // If we don't know any bits, early out.
28047 if (Known.isUnknown())
28050 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28052 // Only known if known in both the LHS and RHS.
28053 Known.One &= Known2.One;
28054 Known.Zero &= Known2.Zero;
28057 case X86ISD::UDIVREM8_ZEXT_HREG:
28058 // TODO: Support more than just the zero extended bits?
28059 if (Op.getResNo() != 1)
28061 // The remainder is zero extended.
28062 Known.Zero.setBitsFrom(8);
28067 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
28068 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
28069 unsigned Depth) const {
28070 unsigned VTBits = Op.getScalarValueSizeInBits();
28071 unsigned Opcode = Op.getOpcode();
28073 case X86ISD::SETCC_CARRY:
28074 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
28077 case X86ISD::VSEXT: {
28078 // TODO: Add DemandedElts support.
28079 SDValue Src = Op.getOperand(0);
28080 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28081 Tmp += VTBits - Src.getScalarValueSizeInBits();
28085 case X86ISD::VTRUNC: {
28086 // TODO: Add DemandedElts support.
28087 SDValue Src = Op.getOperand(0);
28088 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
28089 assert(VTBits < NumSrcBits && "Illegal truncation input type");
28090 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28091 if (Tmp > (NumSrcBits - VTBits))
28092 return Tmp - (NumSrcBits - VTBits);
28096 case X86ISD::PACKSS: {
28097 // PACKSS is just a truncation if the sign bits extend to the packed size.
28098 // TODO: Add DemandedElts support.
28099 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
28100 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
28101 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
28102 unsigned Tmp = std::min(Tmp0, Tmp1);
28103 if (Tmp > (SrcBits - VTBits))
28104 return Tmp - (SrcBits - VTBits);
28108 case X86ISD::VSHLI: {
28109 SDValue Src = Op.getOperand(0);
28110 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28111 if (ShiftVal.uge(VTBits))
28112 return VTBits; // Shifted all bits out --> zero.
28113 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28114 if (ShiftVal.uge(Tmp))
28115 return 1; // Shifted all sign bits out --> unknown.
28116 return Tmp - ShiftVal.getZExtValue();
28119 case X86ISD::VSRAI: {
28120 SDValue Src = Op.getOperand(0);
28121 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28122 if (ShiftVal.uge(VTBits - 1))
28123 return VTBits; // Sign splat.
28124 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28126 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
28129 case X86ISD::PCMPGT:
28130 case X86ISD::PCMPEQ:
28132 case X86ISD::VPCOM:
28133 case X86ISD::VPCOMU:
28134 // Vector compares return zero/all-bits result values.
28137 case X86ISD::CMOV: {
28138 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
28139 if (Tmp0 == 1) return 1; // Early out.
28140 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
28141 return std::min(Tmp0, Tmp1);
28143 case X86ISD::SDIVREM8_SEXT_HREG:
28144 // TODO: Support more than just the sign extended bits?
28145 if (Op.getResNo() != 1)
28147 // The remainder is sign extended.
28155 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
28156 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
28157 return N->getOperand(0);
28161 /// Returns true (and the GlobalValue and the offset) if the node is a
28162 /// GlobalAddress + offset.
28163 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
28164 const GlobalValue* &GA,
28165 int64_t &Offset) const {
28166 if (N->getOpcode() == X86ISD::Wrapper) {
28167 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
28168 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
28169 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
28173 return TargetLowering::isGAPlusOffset(N, GA, Offset);
28176 // Attempt to match a combined shuffle mask against supported unary shuffle instructions.
28178 // TODO: Investigate sharing more of this with shuffle lowering.
28179 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28180 bool AllowFloatDomain, bool AllowIntDomain,
28181 SDValue &V1, const SDLoc &DL,
28183 const X86Subtarget &Subtarget,
28184 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
28185 unsigned NumMaskElts = Mask.size();
28186 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
28188 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
28189 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
28190 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
28191 Shuffle = X86ISD::VZEXT_MOVL;
28192 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28196 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
28197 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
28198 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
28199 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
28200 unsigned MaxScale = 64 / MaskEltSize;
28201 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
28203 unsigned NumDstElts = NumMaskElts / Scale;
28204 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
28205 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
28206 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
28209 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
28210 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
28211 MVT::getIntegerVT(MaskEltSize);
28212 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
28214 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
28215 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
28216 Shuffle = unsigned(X86ISD::VZEXT);
28218 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
28220 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
28221 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
28227 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
28228 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
28229 isUndefOrEqual(Mask[0], 0) &&
28230 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
28231 Shuffle = X86ISD::VZEXT_MOVL;
28232 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28236 // Check if we have SSE3, which will let us use MOVDDUP etc. These
28237 // instructions are no slower than UNPCKLPD but have the option to
28238 // fold their input operand, even from an unaligned memory load.
28239 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
28240 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
28241 Shuffle = X86ISD::MOVDDUP;
28242 SrcVT = DstVT = MVT::v2f64;
28245 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28246 Shuffle = X86ISD::MOVSLDUP;
28247 SrcVT = DstVT = MVT::v4f32;
28250 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
28251 Shuffle = X86ISD::MOVSHDUP;
28252 SrcVT = DstVT = MVT::v4f32;
28257 if (MaskVT.is256BitVector() && AllowFloatDomain) {
28258 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
28259 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28260 Shuffle = X86ISD::MOVDDUP;
28261 SrcVT = DstVT = MVT::v4f64;
28264 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28265 Shuffle = X86ISD::MOVSLDUP;
28266 SrcVT = DstVT = MVT::v8f32;
28269 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
28270 Shuffle = X86ISD::MOVSHDUP;
28271 SrcVT = DstVT = MVT::v8f32;
28276 if (MaskVT.is512BitVector() && AllowFloatDomain) {
28277 assert(Subtarget.hasAVX512() &&
28278 "AVX512 required for 512-bit vector shuffles");
28279 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28280 Shuffle = X86ISD::MOVDDUP;
28281 SrcVT = DstVT = MVT::v8f64;
28284 if (isTargetShuffleEquivalent(
28285 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
28286 Shuffle = X86ISD::MOVSLDUP;
28287 SrcVT = DstVT = MVT::v16f32;
28290 if (isTargetShuffleEquivalent(
28291 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
28292 Shuffle = X86ISD::MOVSHDUP;
28293 SrcVT = DstVT = MVT::v16f32;
28298 // Attempt to match against broadcast-from-vector.
28299 if (Subtarget.hasAVX2()) {
28300 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
28301 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
28302 SrcVT = DstVT = MaskVT;
28303 Shuffle = X86ISD::VBROADCAST;
28311 // Attempt to match a combined shuffle mask against supported unary immediate
28312 // permute instructions.
28313 // TODO: Investigate sharing more of this with shuffle lowering.
28314 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28315 const APInt &Zeroable,
28316 bool AllowFloatDomain,
28317 bool AllowIntDomain,
28318 const X86Subtarget &Subtarget,
28319 unsigned &Shuffle, MVT &ShuffleVT,
28320 unsigned &PermuteImm) {
28321 unsigned NumMaskElts = Mask.size();
28322 unsigned InputSizeInBits = MaskVT.getSizeInBits();
28323 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
28324 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
28326 bool ContainsZeros =
28327 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28329 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
28330 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
28331 // Check for lane crossing permutes.
28332 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
28333 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
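// Illustrative example: the lane-crossing v4f64 mask {2, 3, 0, 1} swaps the
// two 128-bit halves and lowers to vpermpd with immediate 0x4E.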
28334 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
28335 Shuffle = X86ISD::VPERMI;
28336 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
28337 PermuteImm = getV4X86ShuffleImm(Mask);
28340 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
28341 SmallVector<int, 4> RepeatedMask;
28342 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
28343 Shuffle = X86ISD::VPERMI;
28344 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
28345 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
28349 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
28350 // VPERMILPD can permute with a non-repeating shuffle.
28351 Shuffle = X86ISD::VPERMILPI;
28352 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
28354 for (int i = 0, e = Mask.size(); i != e; ++i) {
28356 if (M == SM_SentinelUndef)
28358 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
28359 PermuteImm |= (M & 1) << i;
28365 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
28366 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before that we
28367 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
28368 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
28369 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
28370 SmallVector<int, 4> RepeatedMask;
28371 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28372 // Narrow the repeated mask to create 32-bit element permutes.
28373 SmallVector<int, 4> WordMask = RepeatedMask;
28374 if (MaskScalarSizeInBits == 64)
28375 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28377 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28378 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28379 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
28380 PermuteImm = getV4X86ShuffleImm(WordMask);
28385 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28386 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28387 SmallVector<int, 4> RepeatedMask;
28388 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28389 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28390 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28392 // PSHUFLW: permute lower 4 elements only.
28393 if (isUndefOrInRange(LoMask, 0, 4) &&
28394 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28395 Shuffle = X86ISD::PSHUFLW;
28396 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28397 PermuteImm = getV4X86ShuffleImm(LoMask);
28401 // PSHUFHW: permute upper 4 elements only.
28402 if (isUndefOrInRange(HiMask, 4, 8) &&
28403 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28404 // Offset the HiMask so that we can create the shuffle immediate.
28405 int OffsetHiMask[4];
28406 for (int i = 0; i != 4; ++i)
28407 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28409 Shuffle = X86ISD::PSHUFHW;
28410 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28411 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
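// Illustrative example: a HiMask of {5, 4, 7, 6} becomes OffsetHiMask
// {1, 0, 3, 2} and a PSHUFHW immediate of 0xB1.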
28417 // Attempt to match against byte/bit shifts.
28418 // FIXME: Add 512-bit support.
28419 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28420 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28421 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28422 MaskScalarSizeInBits, Mask,
28423 0, Zeroable, Subtarget);
28424 if (0 < ShiftAmt) {
28425 PermuteImm = (unsigned)ShiftAmt;
28433 // Attempt to match a combined unary shuffle mask against supported binary
28434 // shuffle instructions.
28435 // TODO: Investigate sharing more of this with shuffle lowering.
28436 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28437 bool AllowFloatDomain, bool AllowIntDomain,
28438 SDValue &V1, SDValue &V2, const SDLoc &DL,
28440 const X86Subtarget &Subtarget,
28441 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28443 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28445 if (MaskVT.is128BitVector()) {
28446 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28448 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
28449 Shuffle = X86ISD::MOVLHPS;
28450 SrcVT = DstVT = MVT::v4f32;
28453 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28455 Shuffle = X86ISD::MOVHLPS;
28456 SrcVT = DstVT = MVT::v4f32;
28459 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28460 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28462 Shuffle = X86ISD::MOVSD;
28463 SrcVT = DstVT = MaskVT;
28466 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28467 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28468 Shuffle = X86ISD::MOVSS;
28469 SrcVT = DstVT = MaskVT;
28474 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28475 // TODO: add support for 256/512-bit types.
28476 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28477 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28484 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28485 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28486 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28487 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28488 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28489 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28490 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28492 SrcVT = DstVT = MaskVT;
28493 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28494 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
28502 static bool matchBinaryPermuteVectorShuffle(
28503 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
28504 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
28505 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
28506 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
28507 unsigned NumMaskElts = Mask.size();
28508 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28510 // Attempt to match against PALIGNR byte rotate.
28511 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28512 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28513 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28514 if (0 < ByteRotation) {
28515 Shuffle = X86ISD::PALIGNR;
28516 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28517 PermuteImm = ByteRotation;
28522 // Attempt to combine to X86ISD::BLENDI.
28523 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28524 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28525 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28526 uint64_t BlendMask = 0;
28527 bool ForceV1Zero = false, ForceV2Zero = false;
28528 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28529 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28531 if (MaskVT == MVT::v16i16) {
28532 // We can only use v16i16 PBLENDW if the lanes are repeated.
28533 SmallVector<int, 8> RepeatedMask;
28534 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28536 assert(RepeatedMask.size() == 8 &&
28537 "Repeated mask size doesn't match!");
28539 for (int i = 0; i < 8; ++i)
28540 if (RepeatedMask[i] >= 8)
28541 PermuteImm |= 1 << i;
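// Each set bit in the 8-bit immediate selects the corresponding element of
// the second source within every 128-bit lane.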
28542 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28543 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28544 Shuffle = X86ISD::BLENDI;
28545 ShuffleVT = MaskVT;
28549 // Determine a type compatible with X86ISD::BLENDI.
28550 ShuffleVT = MaskVT;
28551 if (Subtarget.hasAVX2()) {
28552 if (ShuffleVT == MVT::v4i64)
28553 ShuffleVT = MVT::v8i32;
28554 else if (ShuffleVT == MVT::v2i64)
28555 ShuffleVT = MVT::v4i32;
28557 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28558 ShuffleVT = MVT::v8i16;
28559 else if (ShuffleVT == MVT::v4i64)
28560 ShuffleVT = MVT::v4f64;
28561 else if (ShuffleVT == MVT::v8i32)
28562 ShuffleVT = MVT::v8f32;
28565 if (!ShuffleVT.isFloatingPoint()) {
28566 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28568 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28569 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28570 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28573 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28574 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28575 PermuteImm = (unsigned)BlendMask;
28576 Shuffle = X86ISD::BLENDI;
28582 // Attempt to combine to INSERTPS.
28583 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28584 MaskVT.is128BitVector()) {
28585 if (Zeroable.getBoolValue() &&
28586 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28587 Shuffle = X86ISD::INSERTPS;
28588 ShuffleVT = MVT::v4f32;
28593 // Attempt to combine to SHUFPD.
28594 if (AllowFloatDomain && EltSizeInBits == 64 &&
28595 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28596 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28597 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28598 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28599 Shuffle = X86ISD::SHUFP;
28600 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28605 // Attempt to combine to SHUFPS.
28606 if (AllowFloatDomain && EltSizeInBits == 32 &&
28607 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28608 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28609 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28610 SmallVector<int, 4> RepeatedMask;
28611 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28612 // Match each half of the repeated mask to determine whether it just
28613 // references one of the vectors, is zeroable, or is entirely undef.
28614 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28615 int M0 = RepeatedMask[Offset];
28616 int M1 = RepeatedMask[Offset + 1];
28618 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28619 return DAG.getUNDEF(MaskVT);
28620 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28621 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28622 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28623 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28624 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28625 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28626 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28628 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
28629 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28630 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28637 int ShufMask[4] = {-1, -1, -1, -1};
28638 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
28639 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
28644 Shuffle = X86ISD::SHUFP;
28645 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
28646 PermuteImm = getV4X86ShuffleImm(ShufMask);
28655 /// \brief Combine an arbitrary chain of shuffles into a single instruction if possible.
28658 /// This is the leaf of the recursive combine below. When we have found some
28659 /// chain of single-use x86 shuffle instructions and accumulated the combined
28660 /// shuffle mask represented by them, this will try to pattern match that mask
28661 /// into either a single instruction if there is a special purpose instruction
28662 /// for this operation, or into a PSHUFB instruction which is a fully general
28663 /// instruction but should only be used to replace chains over a certain depth.
28664 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28665 ArrayRef<int> BaseMask, int Depth,
28666 bool HasVariableMask, SelectionDAG &DAG,
28667 TargetLowering::DAGCombinerInfo &DCI,
28668 const X86Subtarget &Subtarget) {
28669 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28670 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28671 "Unexpected number of shuffle inputs!");
28673 // Find the inputs that enter the chain. Note that multiple uses are OK
28674 // here; we're not going to remove the operands we find.
28675 bool UnaryShuffle = (Inputs.size() == 1);
28676 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28677 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28678 : peekThroughBitcasts(Inputs[1]));
28680 MVT VT1 = V1.getSimpleValueType();
28681 MVT VT2 = V2.getSimpleValueType();
28682 MVT RootVT = Root.getSimpleValueType();
28683 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28684 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28685 "Vector size mismatch");
28690 unsigned NumBaseMaskElts = BaseMask.size();
28691 if (NumBaseMaskElts == 1) {
28692 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
28693 return DAG.getBitcast(RootVT, V1);
28696 unsigned RootSizeInBits = RootVT.getSizeInBits();
28697 unsigned NumRootElts = RootVT.getVectorNumElements();
28698 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28699 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28700 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
28702 // Don't combine if we are an AVX512/EVEX target and the mask element size
28703 // is different from the root element size - this would prevent writemasks
28704 // from being reused.
28705 // TODO - this currently prevents all lane shuffles from occurring.
28706 // TODO - check for writemasks usage instead of always preventing combining.
28707 // TODO - attempt to narrow Mask back to writemask size.
28708 bool IsEVEXShuffle =
28709 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
28711 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28713 // Handle 128-bit lane shuffles of 256-bit vectors.
28714 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28715 // we need to use the zeroing feature.
28716 // TODO - this should support binary shuffles.
28717 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28718 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28719 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28720 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28721 return SDValue(); // Nothing to do!
28722 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
28723 unsigned PermMask = 0;
28724 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28725 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
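// Illustrative example: a BaseMask of {1, SM_SentinelZero} places the upper
// 128-bit lane of V1 in the low half and zeroes the high half, giving a
// PermMask of 0x81.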
28727 Res = DAG.getBitcast(ShuffleVT, V1);
28728 DCI.AddToWorklist(Res.getNode());
28729 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28730 DAG.getUNDEF(ShuffleVT),
28731 DAG.getConstant(PermMask, DL, MVT::i8));
28732 DCI.AddToWorklist(Res.getNode());
28733 return DAG.getBitcast(RootVT, Res);
28736 // For masks that have been widened to 128-bit elements or more,
28737 // narrow back down to 64-bit elements.
28738 SmallVector<int, 64> Mask;
28739 if (BaseMaskEltSizeInBits > 64) {
28740 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28741 int MaskScale = BaseMaskEltSizeInBits / 64;
28742 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
28744 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
28747 unsigned NumMaskElts = Mask.size();
28748 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28750 // Determine the effective mask value type.
28751 FloatDomain &= (32 <= MaskEltSizeInBits);
28752 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28753 : MVT::getIntegerVT(MaskEltSizeInBits);
28754 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28756 // Only allow legal mask types.
28757 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
28760 // Attempt to match the mask against known shuffle patterns.
28761 MVT ShuffleSrcVT, ShuffleVT;
28762 unsigned Shuffle, PermuteImm;
28764 // Which shuffle domains are permitted?
28765 // Permit domain crossing at higher combine depths.
28766 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28767 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28768 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28770 // Determine zeroable mask elements.
28771 APInt Zeroable(NumMaskElts, 0);
28772 for (unsigned i = 0; i != NumMaskElts; ++i)
28773 if (isUndefOrZero(Mask[i]))
28774 Zeroable.setBit(i);
28776 if (UnaryShuffle) {
28777 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
28778 // directly if we don't shuffle the lower element and we shuffle the upper
28779 // (zero) elements within themselves.
28780 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28781 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28782 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28783 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28784 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28785 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
28786 return DAG.getBitcast(RootVT, V1);
28790 SDValue NewV1 = V1; // Save operand in case early exit happens.
28791 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28792 NewV1, DL, DAG, Subtarget, Shuffle,
28793 ShuffleSrcVT, ShuffleVT) &&
28794 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28795 if (Depth == 1 && Root.getOpcode() == Shuffle)
28796 return SDValue(); // Nothing to do!
28797 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
28798 DCI.AddToWorklist(Res.getNode());
28799 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28800 DCI.AddToWorklist(Res.getNode());
28801 return DAG.getBitcast(RootVT, Res);
28804 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28805 AllowIntDomain, Subtarget, Shuffle,
28806 ShuffleVT, PermuteImm) &&
28807 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28808 if (Depth == 1 && Root.getOpcode() == Shuffle)
28809 return SDValue(); // Nothing to do!
28810 Res = DAG.getBitcast(ShuffleVT, V1);
28811 DCI.AddToWorklist(Res.getNode());
28812 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28813 DAG.getConstant(PermuteImm, DL, MVT::i8));
28814 DCI.AddToWorklist(Res.getNode());
28815 return DAG.getBitcast(RootVT, Res);
28819 SDValue NewV1 = V1; // Save operands in case early exit happens.
28820 SDValue NewV2 = V2;
28821 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28822 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
28823 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
28824 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28825 if (Depth == 1 && Root.getOpcode() == Shuffle)
28826 return SDValue(); // Nothing to do!
28827 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
28828 DCI.AddToWorklist(NewV1.getNode());
28829 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
28830 DCI.AddToWorklist(NewV2.getNode());
28831 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
28832 DCI.AddToWorklist(Res.getNode());
28833 return DAG.getBitcast(RootVT, Res);
28836 NewV1 = V1; // Save operands in case early exit happens.
28838 if (matchBinaryPermuteVectorShuffle(
28839 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
28840 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
28841 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
28842 if (Depth == 1 && Root.getOpcode() == Shuffle)
28843 return SDValue(); // Nothing to do!
28844 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
28845 DCI.AddToWorklist(NewV1.getNode());
28846 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
28847 DCI.AddToWorklist(NewV2.getNode());
28848 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
28849 DAG.getConstant(PermuteImm, DL, MVT::i8));
28850 DCI.AddToWorklist(Res.getNode());
28851 return DAG.getBitcast(RootVT, Res);
28854 // Typically from here on, we need an integer version of MaskVT.
28855 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28856 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28858 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28859 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28860 uint64_t BitLen, BitIdx;
28861 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
28863 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28864 return SDValue(); // Nothing to do!
28865 V1 = DAG.getBitcast(IntMaskVT, V1);
28866 DCI.AddToWorklist(V1.getNode());
28867 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28868 DAG.getConstant(BitLen, DL, MVT::i8),
28869 DAG.getConstant(BitIdx, DL, MVT::i8));
28870 DCI.AddToWorklist(Res.getNode());
28871 return DAG.getBitcast(RootVT, Res);
28874 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28875 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28876 return SDValue(); // Nothing to do!
28877 V1 = DAG.getBitcast(IntMaskVT, V1);
28878 DCI.AddToWorklist(V1.getNode());
28879 V2 = DAG.getBitcast(IntMaskVT, V2);
28880 DCI.AddToWorklist(V2.getNode());
28881 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28882 DAG.getConstant(BitLen, DL, MVT::i8),
28883 DAG.getConstant(BitIdx, DL, MVT::i8));
28884 DCI.AddToWorklist(Res.getNode());
28885 return DAG.getBitcast(RootVT, Res);
28889 // Don't try to re-form single instruction chains under any circumstances now
28890 // that we've done encoding canonicalization for them.
28894 // Depth threshold above which we can efficiently use variable mask shuffles.
28895 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
28896 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
28898 bool MaskContainsZeros =
28899 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28901 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28902 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28903 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28904 ((Subtarget.hasAVX2() &&
28905 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28906 (Subtarget.hasAVX512() &&
28907 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28908 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28909 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28910 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28911 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28912 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28913 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28914 DCI.AddToWorklist(VPermMask.getNode());
28915 Res = DAG.getBitcast(MaskVT, V1);
28916 DCI.AddToWorklist(Res.getNode());
28917 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28918 DCI.AddToWorklist(Res.getNode());
28919 return DAG.getBitcast(RootVT, Res);
28922 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28923 // vector as the second source.
28924 if (UnaryShuffle && AllowVariableMask &&
28925 ((Subtarget.hasAVX512() &&
28926 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28927 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28928 (Subtarget.hasVLX() &&
28929 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28930 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28931 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28932 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28933 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28934 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28935 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
28936 for (unsigned i = 0; i != NumMaskElts; ++i)
28937 if (Mask[i] == SM_SentinelZero)
28938 Mask[i] = NumMaskElts + i;
28940 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28941 DCI.AddToWorklist(VPermMask.getNode());
28942 Res = DAG.getBitcast(MaskVT, V1);
28943 DCI.AddToWorklist(Res.getNode());
28944 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28945 DCI.AddToWorklist(Zero.getNode());
28946 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28947 DCI.AddToWorklist(Res.getNode());
28948 return DAG.getBitcast(RootVT, Res);
28951 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28952 if (AllowVariableMask && !MaskContainsZeros &&
28953 ((Subtarget.hasAVX512() &&
28954 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28955 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28956 (Subtarget.hasVLX() &&
28957 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28958 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28959 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28960 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28961 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28962 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28963 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28964 DCI.AddToWorklist(VPermMask.getNode());
28965 V1 = DAG.getBitcast(MaskVT, V1);
28966 DCI.AddToWorklist(V1.getNode());
28967 V2 = DAG.getBitcast(MaskVT, V2);
28968 DCI.AddToWorklist(V2.getNode());
28969 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
28970 DCI.AddToWorklist(Res.getNode());
28971 return DAG.getBitcast(RootVT, Res);
28976 // See if we can combine a single input shuffle with zeros to a bit-mask,
28977 // which is much simpler than any shuffle.
28978 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
28979 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
28980 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
28981 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
28982 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
28983 APInt UndefElts(NumMaskElts, 0);
28984 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
28985 for (unsigned i = 0; i != NumMaskElts; ++i) {
28986 int M = Mask[i];
28987 if (M == SM_SentinelUndef) {
28988 UndefElts.setBit(i);
28989 continue;
28990 }
28991 if (M == SM_SentinelZero)
28992 continue;
28993 EltBits[i] = AllOnes;
28994 }
28995 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
28996 DCI.AddToWorklist(BitMask.getNode());
28997 Res = DAG.getBitcast(MaskVT, V1);
28998 DCI.AddToWorklist(Res.getNode());
28999 unsigned AndOpcode =
29000 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29001 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29002 DCI.AddToWorklist(Res.getNode());
29003 return DAG.getBitcast(RootVT, Res);
29004 }
29006 // If we have a single input shuffle with different shuffle patterns in the
29007 // 128-bit lanes, lower it to VPERMILPV with a variable mask.
29008 // TODO Combine other mask types at higher depths.
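// Note: the variable form of VPERMILPS/VPERMILPV selects within each 128-bit
// lane using the low bits of each element's control, which is why the control
// vector below is built from M % 4.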
29009 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29010 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29011 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29012 SmallVector<SDValue, 16> VPermIdx;
29013 for (int M : Mask) {
29014 SDValue Idx =
29015 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29016 VPermIdx.push_back(Idx);
29017 }
29018 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29019 DCI.AddToWorklist(VPermMask.getNode());
29020 Res = DAG.getBitcast(MaskVT, V1);
29021 DCI.AddToWorklist(Res.getNode());
29022 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29023 DCI.AddToWorklist(Res.getNode());
29024 return DAG.getBitcast(RootVT, Res);
29025 }
29027 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29028 // to VPERMIL2PD/VPERMIL2PS.
29029 if (AllowVariableMask && Subtarget.hasXOP() &&
29030 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29031 MaskVT == MVT::v8f32)) {
29032 // VPERMIL2 Operation.
29033 // Bits[3] - Match Bit.
29034 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29035 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
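// A control value of 8 below sets only the match bit; together with the M2Z
// immediate this marks the element to be zeroed.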
29036 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29037 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29038 SmallVector<int, 8> VPerm2Idx;
29039 unsigned M2ZImm = 0;
29040 for (int M : Mask) {
29041 if (M == SM_SentinelUndef) {
29042 VPerm2Idx.push_back(-1);
29043 continue;
29044 }
29045 if (M == SM_SentinelZero) {
29046 M2ZImm = 2;
29047 VPerm2Idx.push_back(8);
29048 continue;
29049 }
29050 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29051 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
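// (PD selectors live in Bits[2:1] rather than Bits[1:0], hence the doubling
// above for 64-bit elements.)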
29052 VPerm2Idx.push_back(Index);
29053 }
29054 V1 = DAG.getBitcast(MaskVT, V1);
29055 DCI.AddToWorklist(V1.getNode());
29056 V2 = DAG.getBitcast(MaskVT, V2);
29057 DCI.AddToWorklist(V2.getNode());
29058 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29059 DCI.AddToWorklist(VPerm2MaskOp.getNode());
29060 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29061 DAG.getConstant(M2ZImm, DL, MVT::i8));
29062 DCI.AddToWorklist(Res.getNode());
29063 return DAG.getBitcast(RootVT, Res);
29064 }
29066 // If we have 3 or more shuffle instructions or a chain involving a variable
29067 // mask, we can replace them with a single PSHUFB instruction profitably.
29068 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29069 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29070 // more aggressive.
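// PSHUFB zeroes a destination byte whenever the control byte's MSB is set,
// which is why zeroable elements get the control value 255 (0xFF) below.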
29071 if (UnaryShuffle && AllowVariableMask &&
29072 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29073 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29074 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29075 SmallVector<SDValue, 16> PSHUFBMask;
29076 int NumBytes = RootVT.getSizeInBits() / 8;
29077 int Ratio = NumBytes / NumMaskElts;
29078 for (int i = 0; i < NumBytes; ++i) {
29079 int M = Mask[i / Ratio];
29080 if (M == SM_SentinelUndef) {
29081 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
29082 continue;
29083 }
29084 if (M == SM_SentinelZero) {
29085 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29086 continue;
29087 }
29088 M = Ratio * M + i % Ratio;
29089 assert((M / 16) == (i / 16) && "Lane crossing detected");
29090 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29091 }
29092 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29093 Res = DAG.getBitcast(ByteVT, V1);
29094 DCI.AddToWorklist(Res.getNode());
29095 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29096 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
29097 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29098 DCI.AddToWorklist(Res.getNode());
29099 return DAG.getBitcast(RootVT, Res);
29100 }
29102 // With XOP, if we have a 128-bit binary input shuffle we can always combine
29103 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
29104 // slower than PSHUFB on targets that support both.
29105 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
29106 // VPPERM Mask Operation
29107 // Bits[4:0] - Byte Index (0 - 31)
29108 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
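// A control byte of 128 (0x80) selects operation 4 (ZERO), used below for
// zeroable elements.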
29109 SmallVector<SDValue, 16> VPPERMMask;
29110 int NumBytes = 16;
29111 int Ratio = NumBytes / NumMaskElts;
29112 for (int i = 0; i < NumBytes; ++i) {
29113 int M = Mask[i / Ratio];
29114 if (M == SM_SentinelUndef) {
29115 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
29116 continue;
29117 }
29118 if (M == SM_SentinelZero) {
29119 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
29120 continue;
29121 }
29122 M = Ratio * M + i % Ratio;
29123 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29124 }
29125 MVT ByteVT = MVT::v16i8;
29126 V1 = DAG.getBitcast(ByteVT, V1);
29127 DCI.AddToWorklist(V1.getNode());
29128 V2 = DAG.getBitcast(ByteVT, V2);
29129 DCI.AddToWorklist(V2.getNode());
29130 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
29131 DCI.AddToWorklist(VPPERMMaskOp.getNode());
29132 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
29133 DCI.AddToWorklist(Res.getNode());
29134 return DAG.getBitcast(RootVT, Res);
29135 }
29137 // Failed to find any combines.
29138 return SDValue();
29139 }
29141 // Attempt to constant fold all of the constant source ops.
29142 // Returns true if the entire shuffle is folded to a constant.
29143 // TODO: Extend this to merge multiple constant Ops and update the mask.
29144 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
29145 ArrayRef<int> Mask, SDValue Root,
29146 bool HasVariableMask,
29147 SelectionDAG &DAG,
29148 TargetLowering::DAGCombinerInfo &DCI,
29149 const X86Subtarget &Subtarget) {
29150 MVT VT = Root.getSimpleValueType();
29152 unsigned SizeInBits = VT.getSizeInBits();
29153 unsigned NumMaskElts = Mask.size();
29154 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
29155 unsigned NumOps = Ops.size();
29157 // Extract constant bits from each source op.
29158 bool OneUseConstantOp = false;
29159 SmallVector<APInt, 16> UndefEltsOps(NumOps);
29160 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
29161 for (unsigned i = 0; i != NumOps; ++i) {
29162 SDValue SrcOp = Ops[i];
29163 OneUseConstantOp |= SrcOp.hasOneUse();
29164 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
29165 RawBitsOps[i]))
29166 return SDValue();
29167 }
29169 // Only fold if at least one of the constants is only used once or
29170 // the combined shuffle has included a variable mask shuffle; this
29171 // avoids constant pool bloat.
29172 if (!OneUseConstantOp && !HasVariableMask)
29173 return SDValue();
29175 // Shuffle the constant bits according to the mask.
29176 APInt UndefElts(NumMaskElts, 0);
29177 APInt ZeroElts(NumMaskElts, 0);
29178 APInt ConstantElts(NumMaskElts, 0);
29179 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
29180 APInt::getNullValue(MaskSizeInBits));
29181 for (unsigned i = 0; i != NumMaskElts; ++i) {
29182 int M = Mask[i];
29183 if (M == SM_SentinelUndef) {
29184 UndefElts.setBit(i);
29185 continue;
29186 } else if (M == SM_SentinelZero) {
29187 ZeroElts.setBit(i);
29188 continue;
29189 }
29190 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
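// Mask indices address the concatenation of all source ops, so split M into
// the source op index and the element index within that op.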
29192 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
29193 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
29195 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
29196 if (SrcUndefElts[SrcMaskIdx]) {
29197 UndefElts.setBit(i);
29201 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
29202 APInt &Bits = SrcEltBits[SrcMaskIdx];
29204 ZeroElts.setBit(i);
29208 ConstantElts.setBit(i);
29209 ConstantBitData[i] = Bits;
29210 }
29211 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
29213 // Create the constant data.
29214 MVT MaskSVT;
29215 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
29216 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
29217 else
29218 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
29220 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
29222 SDLoc DL(Root);
29223 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
29224 DCI.AddToWorklist(CstOp.getNode());
29225 return DAG.getBitcast(VT, CstOp);
29226 }
29228 /// \brief Fully generic combining of x86 shuffle instructions.
29230 /// This should be the last combine run over the x86 shuffle instructions. Once
29231 /// they have been fully optimized, this will recursively consider all chains
29232 /// of single-use shuffle instructions, build a generic model of the cumulative
29233 /// shuffle operation, and check for simpler instructions which implement this
29234 /// operation. We use this primarily for two purposes:
29236 /// 1) Collapse generic shuffles to specialized single instructions when
29237 /// equivalent. In most cases, this is just an encoding size win, but
29238 /// sometimes we will collapse multiple generic shuffles into a single
29239 /// special-purpose shuffle.
29240 /// 2) Look for sequences of shuffle instructions with 3 or more total
29241 /// instructions, and replace them with the slightly more expensive SSSE3
29242 /// PSHUFB instruction if available. We do this as the last combining step
29243 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
29244 /// a suitable short sequence of other instructions. The PSHUFB will either
29245 /// use a register or have to read from memory and so is slightly (but only
29246 /// slightly) more expensive than the other shuffle instructions.
29248 /// Because this is inherently a quadratic operation (for each shuffle in
29249 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
29250 /// This should never be an issue in practice as the shuffle lowering doesn't
29251 /// produce sequences of more than 8 instructions.
29253 /// FIXME: We will currently miss some cases where the redundant shuffling
29254 /// would simplify under the threshold for PSHUFB formation because of
29255 /// combine-ordering. To fix this, we should do the redundant instruction
29256 /// combining in this recursive walk.
29257 static SDValue combineX86ShufflesRecursively(
29258 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
29259 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
29260 bool HasVariableMask, SelectionDAG &DAG,
29261 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
29262 // Bound the depth of our recursive combine because this is ultimately
29263 // quadratic in nature.
29264 const unsigned MaxRecursionDepth = 8;
29265 if (Depth > MaxRecursionDepth)
29266 return SDValue();
29268 // Directly rip through bitcasts to find the underlying operand.
29269 SDValue Op = SrcOps[SrcOpIndex];
29270 Op = peekThroughOneUseBitcasts(Op);
29272 MVT VT = Op.getSimpleValueType();
29273 if (!VT.isVector())
29274 return SDValue(); // Bail if we hit a non-vector.
29276 assert(Root.getSimpleValueType().isVector() &&
29277 "Shuffles operate on vector types!");
29278 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
29279 "Can only combine shuffles of the same vector register size.");
29281 // Extract target shuffle mask and resolve sentinels and inputs.
29282 SmallVector<int, 64> OpMask;
29283 SmallVector<SDValue, 2> OpInputs;
29284 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
29285 return SDValue();
29287 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
29288 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
29289 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
29291 // Add the inputs to the Ops list, avoiding duplicates.
29292 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
29294 int InputIdx0 = -1, InputIdx1 = -1;
29295 for (int i = 0, e = Ops.size(); i < e; ++i) {
29296 SDValue BC = peekThroughBitcasts(Ops[i]);
29297 if (Input0 && BC == peekThroughBitcasts(Input0))
29298 InputIdx0 = i;
29299 if (Input1 && BC == peekThroughBitcasts(Input1))
29300 InputIdx1 = i;
29301 }
29303 if (Input0 && InputIdx0 < 0) {
29304 InputIdx0 = SrcOpIndex;
29305 Ops[SrcOpIndex] = Input0;
29306 }
29307 if (Input1 && InputIdx1 < 0) {
29308 InputIdx1 = Ops.size();
29309 Ops.push_back(Input1);
29310 }
29312 assert(((RootMask.size() > OpMask.size() &&
29313 RootMask.size() % OpMask.size() == 0) ||
29314 (OpMask.size() > RootMask.size() &&
29315 OpMask.size() % RootMask.size() == 0) ||
29316 OpMask.size() == RootMask.size()) &&
29317 "The smaller number of elements must divide the larger.");
29319 // This function can be performance-critical, so we rely on the power-of-2
29320 // knowledge that we have about the mask sizes to replace div/rem ops with
29321 // bit-masks and shifts.
29322 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
29323 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
29324 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
29325 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
29327 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
29328 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
29329 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
29330 assert((RootRatio == 1 || OpRatio == 1) &&
29331 "Must not have a ratio for both incoming and op masks!");
29333 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
29334 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
29335 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
29336 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
29337 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
29339 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
29341 // Merge this shuffle operation's mask into our accumulated mask. Note that
29342 // this shuffle's mask will be the first applied to the input, followed by the
29343 // root mask to get us all the way to the root value arrangement. The reason
29344 // for this order is that we are recursing up the operation chain.
29345 for (unsigned i = 0; i < MaskWidth; ++i) {
29346 unsigned RootIdx = i >> RootRatioLog2;
29347 if (RootMask[RootIdx] < 0) {
29348 // This is a zero or undef lane, we're done.
29349 Mask[i] = RootMask[RootIdx];
29350 continue;
29351 }
29353 unsigned RootMaskedIdx =
29354 RootRatio == 1
29355 ? RootMask[RootIdx]
29356 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
29358 // Just insert the scaled root mask value if it references an input other
29359 // than the SrcOp we're currently inserting.
29360 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
29361 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
29362 Mask[i] = RootMaskedIdx;
29363 continue;
29364 }
29366 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
29367 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
29368 if (OpMask[OpIdx] < 0) {
29369 // The incoming lanes are zero or undef, it doesn't matter which ones we
29370 // are actually using.
29371 Mask[i] = OpMask[OpIdx];
29372 continue;
29373 }
29375 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
29376 unsigned OpMaskedIdx =
29377 OpRatio == 1
29378 ? OpMask[OpIdx]
29379 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
29381 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
29382 if (OpMask[OpIdx] < (int)OpMask.size()) {
29383 assert(0 <= InputIdx0 && "Unknown target shuffle input");
29384 OpMaskedIdx += InputIdx0 * MaskWidth;
29385 } else {
29386 assert(0 <= InputIdx1 && "Unknown target shuffle input");
29387 OpMaskedIdx += InputIdx1 * MaskWidth;
29388 }
29390 Mask[i] = OpMaskedIdx;
29391 }
29393 // Handle the all undef/zero cases early.
29394 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
29395 return DAG.getUNDEF(Root.getValueType());
29397 // TODO - should we handle the mixed zero/undef case as well? Just returning
29398 // a zero mask will lose information on undef elements possibly reducing
29399 // future combine possibilities.
29400 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
29401 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
29402 SDLoc(Root));
29404 // Remove unused shuffle source ops.
29405 resolveTargetShuffleInputsAndMask(Ops, Mask);
29406 assert(!Ops.empty() && "Shuffle with no inputs detected");
29408 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
29410 // Update the list of shuffle nodes that have been combined so far.
29411 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
29412 SrcNodes.end());
29413 CombinedNodes.push_back(Op.getNode());
29415 // See if we can recurse into each shuffle source op (if it's a target
29416 // shuffle). The source op should only be combined if it either has a
29417 // single use (i.e. current Op) or all its users have already been combined.
29418 // Don't recurse if we already have more source ops than we can combine in
29419 // the remaining recursion depth.
29420 if (Ops.size() < (MaxRecursionDepth - Depth)) {
29421 for (int i = 0, e = Ops.size(); i < e; ++i)
29422 if (Ops[i].getNode()->hasOneUse() ||
29423 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29424 if (SDValue Res = combineX86ShufflesRecursively(
29425 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29426 DAG, DCI, Subtarget))
29427 return Res;
29428 }
29430 // Attempt to constant fold all of the constant source ops.
29431 if (SDValue Cst = combineX86ShufflesConstants(
29432 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29433 return Cst;
29435 // We can only combine unary and binary shuffle mask cases.
29436 if (Ops.size() > 2)
29437 return SDValue();
29439 // Minor canonicalization of the accumulated shuffle mask to make it easier
29440 // to match below. All this does is detect masks with sequential pairs of
29441 // elements, and shrink them to the half-width mask. It does this in a loop
29442 // so it will reduce the size of the mask to the minimal width mask which
29443 // performs an equivalent shuffle.
29444 SmallVector<int, 64> WidenedMask;
29445 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29446 Mask = std::move(WidenedMask);
29447 WidenedMask.clear();
29448 }
29449 // Canonicalization of binary shuffle masks to improve pattern matching by
29450 // commuting the inputs.
29451 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29452 ShuffleVectorSDNode::commuteMask(Mask);
29453 std::swap(Ops[0], Ops[1]);
29454 }
29456 // Finally, try to combine into a single shuffle instruction.
29457 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29458 DCI, Subtarget);
29459 }
29461 /// \brief Get the PSHUF-style mask from PSHUF node.
29463 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
29464 /// PSHUF-style masks that can be reused with such instructions.
29465 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29466 MVT VT = N.getSimpleValueType();
29467 SmallVector<int, 4> Mask;
29468 SmallVector<SDValue, 2> Ops;
29469 bool IsUnary;
29471 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29475 // If we have more than 128-bits, only the low 128-bits of shuffle mask
29476 // matter. Check that the upper masks are repeats and remove them.
29477 if (VT.getSizeInBits() > 128) {
29478 int LaneElts = 128 / VT.getScalarSizeInBits();
29480 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29481 for (int j = 0; j < LaneElts; ++j)
29482 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29483 "Mask doesn't repeat in high 128-bit lanes!");
29485 Mask.resize(LaneElts);
29488 switch (N.getOpcode()) {
29489 case X86ISD::PSHUFD:
29490 return Mask;
29491 case X86ISD::PSHUFLW:
29492 Mask.resize(4);
29493 return Mask;
29494 case X86ISD::PSHUFHW:
29495 Mask.erase(Mask.begin(), Mask.begin() + 4);
29496 for (int &M : Mask)
29497 M -= 4;
29498 return Mask;
29499 }
29500 llvm_unreachable("No valid shuffle instruction found!");
29501 }
29504 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
29506 /// We walk up the chain and look for a combinable shuffle, skipping over
29507 /// shuffles that we could hoist this shuffle's transformation past without
29508 /// altering anything.
29510 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29511 SelectionDAG &DAG) {
29512 assert(N.getOpcode() == X86ISD::PSHUFD &&
29513 "Called with something other than an x86 128-bit half shuffle!");
29516 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29517 // of the shuffles in the chain so that we can form a fresh chain to replace
29519 SmallVector<SDValue, 8> Chain;
29520 SDValue V = N.getOperand(0);
29521 for (; V.hasOneUse(); V = V.getOperand(0)) {
29522 switch (V.getOpcode()) {
29523 default:
29524 return SDValue(); // Nothing combined!
29526 case ISD::BITCAST:
29527 // Skip bitcasts as we always know the type for the target specific
29528 // instructions.
29529 continue;
29531 case X86ISD::PSHUFD:
29532 // Found another dword shuffle.
29533 break;
29535 case X86ISD::PSHUFLW:
29536 // Check that the low words (being shuffled) are the identity in the
29537 // dword shuffle, and the high words are self-contained.
29538 if (Mask[0] != 0 || Mask[1] != 1 ||
29539 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29540 return SDValue();
29542 Chain.push_back(V);
29543 continue;
29545 case X86ISD::PSHUFHW:
29546 // Check that the high words (being shuffled) are the identity in the
29547 // dword shuffle, and the low words are self-contained.
29548 if (Mask[2] != 2 || Mask[3] != 3 ||
29549 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29550 return SDValue();
29552 Chain.push_back(V);
29553 continue;
29555 case X86ISD::UNPCKL:
29556 case X86ISD::UNPCKH:
29557 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29558 // shuffle into a preceding word shuffle.
29559 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29560 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29561 return SDValue();
29563 // Search for a half-shuffle which we can combine with.
29564 unsigned CombineOp =
29565 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
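// UNPCKL reads the low words of its (identical) operands, so it can absorb a
// preceding PSHUFLW; UNPCKH reads the high words and pairs with PSHUFHW.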
29566 if (V.getOperand(0) != V.getOperand(1) ||
29567 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29568 return SDValue();
29569 Chain.push_back(V);
29570 V = V.getOperand(0);
29571 do {
29572 switch (V.getOpcode()) {
29573 default:
29574 return SDValue(); // Nothing to combine.
29576 case X86ISD::PSHUFLW:
29577 case X86ISD::PSHUFHW:
29578 if (V.getOpcode() == CombineOp)
29579 break;
29581 Chain.push_back(V);
29583 LLVM_FALLTHROUGH;
29584 case ISD::BITCAST:
29585 V = V.getOperand(0);
29586 continue;
29587 }
29588 break;
29589 } while (V.hasOneUse());
29592 // Break out of the loop if we break out of the switch.
29596 if (!V.hasOneUse())
29597 // We fell out of the loop without finding a viable combining instruction.
29598 return SDValue();
29600 // Merge this node's mask and our incoming mask.
29601 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29602 for (int &M : Mask)
29603 M = VMask[M];
29604 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29605 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29607 // Rebuild the chain around this new shuffle.
29608 while (!Chain.empty()) {
29609 SDValue W = Chain.pop_back_val();
29611 if (V.getValueType() != W.getOperand(0).getValueType())
29612 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29614 switch (W.getOpcode()) {
29616 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
29618 case X86ISD::UNPCKL:
29619 case X86ISD::UNPCKH:
29620 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
29623 case X86ISD::PSHUFD:
29624 case X86ISD::PSHUFLW:
29625 case X86ISD::PSHUFHW:
29626 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
29630 if (V.getValueType() != N.getValueType())
29631 V = DAG.getBitcast(N.getValueType(), V);
29633 // Return the new chain to replace N.
29637 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
29640 /// We walk up the chain, skipping shuffles of the other half and looking
29641 /// through shuffles which switch halves trying to find a shuffle of the same
29642 /// pair of dwords.
29643 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
29645 TargetLowering::DAGCombinerInfo &DCI) {
29646 assert(
29647 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
29648 "Called with something other than an x86 128-bit half shuffle!");
29650 unsigned CombineOpcode = N.getOpcode();
29652 // Walk up a single-use chain looking for a combinable shuffle.
29653 SDValue V = N.getOperand(0);
29654 for (; V.hasOneUse(); V = V.getOperand(0)) {
29655 switch (V.getOpcode()) {
29656 default:
29657 return false; // Nothing combined!
29659 case ISD::BITCAST:
29660 // Skip bitcasts as we always know the type for the target specific
29661 // instructions.
29662 continue;
29664 case X86ISD::PSHUFLW:
29665 case X86ISD::PSHUFHW:
29666 if (V.getOpcode() == CombineOpcode)
29667 break;
29669 // Other-half shuffles are no-ops.
29670 continue;
29672 // Break out of the loop if we break out of the switch.
29676 if (!V.hasOneUse())
29677 // We fell out of the loop without finding a viable combining instruction.
29678 return false;
29680 // Combine away the bottom node as its shuffle will be accumulated into
29681 // a preceding shuffle.
29682 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29684 // Record the old value.
29685 SDValue Old = V;
29687 // Merge this node's mask and our incoming mask (adjusted to account for all
29688 // the pshufd instructions encountered).
29689 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29690 for (int &M : Mask)
29691 M = VMask[M];
29692 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
29693 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29695 // Check that the shuffles didn't cancel each other out. If not, we need to
29696 // combine to the new one.
29697 if (Old != V)
29698 // Replace the combinable shuffle with the combined one, updating all users
29699 // so that we re-evaluate the chain here.
29700 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
29702 return true;
29703 }
29705 /// \brief Try to combine x86 target specific shuffles.
29706 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
29707 TargetLowering::DAGCombinerInfo &DCI,
29708 const X86Subtarget &Subtarget) {
29709 SDLoc DL(N);
29710 MVT VT = N.getSimpleValueType();
29711 SmallVector<int, 4> Mask;
29712 unsigned Opcode = N.getOpcode();
29714 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
29715 // single instruction.
29716 if (VT.getScalarSizeInBits() == 64 &&
29717 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
29718 Opcode == X86ISD::UNPCKL)) {
29719 auto BC0 = peekThroughBitcasts(N.getOperand(0));
29720 auto BC1 = peekThroughBitcasts(N.getOperand(1));
29721 EVT VT0 = BC0.getValueType();
29722 EVT VT1 = BC1.getValueType();
29723 unsigned Opcode0 = BC0.getOpcode();
29724 unsigned Opcode1 = BC1.getOpcode();
29725 if (Opcode0 == Opcode1 && VT0 == VT1 &&
29726 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
29727 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
29728 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
29729 SDValue Lo, Hi;
29730 if (Opcode == X86ISD::MOVSD) {
29731 Lo = BC1.getOperand(0);
29732 Hi = BC0.getOperand(1);
29733 } else {
29734 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29735 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29736 }
29737 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
29738 DCI.AddToWorklist(Horiz.getNode());
29739 return DAG.getBitcast(VT, Horiz);
29740 }
29741 }
29743 switch (Opcode) {
29744 case X86ISD::VBROADCAST: {
29745 // If broadcasting from another shuffle, attempt to simplify it.
29746 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
29747 SDValue Src = N.getOperand(0);
29748 SDValue BC = peekThroughBitcasts(Src);
29749 EVT SrcVT = Src.getValueType();
29750 EVT BCVT = BC.getValueType();
29751 if (isTargetShuffle(BC.getOpcode()) &&
29752 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
29753 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
29754 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
29755 SM_SentinelUndef);
29756 for (unsigned i = 0; i != Scale; ++i)
29757 DemandedMask[i] = i;
29758 if (SDValue Res = combineX86ShufflesRecursively(
29759 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
29760 /*HasVarMask*/ false, DAG, DCI, Subtarget))
29761 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
29762 DAG.getBitcast(SrcVT, Res));
29763 }
29764 return SDValue();
29765 }
29766 case X86ISD::PSHUFD:
29767 case X86ISD::PSHUFLW:
29768 case X86ISD::PSHUFHW:
29769 Mask = getPSHUFShuffleMask(N);
29770 assert(Mask.size() == 4);
29771 break;
29772 case X86ISD::UNPCKL: {
29773 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
29774 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
29775 // moves upper half elements into the lower half part. For example:
29777 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
29779 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
29781 // will be combined to:
29783 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
29785 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
29786 // happen due to advanced instructions.
29787 if (!VT.is128BitVector())
29788 return SDValue();
29790 auto Op0 = N.getOperand(0);
29791 auto Op1 = N.getOperand(1);
29792 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
29793 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
29795 unsigned NumElts = VT.getVectorNumElements();
29796 SmallVector<int, 8> ExpectedMask(NumElts, -1);
29797 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
29798 NumElts / 2);
29800 auto ShufOp = Op1.getOperand(0);
29801 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
29802 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
29803 }
29804 return SDValue();
29805 }
29806 case X86ISD::BLENDI: {
29807 SDValue V0 = N->getOperand(0);
29808 SDValue V1 = N->getOperand(1);
29809 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
29810 "Unexpected input vector types");
29812 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
29813 // operands and changing the mask to 1. This saves us a bunch of
29814 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
29815 // x86InstrInfo knows how to commute this back after instruction selection
29816 // if it would help register allocation.
29818 // TODO: If optimizing for size or a processor that doesn't suffer from
29819 // partial register update stalls, this should be transformed into a MOVSD
29820 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
29822 if (VT == MVT::v2f64)
29823 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
29824 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
29825 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
29826 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
29827 }
29829 return SDValue();
29830 }
29831 case X86ISD::MOVSD:
29832 case X86ISD::MOVSS: {
29833 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
29834 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
29835 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
29836 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
29837 if (isZero0 && isZero1)
29838 return SDValue();
29840 // We often lower to MOVSD/MOVSS from integer as well as native float
29841 // types; remove unnecessary domain-crossing bitcasts if we can to make it
29842 // easier to combine shuffles later on. We've already accounted for the
29843 // domain switching cost when we decided to lower with it.
29844 bool isFloat = VT.isFloatingPoint();
29845 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
29846 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
29847 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
29848 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
29849 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
29850 V0 = DAG.getBitcast(NewVT, V0);
29851 V1 = DAG.getBitcast(NewVT, V1);
29852 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
29853 }
29855 return SDValue();
29856 }
29857 case X86ISD::INSERTPS: {
29858 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
29859 SDValue Op0 = N.getOperand(0);
29860 SDValue Op1 = N.getOperand(1);
29861 SDValue Op2 = N.getOperand(2);
29862 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
29863 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
29864 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
29865 unsigned ZeroMask = InsertPSMask & 0xF;
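// The INSERTPS immediate encodes Bits[7:6] = source element, Bits[5:4] =
// destination element and Bits[3:0] = zero mask, matching the decode above.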
29867 // If we zero out all elements from Op0 then we don't need to reference it.
29868 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
29869 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
29870 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29872 // If we zero out the element from Op1 then we don't need to reference it.
29873 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
29874 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29875 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29877 // Attempt to merge insertps Op1 with an inner target shuffle node.
29878 SmallVector<int, 8> TargetMask1;
29879 SmallVector<SDValue, 2> Ops1;
29880 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
29881 int M = TargetMask1[SrcIdx];
29882 if (isUndefOrZero(M)) {
29883 // Zero/UNDEF insertion - zero out element and remove dependency.
29884 InsertPSMask |= (1u << DstIdx);
29885 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29886 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29888 // Update insertps mask srcidx and reference the source input directly.
29889 assert(0 <= M && M < 8 && "Shuffle index out of range");
29890 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
29891 Op1 = Ops1[M < 4 ? 0 : 1];
29892 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29893 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29896 // Attempt to merge insertps Op0 with an inner target shuffle node.
29897 SmallVector<int, 8> TargetMask0;
29898 SmallVector<SDValue, 2> Ops0;
29899 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
29900 return SDValue();
29902 bool Updated = false;
29903 bool UseInput00 = false;
29904 bool UseInput01 = false;
29905 for (int i = 0; i != 4; ++i) {
29906 int M = TargetMask0[i];
29907 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
29908 // No change if element is already zero or the inserted element.
29910 } else if (isUndefOrZero(M)) {
29911 // If the target mask is undef/zero then we must zero the element.
29912 InsertPSMask |= (1u << i);
29917 // The input vector element must be inline.
29918 if (M != i && M != (i + 4))
29921 // Determine which inputs of the target shuffle we're using.
29922 UseInput00 |= (0 <= M && M < 4);
29923 UseInput01 |= (4 <= M);
29926 // If we're not using both inputs of the target shuffle then use the
29927 // referenced input directly.
29928 if (UseInput00 && !UseInput01) {
29931 } else if (!UseInput00 && UseInput01) {
29937 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29938 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29946 // Nuke no-op shuffles that show up after combining.
29947 if (isNoopShuffleMask(Mask))
29948 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29950 // Look for simplifications involving one or two shuffle instructions.
29951 SDValue V = N.getOperand(0);
29952 switch (N.getOpcode()) {
29955 case X86ISD::PSHUFLW:
29956 case X86ISD::PSHUFHW:
29957 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
29959 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
29960 return SDValue(); // We combined away this shuffle, so we're done.
29962 // See if this reduces to a PSHUFD which is no more expensive and can
29963 // combine with more operations. Note that it has to at least flip the
29964 // dwords as otherwise it would have been removed as a no-op.
29965 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
29966 int DMask[] = {0, 1, 2, 3};
29967 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
29968 DMask[DOffset + 0] = DOffset + 1;
29969 DMask[DOffset + 1] = DOffset + 0;
29970 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29971 V = DAG.getBitcast(DVT, V);
29972 DCI.AddToWorklist(V.getNode());
29973 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
29974 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
29975 DCI.AddToWorklist(V.getNode());
29976 return DAG.getBitcast(VT, V);
29977 }
29979 // Look for shuffle patterns which can be implemented as a single unpack.
29980 // FIXME: This doesn't handle the location of the PSHUFD generically, and
29981 // only works when we have a PSHUFD followed by two half-shuffles.
29982 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
29983 (V.getOpcode() == X86ISD::PSHUFLW ||
29984 V.getOpcode() == X86ISD::PSHUFHW) &&
29985 V.getOpcode() != N.getOpcode() &&
29986 V.hasOneUse()) {
29987 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
29988 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
29989 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29990 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
29991 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29992 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29994 for (int i = 0; i < 4; ++i) {
29995 WordMask[i + NOffset] = Mask[i] + NOffset;
29996 WordMask[i + VOffset] = VMask[i] + VOffset;
29998 // Map the word mask through the DWord mask.
29999 int MappedMask[8];
30000 for (int i = 0; i < 8; ++i)
30001 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30002 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30003 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30004 // We can replace all three shuffles with an unpack.
30005 V = DAG.getBitcast(VT, D.getOperand(0));
30006 DCI.AddToWorklist(V.getNode());
30007 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30016 case X86ISD::PSHUFD:
30017 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30026 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
30027 /// operation. If true is returned then the operands of the ADDSUB(SUBADD) operation
30028 /// are written to the parameters \p Opnd0 and \p Opnd1.
30030 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
30031 /// so it is easier to generically match. We also insert dummy vector shuffle
30032 /// nodes for the operands which explicitly discard the lanes which are unused
30033 /// by this operation, to try to propagate the fact that they're unused
30034 /// through the rest of the combiner.
30035 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30036 SDValue &Opnd0, SDValue &Opnd1,
30037 bool matchSubAdd = false) {
30039 EVT VT = N->getValueType(0);
30040 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
30041 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
30042 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
30043 return false;
30045 // We only handle target-independent shuffles.
30046 // FIXME: It would be easy and harmless to use the target shuffle mask
30047 // extraction tool to support more.
30048 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30049 return false;
30051 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
30052 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
30054 SDValue V1 = N->getOperand(0);
30055 SDValue V2 = N->getOperand(1);
30057 unsigned ExpectedOpcode = matchSubAdd ? ISD::FADD : ISD::FSUB;
30058 unsigned NextExpectedOpcode = matchSubAdd ? ISD::FSUB : ISD::FADD;
30060 // We require the first shuffle operand to be the ExpectedOpcode node,
30061 // and the second to be the NextExpectedOpcode node.
30062 if (V1.getOpcode() == NextExpectedOpcode && V2.getOpcode() == ExpectedOpcode) {
30063 ShuffleVectorSDNode::commuteMask(Mask);
30064 std::swap(V1, V2);
30065 } else if (V1.getOpcode() != ExpectedOpcode || V2.getOpcode() != NextExpectedOpcode)
30066 return false;
30068 // If there are other uses of these operations we can't fold them.
30069 if (!V1->hasOneUse() || !V2->hasOneUse())
30072 // Ensure that both operations have the same operands. Note that we can
30073 // commute the FADD operands.
30074 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
30075 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30076 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30079 // We're looking for blends between FADD and FSUB nodes. We insist on these
30080 // nodes being lined up in a specific expected pattern.
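// Even result lanes must come from V1 (the ExpectedOpcode node) and odd lanes
// from V2; e.g. <0,5,2,7> takes lanes 0/2 from V1 and lanes 1/3 from V2.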
30081 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
30082 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
30083 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
30084 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
30085 8, 25, 10, 27, 12, 29, 14, 31})))
30086 return false;
30088 Opnd0 = LHS;
30089 Opnd1 = RHS;
30090 return true;
30091 }
30093 /// \brief Try to combine a shuffle into a target-specific add-sub or
30094 /// mul-add-sub node.
30095 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
30096 const X86Subtarget &Subtarget,
30097 SelectionDAG &DAG) {
30098 SDValue Opnd0, Opnd1;
30099 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1))
30100 return SDValue();
30102 EVT VT = N->getValueType(0);
30105 // Try to generate X86ISD::FMADDSUB node here.
30106 SDValue Opnd2;
30107 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30108 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
30110 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
30111 // the ADDSUB idiom has been successfully recognized. There are no known
30112 // X86 targets with 512-bit ADDSUB instructions!
30113 if (VT.is512BitVector())
30116 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
30119 /// \brief Try to combine a shuffle into a target-specific
30120 /// mul-sub-add node.
30121 static SDValue combineShuffleToFMSubAdd(SDNode *N,
30122 const X86Subtarget &Subtarget,
30123 SelectionDAG &DAG) {
30124 SDValue Opnd0, Opnd1;
30125 if (!isAddSubOrSubAdd(N, Subtarget, Opnd0, Opnd1, true))
30126 return SDValue();
30128 EVT VT = N->getValueType(0);
30131 // Try to generate X86ISD::FMSUBADD node here.
30132 SDValue Opnd2;
30133 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
30134 return DAG.getNode(X86ISD::FMSUBADD, DL, VT, Opnd0, Opnd1, Opnd2);
30139 // We are looking for a shuffle where both sources are concatenated with undef
30140 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
30141 // if we can express this as a single-source shuffle, that's preferable.
30142 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
30143 const X86Subtarget &Subtarget) {
30144 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
30147 EVT VT = N->getValueType(0);
30149 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
30150 if (!VT.is128BitVector() && !VT.is256BitVector())
30153 if (VT.getVectorElementType() != MVT::i32 &&
30154 VT.getVectorElementType() != MVT::i64 &&
30155 VT.getVectorElementType() != MVT::f32 &&
30156 VT.getVectorElementType() != MVT::f64)
30159 SDValue N0 = N->getOperand(0);
30160 SDValue N1 = N->getOperand(1);
30162 // Check that both sources are concats with undef.
30163 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
30164 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
30165 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
30166 !N1.getOperand(1).isUndef())
30169 // Construct the new shuffle mask. Elements from the first source retain their
30170 // index, but elements from the second source no longer need to skip an undef.
30171 SmallVector<int, 8> Mask;
30172 int NumElts = VT.getVectorNumElements();
30174 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30175 for (int Elt : SVOp->getMask())
30176 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
30178 SDLoc DL(N);
30179 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
30180 N1.getOperand(0));
30181 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
30182 }
30184 /// Eliminate a redundant shuffle of a horizontal math op.
30185 static SDValue foldShuffleOfHorizOp(SDNode *N) {
30186 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
30189 SDValue HOp = N->getOperand(0);
30190 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
30191 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
30194 // 128-bit horizontal math instructions are defined to operate on adjacent
30195 // lanes of each operand as:
30196 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
30197 // ...similarly for v2f64 and v8i16.
30198 // TODO: 256-bit is not the same because...x86.
30199 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
30202 // When the operands of a horizontal math op are identical, the low half of
30203 // the result is the same as the high half. If the shuffle is also replicating
30204 // low and high halves, we don't need the shuffle.
30205 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
30206 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30207 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
30208 // but this should be tied to whatever horizontal op matching and shuffle
30209 // canonicalization are producing.
30210 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
30211 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
30212 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
30213 return HOp;
30215 return SDValue();
30216 }
30218 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
30219 TargetLowering::DAGCombinerInfo &DCI,
30220 const X86Subtarget &Subtarget) {
30221 SDLoc dl(N);
30222 EVT VT = N->getValueType(0);
30223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30224 // If we have legalized the vector types, look for blends of FADD and FSUB
30225 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
30226 if (TLI.isTypeLegal(VT)) {
30227 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
30230 if (SDValue FMSubAdd = combineShuffleToFMSubAdd(N, Subtarget, DAG))
30233 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
30234 return HAddSub;
30235 }
30237 // During Type Legalization, when promoting illegal vector types,
30238 // the backend might introduce new shuffle dag nodes and bitcasts.
30240 // This code performs the following transformation:
30241 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
30242 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
30244 // We do this only if both the bitcast and the BINOP dag nodes have
30245 // one use. Also, perform this transformation only if the new binary
30246 // operation is legal. This is to avoid introducing dag nodes that
30247 // potentially need to be further expanded (or custom lowered) into a
30248 // less optimal sequence of dag nodes.
30249 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
30250 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
30251 N->getOperand(0).getOpcode() == ISD::BITCAST &&
30252 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
30253 SDValue N0 = N->getOperand(0);
30254 SDValue N1 = N->getOperand(1);
30256 SDValue BC0 = N0.getOperand(0);
30257 EVT SVT = BC0.getValueType();
30258 unsigned Opcode = BC0.getOpcode();
30259 unsigned NumElts = VT.getVectorNumElements();
30261 if (BC0.hasOneUse() && SVT.isVector() &&
30262 SVT.getVectorNumElements() * 2 == NumElts &&
30263 TLI.isOperationLegal(Opcode, VT)) {
30264 bool CanFold = false;
30270 // isOperationLegal lies for integer ops on floating point types.
30271 CanFold = VT.isInteger();
30276 // isOperationLegal lies for floating point ops on integer types.
30277 CanFold = VT.isFloatingPoint();
30281 unsigned SVTNumElts = SVT.getVectorNumElements();
30282 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30283 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
30284 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
30285 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
30286 CanFold = SVOp->getMaskElt(i) < 0;
30288 if (CanFold) {
30289 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
30290 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
30291 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
30292 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
30297 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
30298 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
30299 // consecutive, non-overlapping, and in the right order.
30300 SmallVector<SDValue, 16> Elts;
30301 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30302 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
30303 Elts.push_back(Elt);
30310 if (Elts.size() == VT.getVectorNumElements())
30311 if (SDValue LD =
30312 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
30313 return LD;
30315 // For AVX2, we sometimes want to combine
30316 // (vector_shuffle <mask> (concat_vectors t1, undef)
30317 // (concat_vectors t2, undef))
30319 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
30320 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
30321 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
30324 if (isTargetShuffle(N->getOpcode())) {
30325 SDValue Op(N, 0);
30326 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
30327 return Shuffle;
30329 // Try recursively combining arbitrary sequences of x86 shuffle
30330 // instructions into higher-order shuffles. We do this after combining
30331 // specific PSHUF instruction sequences into their minimal form so that we
30332 // can evaluate how many specialized shuffle instructions are involved in
30333 // a particular chain.
30334 if (SDValue Res = combineX86ShufflesRecursively(
30335 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
30336 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
30337 DCI.CombineTo(N, Res);
30338 return SDValue();
30339 }
30340 }
30342 return SDValue();
30343 }
30345 /// Check if a vector extract from a target-specific shuffle of a load can be
30346 /// folded into a single element load.
30347 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
30348 /// shuffles have been custom lowered so we need to handle those here.
30349 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
30350 TargetLowering::DAGCombinerInfo &DCI) {
30351 if (DCI.isBeforeLegalizeOps())
30354 SDValue InVec = N->getOperand(0);
30355 SDValue EltNo = N->getOperand(1);
30356 EVT EltVT = N->getValueType(0);
30358 if (!isa<ConstantSDNode>(EltNo))
30361 EVT OriginalVT = InVec.getValueType();
30363 // Peek through bitcasts, don't duplicate a load with other uses.
30364 InVec = peekThroughOneUseBitcasts(InVec);
30366 EVT CurrentVT = InVec.getValueType();
30367 if (!CurrentVT.isVector() ||
30368 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
30371 if (!isTargetShuffle(InVec.getOpcode()))
30374 // Don't duplicate a load with other uses.
30375 if (!InVec.hasOneUse())
30378 SmallVector<int, 16> ShuffleMask;
30379 SmallVector<SDValue, 2> ShuffleOps;
30380 bool UnaryShuffle;
30381 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
30382 ShuffleOps, ShuffleMask, UnaryShuffle))
30385 // Select the input vector, guarding against out of range extract vector.
30386 unsigned NumElems = CurrentVT.getVectorNumElements();
30387 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
30388 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
30390 if (Idx == SM_SentinelZero)
30391 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
30392 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
30393 if (Idx == SM_SentinelUndef)
30394 return DAG.getUNDEF(EltVT);
30396 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
30397 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
30398 : ShuffleOps[1];
30400 // If inputs to shuffle are the same for both ops, then allow 2 uses
30401 unsigned AllowedUses =
30402 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
30404 if (LdNode.getOpcode() == ISD::BITCAST) {
30405 // Don't duplicate a load with other uses.
30406 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
30409 AllowedUses = 1; // only allow 1 load use if we have a bitcast
30410 LdNode = LdNode.getOperand(0);
30413 if (!ISD::isNormalLoad(LdNode.getNode()))
30416 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
30418 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
30419 return SDValue();
30421 // If there's a bitcast before the shuffle, check if the load type and
30422 // alignment is valid.
30423 unsigned Align = LN0->getAlignment();
30424 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30425 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
30426 EltVT.getTypeForEVT(*DAG.getContext()));
30428 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
30431 // All checks match so transform back to vector_shuffle so that DAG combiner
30432 // can finish the job
30433 SDLoc dl(N);
30435 // Create shuffle node taking into account the case that its a unary shuffle
30436 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
30437 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
30438 ShuffleMask);
30439 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
30440 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
30441 EltNo);
30442 }
30444 // Try to match patterns such as
30445 // (i16 bitcast (v16i1 x))
30446 // ->
30447 // (i16 movmsk (v16i8 sext (v16i1 x)))
30448 // before the illegal vector is scalarized on subtargets that don't have legal
30450 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
30451 const X86Subtarget &Subtarget) {
30452 EVT VT = BitCast.getValueType();
30453 SDValue N0 = BitCast.getOperand(0);
30454 EVT VecVT = N0->getValueType(0);
30456 if (!VT.isScalarInteger() || !VecVT.isSimple())
30459 // With AVX512 vxi1 types are legal and we prefer using k-regs.
30460 // MOVMSK is supported in SSE2 or later.
30461 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
30464 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
30465 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
30466 // v8i16 and v16i16.
30467 // For these two cases, we can shuffle the upper element bytes to a
30468 // consecutive sequence at the start of the vector and treat the results as
30469 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
30470 // for v16i16 this is not the case, because the shuffle is expensive, so we
30471 // avoid sign-extending to this type entirely.
30472 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30473 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
30474 MVT SExtVT;
30475 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30476 switch (VecVT.getSimpleVT().SimpleTy) {
30477 default:
30478 return SDValue();
30479 case MVT::v2i1:
30480 SExtVT = MVT::v2i64;
30481 FPCastVT = MVT::v2f64;
30482 break;
30483 case MVT::v4i1:
30484 SExtVT = MVT::v4i32;
30485 FPCastVT = MVT::v4f32;
30486 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30487 // sign-extend to a 256-bit operation to avoid truncation.
30488 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30489 N0->getOperand(0).getValueType().is256BitVector()) {
30490 SExtVT = MVT::v4i64;
30491 FPCastVT = MVT::v4f64;
30492 }
30493 break;
30494 case MVT::v8i1:
30495 SExtVT = MVT::v8i16;
30496 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30497 // sign-extend to a 256-bit operation to match the compare.
30498 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30499 // 256-bit because the shuffle is cheaper than sign extending the result of
30500 // the setcc.
30501 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30502 (N0->getOperand(0).getValueType().is256BitVector() ||
30503 N0->getOperand(0).getValueType().is512BitVector())) {
30504 SExtVT = MVT::v8i32;
30505 FPCastVT = MVT::v8f32;
30506 }
30507 break;
30508 case MVT::v16i1:
30509 SExtVT = MVT::v16i8;
30510 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30511 // it is not profitable to sign-extend to 256-bit because this will
30512 // require an extra cross-lane shuffle which is more expensive than
30513 // truncating the result of the compare to 128-bits.
30514 break;
30515 case MVT::v32i1:
30516 SExtVT = MVT::v32i8;
30517 break;
30518 }
30520 SDLoc DL(BitCast);
30521 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30523 if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30524 // Handle pre-AVX2 cases by splitting to two v16i1's.
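// Each 128-bit half gets its own MOVMSK; the high half's 16-bit result is
// shifted into bits [31:16] and OR'd with the low half below.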
30525 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30526 MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30527 SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30528 SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30529 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30530 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30531 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30532 DAG.getConstant(16, DL, ShiftTy));
30533 V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30534 return DAG.getZExtOrTrunc(V, DL, VT);
30537 if (SExtVT == MVT::v8i16) {
30538 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
30539 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30540 DAG.getUNDEF(MVT::v8i16));
30541 } else
30542 assert(SExtVT.getScalarType() != MVT::i16 &&
30543 "Vectors of i16 must be packed");
30544 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30545 V = DAG.getBitcast(FPCastVT, V);
30546 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30547 return DAG.getZExtOrTrunc(V, DL, VT);
30550 // Convert a vXi1 constant build vector to the same width scalar integer.
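// For example (illustrative): the v4i1 build vector <1,0,1,1> becomes the i4
// constant 0b1101 -- bit Idx of the result is set whenever element Idx of the
// build vector has its low bit set, and undef elements are treated as zero.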
30551 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
30552 EVT SrcVT = Op.getValueType();
30553 assert(SrcVT.getVectorElementType() == MVT::i1 &&
30554 "Expected a vXi1 vector");
30555 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
30556 "Expected a constant build vector");
30558 APInt Imm(SrcVT.getVectorNumElements(), 0);
30559 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
30560 SDValue In = Op.getOperand(Idx);
30561 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
30562 Imm.setBit(Idx);
30563 }
30564 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
30565 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
30566 }
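// Fold a bitcast that sits between a logic op and its mask/scalar operands, so
// the logic op is performed entirely in the destination domain. Illustrative
// example (not taken from the original comments):
//   (i16 bitcast (and (v16i1 bitcast (i16 X)), Y))
//     --> (and X, (i16 bitcast Y))
// which keeps the AND on scalar integers instead of forcing a GPR <-> k-register
// round trip. Constant vXi1 operands are rewritten with
// combinevXi1ConstantToInteger for the same reason.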
30568 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30569 TargetLowering::DAGCombinerInfo &DCI,
30570 const X86Subtarget &Subtarget) {
30571 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
30573 if (!DCI.isBeforeLegalizeOps())
30576 // Only do this if we have k-registers.
30577 if (!Subtarget.hasAVX512())
30580 EVT DstVT = N->getValueType(0);
30581 SDValue Op = N->getOperand(0);
30582 EVT SrcVT = Op.getValueType();
30584 if (!Op.hasOneUse())
30587 // Look for logic ops.
30588 if (Op.getOpcode() != ISD::AND &&
30589 Op.getOpcode() != ISD::OR &&
30590 Op.getOpcode() != ISD::XOR)
30593 // Make sure we have a bitcast between mask registers and a scalar type.
30594 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
30595 DstVT.isScalarInteger()) &&
30596 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
30597 SrcVT.isScalarInteger()))
30600 SDValue LHS = Op.getOperand(0);
30601 SDValue RHS = Op.getOperand(1);
30603 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
30604 LHS.getOperand(0).getValueType() == DstVT)
30605 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
30606 DAG.getBitcast(DstVT, RHS));
30608 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
30609 RHS.getOperand(0).getValueType() == DstVT)
30610 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
30611 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
30613 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
30614 // Most of these have to move a constant from the scalar domain anyway.
30615 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
30616 RHS = combinevXi1ConstantToInteger(RHS, DAG);
30617 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
30618 DAG.getBitcast(DstVT, LHS), RHS);
30624 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
30625 TargetLowering::DAGCombinerInfo &DCI,
30626 const X86Subtarget &Subtarget) {
30627 SDValue N0 = N->getOperand(0);
30628 EVT VT = N->getValueType(0);
30629 EVT SrcVT = N0.getValueType();
30631 // Try to match patterns such as
30632 // (i16 bitcast (v16i1 x))
30633 // ->
30634 // (i16 movmsk (v16i8 sext (v16i1 x)))
30635 // before the setcc result is scalarized on subtargets that don't have legal
30636 // vxi1 types.
30637 if (DCI.isBeforeLegalize()) {
30638 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
30639 return V;
30641 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30642 // type, widen both sides to avoid a trip through memory.
30643 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
30644 Subtarget.hasAVX512()) {
30645 SDLoc dl(N);
30646 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
30647 N0 = DAG.getBitcast(MVT::v8i1, N0);
30648 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
30649 DAG.getIntPtrConstant(0, dl));
30652 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
30653 // type, widen both sides to avoid a trip through memory.
30654 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
30655 Subtarget.hasAVX512()) {
30656 SDLoc dl(N);
30657 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
30658 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
30659 Ops[0] = N0;
30660 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
30661 N0 = DAG.getBitcast(MVT::i8, N0);
30662 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
30666 // Since MMX types are special and don't usually play with other vector types,
30667 // it's better to handle them early to be sure we emit efficient code by
30668 // avoiding store-load conversions.
30669 if (VT == MVT::x86mmx) {
30670 // Detect zero-extended MMX constant vectors.
30671 APInt UndefElts;
30672 SmallVector<APInt, 2> EltBits;
30673 if (getTargetConstantBitsFromNode(N0, 32, UndefElts, EltBits) &&
30676 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
30677 DAG.getConstant(EltBits[0], DL, MVT::i32));
30680 // Detect bitcasts between i32 to x86mmx low word.
30681 if (N0.getOpcode() == ISD::BUILD_VECTOR && SrcVT == MVT::v2i32) {
30682 SDValue N00 = N0.getOperand(0);
30683 SDValue N01 = N0.getOperand(1);
30684 if (N00.getValueType() == MVT::i32 &&
30685 (N01.getOpcode() == ISD::UNDEF || isNullConstant(N01)))
30686 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
30689 // Detect bitcasts between element or subvector extraction to x86mmx.
30690 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
30691 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
30692 isNullConstant(N0.getOperand(1))) {
30693 SDValue N00 = N0.getOperand(0);
30694 if (N00.getValueType().is128BitVector())
30695 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
30696 DAG.getBitcast(MVT::v2i64, N00));
30699 // Detect bitcasts from FP_TO_SINT to x86mmx.
30700 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
30702 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
30703 DAG.getUNDEF(MVT::v2i32));
30704 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
30705 DAG.getBitcast(MVT::v2i64, Res));
30709 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
30710 // most of these to scalar anyway.
30711 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
30712 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
30713 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
30714 return combinevXi1ConstantToInteger(N0, DAG);
30717 // Try to remove bitcasts from input and output of mask arithmetic to
30718 // remove GPR<->K-register crossings.
30719 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
30722 // Convert a bitcasted integer logic operation that has one bitcasted
30723 // floating-point operand into a floating-point logic operation. This may
30724 // create a load of a constant, but that is cheaper than materializing the
30725 // constant in an integer register and transferring it to an SSE register or
30726 // transferring the SSE operand to integer register and back.
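// Illustrative example of the folds below (assuming SSE1 and f32):
//   (f32 bitcast (and (i32 bitcast X:f32), (i32 C)))
//     --> (X86ISD::FAND X, (f32 bitcast C))
// The integer AND disappears and only the constant needs a domain crossing.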
30727 unsigned FPOpcode;
30728 switch (N0.getOpcode()) {
30729 case ISD::AND: FPOpcode = X86ISD::FAND; break;
30730 case ISD::OR: FPOpcode = X86ISD::FOR; break;
30731 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
30732 default: return SDValue();
30733 }
30735 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
30736 (Subtarget.hasSSE2() && VT == MVT::f64)))
30739 SDValue LogicOp0 = N0.getOperand(0);
30740 SDValue LogicOp1 = N0.getOperand(1);
30741 SDLoc DL0(N0);
30743 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
30744 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
30745 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
30746 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
30747 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
30748 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
30750 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
30751 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
30752 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
30753 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
30754 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
30755 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
30756 }
30758 return SDValue();
30759 }
30761 // Match a binop + shuffle pyramid that represents a horizontal reduction over
30762 // the elements of a vector.
30763 // Returns the vector that is being reduced on, or SDValue() if a reduction
30764 // was not matched.
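// For example (illustrative), a 4-element integer add reduction looks like:
//   %s1 = shufflevector <4 x i32> %v, undef, <2,3,u,u>
//   %a1 = add <4 x i32> %v, %s1
//   %s2 = shufflevector <4 x i32> %a1, undef, <1,u,u,u>
//   %a2 = add <4 x i32> %a1, %s2
//   %r  = extractelement <4 x i32> %a2, i32 0
// and matching from %r returns %v with BinOp set to ISD::ADD.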
30765 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
30766 ArrayRef<ISD::NodeType> CandidateBinOps) {
30767 // The pattern must end in an extract from index 0.
30768 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
30769 !isNullConstant(Extract->getOperand(1)))
30772 SDValue Op = Extract->getOperand(0);
30773 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
30775 // Match against one of the candidate binary ops.
30776 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
30777 return Op.getOpcode() == unsigned(BinOp);
30778 }))
30779 return SDValue();
30781 // At each stage, we're looking for something that looks like:
30782 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
30783 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
30784 // i32 undef, i32 undef, i32 undef, i32 undef>
30785 // %a = binop <8 x i32> %op, %s
30786 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
30787 // we expect something like:
30788 // <4,5,6,7,u,u,u,u>
30789 // <2,3,u,u,u,u,u,u>
30790 // <1,u,u,u,u,u,u,u>
30791 unsigned CandidateBinOp = Op.getOpcode();
30792 for (unsigned i = 0; i < Stages; ++i) {
30793 if (Op.getOpcode() != CandidateBinOp)
30794 return SDValue();
30796 ShuffleVectorSDNode *Shuffle =
30797 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
30798 if (Shuffle) {
30799 Op = Op.getOperand(1);
30800 } else {
30801 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
30802 Op = Op.getOperand(0);
30803 }
30805 // The first operand of the shuffle should be the same as the other operand
30806 // of the binop.
30807 if (!Shuffle || Shuffle->getOperand(0) != Op)
30808 return SDValue();
30810 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
30811 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
30812 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
30813 return SDValue();
30814 }
30816 BinOp = CandidateBinOp;
30817 return Op;
30818 }
30820 // Given a select, detect the following pattern:
30821 // 1: %2 = zext <N x i8> %0 to <N x i32>
30822 // 2: %3 = zext <N x i8> %1 to <N x i32>
30823 // 3: %4 = sub nsw <N x i32> %2, %3
30824 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30825 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30826 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30827 // This is useful as it is the input into a SAD pattern.
30828 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
30829 SDValue &Op1) {
30830 // Check the condition of the select instruction is greater-than.
30831 SDValue SetCC = Select->getOperand(0);
30832 if (SetCC.getOpcode() != ISD::SETCC)
30834 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30835 if (CC != ISD::SETGT && CC != ISD::SETLT)
30838 SDValue SelectOp1 = Select->getOperand(1);
30839 SDValue SelectOp2 = Select->getOperand(2);
30841 // The following instructions assume SelectOp1 is the subtraction operand
30842 // and SelectOp2 is the negation operand.
30843 // In the case of SETLT this is the other way around.
30844 if (CC == ISD::SETLT)
30845 std::swap(SelectOp1, SelectOp2);
30847 // The second operand of the select should be the negation of the first
30848 // operand, which is implemented as 0 - SelectOp1.
30849 if (!(SelectOp2.getOpcode() == ISD::SUB &&
30850 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
30851 SelectOp2.getOperand(1) == SelectOp1))
30854 // The first operand of SetCC is the first operand of the select, which is the
30855 // difference between the two input vectors.
30856 if (SetCC.getOperand(0) != SelectOp1)
30859 // In SetLT case, The second operand of the comparison can be either 1 or 0.
30861 if ((CC == ISD::SETLT) &&
30862 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
30863 SplatVal.isOneValue()) ||
30864 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
30867 // In SetGT case, The second operand of the comparison can be either -1 or 0.
30868 if ((CC == ISD::SETGT) &&
30869 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30870 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30873 // The first operand of the select is the difference between the two input
30875 if (SelectOp1.getOpcode() != ISD::SUB)
30878 Op0 = SelectOp1.getOperand(0);
30879 Op1 = SelectOp1.getOperand(1);
30881 // Check if the operands of the sub are zero-extended from vectors of i8.
30882 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30883 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30884 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30885 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30891 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
30892 // to these zexts.
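// For example (illustrative), with k == 4 on SSE2: each v4i8 payload is placed
// in the low 4 bytes of an otherwise-zero v16i8, and one PSADBW then yields a
// v2i64 whose low element is sum(|a[i] - b[i]|) and whose high element is zero.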
30893 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
30894 const SDValue &Zext1, const SDLoc &DL) {
30896 // Find the appropriate width for the PSADBW.
30897 EVT InVT = Zext0.getOperand(0).getValueType();
30898 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
30900 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
30901 // fill in the missing vector elements with 0.
30902 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30903 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30904 Ops[0] = Zext0.getOperand(0);
30905 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30906 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30907 Ops[0] = Zext1.getOperand(0);
30908 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30910 // Actually build the SAD
30911 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
30912 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
30915 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
30916 // PHMINPOSUW.
30917 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
30918 const X86Subtarget &Subtarget) {
30919 // Bail without SSE41.
30920 if (!Subtarget.hasSSE41())
30923 EVT ExtractVT = Extract->getValueType(0);
30924 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
30927 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
30928 unsigned BinOp;
30929 SDValue Src = matchBinOpReduction(
30930 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
30931 if (!Src)
30932 return SDValue();
30934 EVT SrcVT = Src.getValueType();
30935 EVT SrcSVT = SrcVT.getScalarType();
30936 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
30937 return SDValue();
30939 SDLoc DL(Extract);
30940 SDValue MinPos = Src;
30942 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
30943 while (SrcVT.getSizeInBits() > 128) {
30944 unsigned NumElts = SrcVT.getVectorNumElements();
30945 unsigned NumSubElts = NumElts / 2;
30946 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
30947 unsigned SubSizeInBits = SrcVT.getSizeInBits();
30948 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
30949 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
30950 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
30952 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
30953 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
30954 "Unexpected value type");
30956 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
30957 // to flip the value accordingly.
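// For example (illustrative), an i16 SMAX reduction uses the identity
//   smax(a, b) == 0x7FFF ^ umin(a ^ 0x7FFF, b ^ 0x7FFF)
// so XOR-ing with the mask before and after the UMIN-based PHMINPOSUW produces
// the signed maximum.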
30958 SDValue Mask;
30959 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
30960 if (BinOp == ISD::SMAX)
30961 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
30962 else if (BinOp == ISD::SMIN)
30963 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
30964 else if (BinOp == ISD::UMAX)
30965 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
30967 if (Mask)
30968 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30970 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
30971 // shuffling each upper element down and insert zeros. This means that the
30972 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
30973 // ready for the PHMINPOS.
30974 if (ExtractVT == MVT::i8) {
30975 SDValue Upper = DAG.getVectorShuffle(
30976 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
30977 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
30978 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
30981 // Perform the PHMINPOS on a v8i16 vector,
30982 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
30983 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
30984 MinPos = DAG.getBitcast(SrcVT, MinPos);
30986 if (Mask)
30987 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30989 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
30990 DAG.getIntPtrConstant(0, DL));
30993 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
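// For example (illustrative), for a v16i8 vector of compare results:
//   any_of: (movmsk x) != 0      -> all-ones, else zero
//   all_of: (movmsk x) == 0xFFFF -> all-ones, else zero
// with the scalar result then truncated to the extracted element type.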
30994 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30996 const X86Subtarget &Subtarget) {
30997 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30998 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
31001 EVT ExtractVT = Extract->getValueType(0);
31002 unsigned BitWidth = ExtractVT.getSizeInBits();
31003 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
31004 ExtractVT != MVT::i8)
31007 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
31008 unsigned BinOp = 0;
31009 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
31013 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
31014 // which we can't support here for now.
31015 if (Match.getScalarValueSizeInBits() != BitWidth)
31018 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
31019 unsigned MatchSizeInBits = Match.getValueSizeInBits();
31020 if (!(MatchSizeInBits == 128 ||
31021 (MatchSizeInBits == 256 &&
31022 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
31025 // Don't bother performing this for 2-element vectors.
31026 if (Match.getValueType().getVectorNumElements() <= 2)
31029 // Check that we are extracting a reduction of all sign bits.
31030 if (DAG.ComputeNumSignBits(Match) != BitWidth)
31033 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
31034 MVT MaskVT;
31035 if (64 == BitWidth || 32 == BitWidth)
31036 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
31037 MatchSizeInBits / BitWidth);
31039 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
31041 APInt CompareBits;
31042 ISD::CondCode CondCode;
31043 if (BinOp == ISD::OR) {
31044 // any_of -> MOVMSK != 0
31045 CompareBits = APInt::getNullValue(32);
31046 CondCode = ISD::CondCode::SETNE;
31047 } else {
31048 // all_of -> MOVMSK == ((1 << NumElts) - 1)
31049 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
31050 CondCode = ISD::CondCode::SETEQ;
31051 }
31053 // Perform the select as i32/i64 and then truncate to avoid partial register
31055 unsigned ResWidth = std::max(BitWidth, 32u);
31056 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
31057 SDLoc DL(Extract);
31058 SDValue Zero = DAG.getConstant(0, DL, ResVT);
31059 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
31060 SDValue Res = DAG.getBitcast(MaskVT, Match);
31061 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
31062 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
31063 Ones, Zero, CondCode);
31064 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
31067 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
31068 const X86Subtarget &Subtarget) {
31069 // PSADBW is only supported on SSE2 and up.
31070 if (!Subtarget.hasSSE2())
31073 // Verify the type we're extracting from is any integer type above i16.
31074 EVT VT = Extract->getOperand(0).getValueType();
31075 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
31078 unsigned RegSize = 128;
31079 if (Subtarget.hasBWI())
31080 RegSize = 512;
31081 else if (Subtarget.hasAVX2())
31082 RegSize = 256;
31084 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
31085 // TODO: We should be able to handle larger vectors by splitting them before
31086 // feeding them into several SADs, and then reducing over those.
31087 if (RegSize / VT.getVectorNumElements() < 8)
31090 // Match shuffle + add pyramid.
31091 unsigned BinOp = 0;
31092 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
31094 // The operand is expected to be zero extended from i8
31095 // (verified in detectZextAbsDiff).
31096 // In order to convert to i64 and above, additional any/zero/sign
31097 // extend is expected.
31098 // The zero extend from 32 bit has no mathematical effect on the result.
31099 // Also the sign extend is basically zero extend
31100 // (extends the sign bit which is zero).
31101 // So it is correct to skip the sign/zero extend instruction.
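// For example (illustrative), an i64 SAD result usually looks like
//   %w = zext <16 x i32> %absdiff to <16 x i64>
//   ... shuffle/add reduction pyramid over %w ...
//   %r = extractelement <16 x i64> %sum, i32 0
// and the abs-diff pattern can be matched on %absdiff directly.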
31102 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
31103 Root.getOpcode() == ISD::ZERO_EXTEND ||
31104 Root.getOpcode() == ISD::ANY_EXTEND))
31105 Root = Root.getOperand(0);
31107 // If there was a match, we want Root to be a select that is the root of an
31108 // abs-diff pattern.
31109 if (!Root || (Root.getOpcode() != ISD::VSELECT))
31112 // Check whether we have an abs-diff pattern feeding into the select.
31113 SDValue Zext0, Zext1;
31114 if (!detectZextAbsDiff(Root, Zext0, Zext1))
31117 // Create the SAD instruction.
31118 SDLoc DL(Extract);
31119 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
31121 // If the original vector was wider than 8 elements, sum over the results
31122 // in the SAD vector.
31123 unsigned Stages = Log2_32(VT.getVectorNumElements());
31124 MVT SadVT = SAD.getSimpleValueType();
31125 if (Stages > 3) {
31126 unsigned SadElems = SadVT.getVectorNumElements();
31128 for(unsigned i = Stages - 3; i > 0; --i) {
31129 SmallVector<int, 16> Mask(SadElems, -1);
31130 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
31131 Mask[j] = MaskEnd + j;
31133 SDValue Shuffle =
31134 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
31135 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
31136 }
31137 }
31139 MVT Type = Extract->getSimpleValueType(0);
31140 unsigned TypeSizeInBits = Type.getSizeInBits();
31141 // Return the lowest TypeSizeInBits bits.
31142 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
31143 SAD = DAG.getBitcast(ResVT, SAD);
31144 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
31145 Extract->getOperand(1));
31148 // Attempt to peek through a target shuffle and extract the scalar from the
31150 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
31151 TargetLowering::DAGCombinerInfo &DCI,
31152 const X86Subtarget &Subtarget) {
31153 if (DCI.isBeforeLegalizeOps())
31156 SDValue Src = N->getOperand(0);
31157 SDValue Idx = N->getOperand(1);
31159 EVT VT = N->getValueType(0);
31160 EVT SrcVT = Src.getValueType();
31161 EVT SrcSVT = SrcVT.getVectorElementType();
31162 unsigned NumSrcElts = SrcVT.getVectorNumElements();
31164 // Don't attempt this for boolean mask vectors or unknown extraction indices.
31165 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
31168 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
31169 if (X86ISD::VBROADCAST == Src.getOpcode() &&
31170 Src.getOperand(0).getValueType() == VT)
31171 return Src.getOperand(0);
31173 // Resolve the target shuffle inputs and mask.
31174 SmallVector<int, 16> Mask;
31175 SmallVector<SDValue, 2> Ops;
31176 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
31179 // Attempt to narrow/widen the shuffle mask to the correct size.
31180 if (Mask.size() != NumSrcElts) {
31181 if ((NumSrcElts % Mask.size()) == 0) {
31182 SmallVector<int, 16> ScaledMask;
31183 int Scale = NumSrcElts / Mask.size();
31184 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
31185 Mask = std::move(ScaledMask);
31186 } else if ((Mask.size() % NumSrcElts) == 0) {
31187 SmallVector<int, 16> WidenedMask;
31188 while (Mask.size() > NumSrcElts &&
31189 canWidenShuffleElements(Mask, WidenedMask))
31190 Mask = std::move(WidenedMask);
31191 // TODO - investigate support for wider shuffle masks with known upper
31192 // undef/zero elements for implicit zero-extension.
31196 // Check if narrowing/widening failed.
31197 if (Mask.size() != NumSrcElts)
31200 int SrcIdx = Mask[N->getConstantOperandVal(1)];
31201 SDLoc dl(N);
31203 // If the shuffle source element is undef/zero then we can just accept it.
31204 if (SrcIdx == SM_SentinelUndef)
31205 return DAG.getUNDEF(VT);
31207 if (SrcIdx == SM_SentinelZero)
31208 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
31209 : DAG.getConstant(0, dl, VT);
31211 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
31212 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
31213 SrcIdx = SrcIdx % Mask.size();
31215 // We can only extract other elements from 128-bit vectors and in certain
31216 // circumstances, depending on SSE-level.
31217 // TODO: Investigate using extract_subvector for larger vectors.
31218 // TODO: Investigate float/double extraction if it will be just stored.
31219 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
31220 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
31221 assert(SrcSVT == VT && "Unexpected extraction type");
31222 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
31223 DAG.getIntPtrConstant(SrcIdx, dl));
31226 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
31227 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
31228 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
31229 "Unexpected extraction type");
31230 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
31231 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
31232 DAG.getIntPtrConstant(SrcIdx, dl));
31233 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
31239 /// Detect vector gather/scatter index generation and convert it from being a
31240 /// bunch of shuffles and extracts into a somewhat faster sequence.
31241 /// For i686, the best sequence is apparently storing the value and loading
31242 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
31243 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
31244 TargetLowering::DAGCombinerInfo &DCI,
31245 const X86Subtarget &Subtarget) {
31246 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
31249 // TODO - Remove this once we can handle the implicit zero-extension of
31250 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
31251 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
31252 // combineBasicSADPattern.
31253 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31256 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
31259 SDValue InputVector = N->getOperand(0);
31260 SDValue EltIdx = N->getOperand(1);
31262 EVT SrcVT = InputVector.getValueType();
31263 EVT VT = N->getValueType(0);
31264 SDLoc dl(InputVector);
31266 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
31267 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31268 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
31269 SDValue MMXSrc = InputVector.getOperand(0);
31271 // The bitcast source is a direct mmx result.
31272 if (MMXSrc.getValueType() == MVT::x86mmx)
31273 return DAG.getBitcast(VT, InputVector);
31276 // Detect mmx to i32 conversion through a v2i32 elt extract.
31277 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31278 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
31279 SDValue MMXSrc = InputVector.getOperand(0);
31281 // The bitcast source is a direct mmx result.
31282 if (MMXSrc.getValueType() == MVT::x86mmx)
31283 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
31286 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
31287 isa<ConstantSDNode>(EltIdx) &&
31288 isa<ConstantSDNode>(InputVector.getOperand(0))) {
31289 uint64_t ExtractedElt = N->getConstantOperandVal(1);
31290 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
31291 const APInt &InputValue = InputC->getAPIntValue();
31292 uint64_t Res = InputValue[ExtractedElt];
31293 return DAG.getConstant(Res, dl, MVT::i1);
31296 // Check whether this extract is the root of a sum of absolute differences
31297 // pattern. This has to be done here because we really want it to happen
31298 // pre-legalization,
31299 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
31300 return SAD;
31302 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
31303 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
31304 return Cmp;
31306 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
31307 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
31308 return MinMax;
31310 return SDValue();
31311 }
31313 /// If a vector select has an operand that is -1 or 0, try to simplify the
31314 /// select to a bitwise logic operation.
31315 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
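/// For example (illustrative), with a sign-splat condition of matching width:
///   (vselect Cond, -1, 0) --> Cond
///   (vselect Cond, -1, X) --> (or Cond, X)
///   (vselect Cond, X, 0)  --> (and Cond, X)
///   (vselect Cond, 0, X)  --> (andnp Cond, X)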
31316 static SDValue
31317 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
31318 TargetLowering::DAGCombinerInfo &DCI,
31319 const X86Subtarget &Subtarget) {
31320 SDValue Cond = N->getOperand(0);
31321 SDValue LHS = N->getOperand(1);
31322 SDValue RHS = N->getOperand(2);
31323 EVT VT = LHS.getValueType();
31324 EVT CondVT = Cond.getValueType();
31325 SDLoc DL(N);
31326 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31328 if (N->getOpcode() != ISD::VSELECT)
31331 assert(CondVT.isVector() && "Vector select expects a vector selector!");
31333 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
31334 // Check if the first operand is all zeros and Cond type is vXi1.
31335 // This situation only applies to avx512.
31336 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
31337 CondVT.getVectorElementType() == MVT::i1) {
31338 // Invert the cond to not(cond) : xor(op,allones)=not(op)
31339 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
31340 DAG.getAllOnesConstant(DL, CondVT));
31341 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
31342 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
31345 // To use the condition operand as a bitwise mask, it must have elements that
31346 // are the same size as the select elements. Ie, the condition operand must
31347 // have already been promoted from the IR select condition type <N x i1>.
31348 // Don't check if the types themselves are equal because that excludes
31349 // vector floating-point selects.
31350 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
31353 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
31354 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
31356 // Try to invert the condition if true value is not all 1s and false value is
31358 if (!TValIsAllOnes && !FValIsAllZeros &&
31359 // Check if the selector will be produced by CMPP*/PCMP*.
31360 Cond.getOpcode() == ISD::SETCC &&
31361 // Check if SETCC has already been promoted.
31362 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
31363 CondVT) {
31364 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
31366 if (TValIsAllZeros || FValIsAllOnes) {
31367 SDValue CC = Cond.getOperand(2);
31368 ISD::CondCode NewCC =
31369 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
31370 Cond.getOperand(0).getValueType().isInteger());
31371 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
31372 NewCC);
31373 std::swap(LHS, RHS);
31374 TValIsAllOnes = FValIsAllOnes;
31375 FValIsAllZeros = TValIsAllZeros;
31379 // Cond value must be 'sign splat' to be converted to a logical op.
31380 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
31383 // vselect Cond, 111..., 000... -> Cond
31384 if (TValIsAllOnes && FValIsAllZeros)
31385 return DAG.getBitcast(VT, Cond);
31387 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
31390 // vselect Cond, 111..., X -> or Cond, X
31391 if (TValIsAllOnes) {
31392 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
31393 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
31394 return DAG.getBitcast(VT, Or);
31397 // vselect Cond, X, 000... -> and Cond, X
31398 if (FValIsAllZeros) {
31399 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
31400 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
31401 return DAG.getBitcast(VT, And);
31404 // vselect Cond, 000..., X -> andn Cond, X
31405 if (TValIsAllZeros) {
31406 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
31407 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
31408 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
31409 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
31410 return DAG.getBitcast(VT, AndN);
31416 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
31417 SDValue Cond = N->getOperand(0);
31418 SDValue LHS = N->getOperand(1);
31419 SDValue RHS = N->getOperand(2);
31422 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
31423 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
31424 if (!TrueC || !FalseC)
31427 // Don't do this for crazy integer types.
31428 EVT VT = N->getValueType(0);
31429 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31432 // We're going to use the condition bit in math or logic ops. We could allow
31433 // this with a wider condition value (post-legalization it becomes an i8),
31434 // but if nothing is creating selects that late, it doesn't matter.
31435 if (Cond.getValueType() != MVT::i1)
31438 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31439 // 3, 5, or 9 with i32/i64, so those get transformed too.
31440 // TODO: For constants that overflow or do not differ by power-of-2 or small
31441 // multiplier, convert to 'and' + 'add'.
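// For example (illustrative): select Cond, 7, 3 has AbsDiff == 4, so it becomes
//   (zext Cond) * 4 + 3
// i.e. a shift by 2 plus an add, with no CMOV required.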
31442 const APInt &TrueVal = TrueC->getAPIntValue();
31443 const APInt &FalseVal = FalseC->getAPIntValue();
31444 bool OV;
31445 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31446 if (OV)
31447 return SDValue();
31449 APInt AbsDiff = Diff.abs();
31450 if (AbsDiff.isPowerOf2() ||
31451 ((VT == MVT::i32 || VT == MVT::i64) &&
31452 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31453 SDLoc DL(N);
31454 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31455 // of the condition can usually be folded into a compare predicate, but even
31456 // without that, the sequence should be cheaper than a CMOV alternative.
31457 if (TrueVal.slt(FalseVal)) {
31458 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31459 std::swap(TrueC, FalseC);
31462 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31463 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31465 // Multiply condition by the difference if non-one.
31466 if (!AbsDiff.isOneValue())
31467 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31469 // Add the base if non-zero.
31470 if (!FalseC->isNullValue())
31471 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
31479 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31480 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31481 TargetLowering::DAGCombinerInfo &DCI,
31482 const X86Subtarget &Subtarget) {
31483 SDLoc DL(N);
31484 SDValue Cond = N->getOperand(0);
31485 // Get the LHS/RHS of the select.
31486 SDValue LHS = N->getOperand(1);
31487 SDValue RHS = N->getOperand(2);
31488 EVT VT = LHS.getValueType();
31489 EVT CondVT = Cond.getValueType();
31490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31492 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31493 // instructions match the semantics of the common C idiom x<y?x:y but not
31494 // x<=y?x:y, because of how they handle negative zero (which can be
31495 // ignored in unsafe-math mode).
31496 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31497 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31498 VT != MVT::f80 && VT != MVT::f128 &&
31499 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31500 (Subtarget.hasSSE2() ||
31501 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31502 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31504 unsigned Opcode = 0;
31505 // Check for x CC y ? x : y.
31506 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31507 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31511 // Converting this to a min would handle NaNs incorrectly, and swapping
31512 // the operands would cause it to handle comparisons between positive
31513 // and negative zero incorrectly.
31514 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31515 if (!DAG.getTarget().Options.UnsafeFPMath &&
31516 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31518 std::swap(LHS, RHS);
31520 Opcode = X86ISD::FMIN;
31523 // Converting this to a min would handle comparisons between positive
31524 // and negative zero incorrectly.
31525 if (!DAG.getTarget().Options.UnsafeFPMath &&
31526 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31528 Opcode = X86ISD::FMIN;
31531 // Converting this to a min would handle both negative zeros and NaNs
31532 // incorrectly, but we can swap the operands to fix both.
31533 std::swap(LHS, RHS);
31538 Opcode = X86ISD::FMIN;
31542 // Converting this to a max would handle comparisons between positive
31543 // and negative zero incorrectly.
31544 if (!DAG.getTarget().Options.UnsafeFPMath &&
31545 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31547 Opcode = X86ISD::FMAX;
31550 // Converting this to a max would handle NaNs incorrectly, and swapping
31551 // the operands would cause it to handle comparisons between positive
31552 // and negative zero incorrectly.
31553 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31554 if (!DAG.getTarget().Options.UnsafeFPMath &&
31555 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31557 std::swap(LHS, RHS);
31559 Opcode = X86ISD::FMAX;
31562 // Converting this to a max would handle both negative zeros and NaNs
31563 // incorrectly, but we can swap the operands to fix both.
31564 std::swap(LHS, RHS);
31569 Opcode = X86ISD::FMAX;
31572 // Check for x CC y ? y : x -- a min/max with reversed arms.
31573 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31574 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31578 // Converting this to a min would handle comparisons between positive
31579 // and negative zero incorrectly, and swapping the operands would
31580 // cause it to handle NaNs incorrectly.
31581 if (!DAG.getTarget().Options.UnsafeFPMath &&
31582 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31583 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31585 std::swap(LHS, RHS);
31587 Opcode = X86ISD::FMIN;
31590 // Converting this to a min would handle NaNs incorrectly.
31591 if (!DAG.getTarget().Options.UnsafeFPMath &&
31592 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31594 Opcode = X86ISD::FMIN;
31597 // Converting this to a min would handle both negative zeros and NaNs
31598 // incorrectly, but we can swap the operands to fix both.
31599 std::swap(LHS, RHS);
31604 Opcode = X86ISD::FMIN;
31608 // Converting this to a max would handle NaNs incorrectly.
31609 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31611 Opcode = X86ISD::FMAX;
31614 // Converting this to a max would handle comparisons between positive
31615 // and negative zero incorrectly, and swapping the operands would
31616 // cause it to handle NaNs incorrectly.
31617 if (!DAG.getTarget().Options.UnsafeFPMath &&
31618 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31619 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31621 std::swap(LHS, RHS);
31623 Opcode = X86ISD::FMAX;
31626 // Converting this to a max would handle both negative zeros and NaNs
31627 // incorrectly, but we can swap the operands to fix both.
31628 std::swap(LHS, RHS);
31633 Opcode = X86ISD::FMAX;
31639 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31642 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31643 // lowering on KNL. In this case we convert it to
31644 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
31645 // The same applies to all vectors of i8 and i16 elements without BWI.
31646 // Make sure we extend these even before type legalization gets a chance to
31647 // split wide vectors.
31648 // Since SKX these selects have a proper lowering.
31649 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
31650 CondVT.getVectorElementType() == MVT::i1 &&
31651 VT.getVectorNumElements() > 4 &&
31652 (VT.getVectorElementType() == MVT::i8 ||
31653 VT.getVectorElementType() == MVT::i16)) {
31654 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31655 DCI.AddToWorklist(Cond.getNode());
31656 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31659 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31662 // Canonicalize max and min:
31663 // (x > y) ? x : y -> (x >= y) ? x : y
31664 // (x < y) ? x : y -> (x <= y) ? x : y
31665 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31666 // the need for an extra compare
31667 // against zero. e.g.
31668 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
31670 // testl %edi, %edi
31672 // cmovgl %edi, %eax
31676 // cmovsl %eax, %edi
31677 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31678 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31679 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31680 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31681 switch (CC) {
31682 default: break;
31683 case ISD::SETLT:
31684 case ISD::SETGT: {
31685 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31686 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31687 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31688 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31693 // Early exit check
31694 if (!TLI.isTypeLegal(VT))
31697 // Match VSELECTs into subs with unsigned saturation.
31698 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31699 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31700 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31701 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31702 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31704 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31705 // left side invert the predicate to simplify logic below.
31706 SDValue Other;
31707 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31708 Other = RHS;
31709 CC = ISD::getSetCCInverse(CC, true);
31710 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31711 Other = LHS;
31712 }
31714 if (Other.getNode() && Other->getNumOperands() == 2 &&
31715 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31716 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31717 SDValue CondRHS = Cond->getOperand(1);
31719 // Look for a general sub with unsigned saturation first.
31720 // x >= y ? x-y : 0 --> subus x, y
31721 // x > y ? x-y : 0 --> subus x, y
31722 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31723 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31724 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31726 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31727 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31728 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31729 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31730 // If the RHS is a constant we have to reverse the const
31731 // canonicalization.
31732 // x > C-1 ? x+-C : 0 --> subus x, C
31733 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31734 CondRHSConst->getAPIntValue() ==
31735 (-OpRHSConst->getAPIntValue() - 1))
31736 return DAG.getNode(
31737 X86ISD::SUBUS, DL, VT, OpLHS,
31738 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31740 // Another special case: If C was a sign bit, the sub has been
31741 // canonicalized into a xor.
31742 // FIXME: Would it be better to use computeKnownBits to determine
31743 // whether it's safe to decanonicalize the xor?
31744 // x s< 0 ? x^C : 0 --> subus x, C
31745 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31746 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31747 OpRHSConst->getAPIntValue().isSignMask())
31748 // Note that we have to rebuild the RHS constant here to ensure we
31749 // don't rely on particular values of undef lanes.
31750 return DAG.getNode(
31751 X86ISD::SUBUS, DL, VT, OpLHS,
31752 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
31757 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31760 // If this is a *dynamic* select (non-constant condition) and we can match
31761 // this node with one of the variable blend instructions, restructure the
31762 // condition so that blends can use the high (sign) bit of each element and
31763 // use SimplifyDemandedBits to simplify the condition operand.
31764 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31765 !DCI.isBeforeLegalize() &&
31766 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31767 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31769 // Don't optimize vector selects that map to mask-registers.
31770 if (BitWidth == 1)
31771 return SDValue();
31773 // We can only handle the cases where VSELECT is directly legal on the
31774 // subtarget. We custom lower VSELECT nodes with constant conditions and
31775 // this makes it hard to see whether a dynamic VSELECT will correctly
31776 // lower, so we both check the operation's status and explicitly handle the
31777 // cases where a *dynamic* blend will fail even though a constant-condition
31778 // blend could be custom lowered.
31779 // FIXME: We should find a better way to handle this class of problems.
31780 // Potentially, we should combine constant-condition vselect nodes
31781 // pre-legalization into shuffles and not mark as many types as custom
31783 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31785 // FIXME: We don't support i16-element blends currently. We could and
31786 // should support them by making *all* the bits in the condition be set
31787 // rather than just the high bit and using an i8-element blend.
31788 if (VT.getVectorElementType() == MVT::i16)
31790 // Dynamic blending was only available from SSE4.1 onward.
31791 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31793 // Byte blends are only available in AVX2
31794 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31796 // There are no 512-bit blend instructions that use sign bits.
31797 if (VT.is512BitVector())
31800 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31801 APInt DemandedMask(APInt::getSignMask(BitWidth));
31802 KnownBits Known;
31803 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31804 !DCI.isBeforeLegalizeOps());
31805 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31806 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31807 // If we changed the computation somewhere in the DAG, this change will
31808 // affect all users of Cond. Make sure it is fine and update all the nodes
31809 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31810 // perform wrong optimizations as we messed with the actual expectation
31811 // for the vector boolean values.
31812 if (Cond != TLO.Old) {
31813 // Check all uses of the condition operand to check whether it will be
31814 // consumed by non-BLEND instructions. Those may require that all bits
31815 // are set properly.
31816 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
31817 UI != UE; ++UI)
31818 // TODO: Add other opcodes eventually lowered into BLEND.
31819 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
31820 return SDValue();
31823 // Update all users of the condition before committing the change, so
31824 // that the VSELECT optimizations that expect the correct vector boolean
31825 // value will not be triggered.
31826 for (SDNode *U : Cond->uses()) {
31827 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31828 U->getValueType(0), Cond, U->getOperand(1),
31829 U->getOperand(2));
31830 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31832 DCI.CommitTargetLoweringOpt(TLO);
31835 // Only Cond (rather than other nodes in the computation chain) was
31836 // changed. Change the condition just for N to keep the opportunity to
31837 // optimize all other users their own way.
31838 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31839 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31844 // Custom action for SELECT MMX
31845 if (VT == MVT::x86mmx) {
31846 LHS = DAG.getBitcast(MVT::i64, LHS);
31847 RHS = DAG.getBitcast(MVT::i64, RHS);
31848 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31849 return DAG.getBitcast(VT, newSelect);
31850 }
31852 return SDValue();
31853 }
31855 /// Combine:
31856 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31857 /// to:
31858 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31859 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31860 /// Note that this is only legal for some op/cc combinations.
31861 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31863 const X86Subtarget &Subtarget) {
31864 // This combine only operates on CMP-like nodes.
31865 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31866 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31869 // Can't replace the cmp if it has more uses than the one we're looking at.
31870 // FIXME: We would like to be able to handle this, but would need to make sure
31871 // all uses were updated.
31872 if (!Cmp.hasOneUse())
31875 // This only applies to variations of the common case:
31876 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31877 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31878 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31879 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31880 // Using the proper condcodes (see below), overflow is checked for.
31882 // FIXME: We can generalize both constraints:
31883 // - XOR/OR/AND (if they were made to survive AtomicExpand)
31885 // if the result is compared.
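// For example (illustrative): if %old = atomicrmw add i64* %p, i64 1 is used
// only by (icmp slt %old, 0), the branch can test the EFLAGS of the "lock add"
// itself with COND_LE, and the old value never has to be materialized.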
31887 SDValue CmpLHS = Cmp.getOperand(0);
31888 SDValue CmpRHS = Cmp.getOperand(1);
31890 if (!CmpLHS.hasOneUse())
31893 unsigned Opc = CmpLHS.getOpcode();
31894 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
31897 SDValue OpRHS = CmpLHS.getOperand(2);
31898 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
31902 APInt Addend = OpRHSC->getAPIntValue();
31903 if (Opc == ISD::ATOMIC_LOAD_SUB)
31906 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
31910 APInt Comparison = CmpRHSC->getAPIntValue();
31912 // If the addend is the negation of the comparison value, then we can do
31913 // a full comparison by emitting the atomic arithmetic as a locked sub.
31914 if (Comparison == -Addend) {
31915 // The CC is fine, but we need to rewrite the LHS of the comparison as an
31917 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
31918 auto AtomicSub = DAG.getAtomic(
31919 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
31920 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
31921 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
31922 AN->getMemOperand());
31923 // If the comparison uses the CF flag we can't use INC/DEC instructions.
31924 bool NeedCF = false;
31925 switch (CC) {
31926 default: break;
31927 case X86::COND_A: case X86::COND_AE:
31928 case X86::COND_B: case X86::COND_BE:
31929 NeedCF = true;
31930 break;
31931 }
31932 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
31933 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31934 DAG.getUNDEF(CmpLHS.getValueType()));
31935 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31939 // We can handle comparisons with zero in a number of cases by manipulating
31940 // the CC used.
31941 if (!Comparison.isNullValue())
31942 return SDValue();
31944 if (CC == X86::COND_S && Addend == 1)
31945 CC = X86::COND_LE;
31946 else if (CC == X86::COND_NS && Addend == 1)
31947 CC = X86::COND_G;
31948 else if (CC == X86::COND_G && Addend == -1)
31949 CC = X86::COND_GE;
31950 else if (CC == X86::COND_LE && Addend == -1)
31951 CC = X86::COND_L;
31952 else
31953 return SDValue();
31955 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
31956 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31957 DAG.getUNDEF(CmpLHS.getValueType()));
31958 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31962 // Check whether a boolean test is testing a boolean value generated by
31963 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
31966 // Simplify the following patterns:
31967 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
31968 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
31969 // to (Op EFLAGS Cond)
31971 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
31972 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
31973 // to (Op EFLAGS !Cond)
31975 // where Op could be BRCOND or CMOV.
31977 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
31978 // This combine only operates on CMP-like nodes.
31979 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31980 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31983 // Quit if not used as a boolean value.
31984 if (CC != X86::COND_E && CC != X86::COND_NE)
31987 // Check CMP operands. One of them should be 0 or 1 and the other should be
31988 // an SetCC or extended from it.
31989 SDValue Op1 = Cmp.getOperand(0);
31990 SDValue Op2 = Cmp.getOperand(1);
31992 SDValue SetCC;
31993 const ConstantSDNode* C = nullptr;
31994 bool needOppositeCond = (CC == X86::COND_E);
31995 bool checkAgainstTrue = false; // Is it a comparison against 1?
31997 if ((C = dyn_cast<ConstantSDNode>(Op1)))
31998 SetCC = Op2;
31999 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
32000 SetCC = Op1;
32001 else // Quit if all operands are not constants.
32002 return SDValue();
32004 if (C->getZExtValue() == 1) {
32005 needOppositeCond = !needOppositeCond;
32006 checkAgainstTrue = true;
32007 } else if (C->getZExtValue() != 0)
32008 // Quit if the constant is neither 0 nor 1.
32009 return SDValue();
32011 bool truncatedToBoolWithAnd = false;
32012 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
32013 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
32014 SetCC.getOpcode() == ISD::TRUNCATE ||
32015 SetCC.getOpcode() == ISD::AND) {
32016 if (SetCC.getOpcode() == ISD::AND) {
32017 int OpIdx = -1;
32018 if (isOneConstant(SetCC.getOperand(0)))
32019 OpIdx = 1;
32020 if (isOneConstant(SetCC.getOperand(1)))
32021 OpIdx = 0;
32022 if (OpIdx < 0)
32023 break;
32024 SetCC = SetCC.getOperand(OpIdx);
32025 truncatedToBoolWithAnd = true;
32026 } else
32027 SetCC = SetCC.getOperand(0);
32028 }
32030 switch (SetCC.getOpcode()) {
32031 case X86ISD::SETCC_CARRY:
32032 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
32033 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
32034 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
32035 // truncated to i1 using 'and'.
32036 if (checkAgainstTrue && !truncatedToBoolWithAnd)
32037 break;
32038 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
32039 "Invalid use of SETCC_CARRY!");
32040 LLVM_FALLTHROUGH;
32041 case X86ISD::SETCC:
32042 // Set the condition code or opposite one if necessary.
32043 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
32044 if (needOppositeCond)
32045 CC = X86::GetOppositeBranchCondition(CC);
32046 return SetCC.getOperand(1);
32047 case X86ISD::CMOV: {
32048 // Check whether false/true value has canonical one, i.e. 0 or 1.
32049 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
32050 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
32051 // Quit if true value is not a constant.
32052 if (!TVal)
32053 return SDValue();
32054 // Quit if false value is not a constant.
32055 if (!FVal) {
32056 SDValue Op = SetCC.getOperand(0);
32057 // Skip 'zext' or 'trunc' node.
32058 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
32059 Op.getOpcode() == ISD::TRUNCATE)
32060 Op = Op.getOperand(0);
32061 // A special case for rdrand/rdseed, where 0 is set if false cond is
32063 if ((Op.getOpcode() != X86ISD::RDRAND &&
32064 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
32067 // Quit if false value is not the constant 0 or 1.
32068 bool FValIsFalse = true;
32069 if (FVal && FVal->getZExtValue() != 0) {
32070 if (FVal->getZExtValue() != 1)
32071 return SDValue();
32072 // If FVal is 1, opposite cond is needed.
32073 needOppositeCond = !needOppositeCond;
32074 FValIsFalse = false;
32075 }
32076 // Quit if TVal is not the constant opposite of FVal.
32077 if (FValIsFalse && TVal->getZExtValue() != 1)
32078 return SDValue();
32079 if (!FValIsFalse && TVal->getZExtValue() != 0)
32080 return SDValue();
32081 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
32082 if (needOppositeCond)
32083 CC = X86::GetOppositeBranchCondition(CC);
32084 return SetCC.getOperand(3);
32091 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
32093 /// (X86or (X86setcc) (X86setcc))
32094 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
32095 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
32096 X86::CondCode &CC1, SDValue &Flags,
32098 if (Cond->getOpcode() == X86ISD::CMP) {
32099 if (!isNullConstant(Cond->getOperand(1)))
32102 Cond = Cond->getOperand(0);
32107 SDValue SetCC0, SetCC1;
32108 switch (Cond->getOpcode()) {
32109 default: return false;
32116 SetCC0 = Cond->getOperand(0);
32117 SetCC1 = Cond->getOperand(1);
32121 // Make sure we have SETCC nodes, using the same flags value.
32122 if (SetCC0.getOpcode() != X86ISD::SETCC ||
32123 SetCC1.getOpcode() != X86ISD::SETCC ||
32124 SetCC0->getOperand(1) != SetCC1->getOperand(1))
32127 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
32128 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
32129 Flags = SetCC0->getOperand(1);
32133 // When legalizing carry, we create carries via add X, -1
// If that comes from an actual carry, via setcc, we use the
// carry directly.
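// For example (illustrative): if EFLAGS is
//   (X86ISD::ADD (and (X86ISD::SETCC COND_B, Flags), 1), -1)
// then peeling the and/extend wrappers exposes the SETCC, and since its
// condition is COND_B the original Flags value can be returned and reused
// directly instead of rematerializing the carry.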
32136 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
32137 if (EFLAGS.getOpcode() == X86ISD::ADD) {
32138 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
32139 SDValue Carry = EFLAGS.getOperand(0);
32140 while (Carry.getOpcode() == ISD::TRUNCATE ||
32141 Carry.getOpcode() == ISD::ZERO_EXTEND ||
32142 Carry.getOpcode() == ISD::SIGN_EXTEND ||
32143 Carry.getOpcode() == ISD::ANY_EXTEND ||
32144 (Carry.getOpcode() == ISD::AND &&
32145 isOneConstant(Carry.getOperand(1))))
32146 Carry = Carry.getOperand(0);
32147 if (Carry.getOpcode() == X86ISD::SETCC ||
32148 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
32149 if (Carry.getConstantOperandVal(0) == X86::COND_B)
32150 return Carry.getOperand(1);
32158 /// Optimize an EFLAGS definition used according to the condition code \p CC
32159 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
32160 /// uses of chain values.
32161 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
32163 const X86Subtarget &Subtarget) {
32164 if (CC == X86::COND_B)
    if (SDValue Flags = combineCarryThroughADD(EFLAGS))
      return Flags;
  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;
32170 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
32173 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
32174 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
32175 TargetLowering::DAGCombinerInfo &DCI,
32176 const X86Subtarget &Subtarget) {
32179 SDValue FalseOp = N->getOperand(0);
32180 SDValue TrueOp = N->getOperand(1);
32181 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
32182 SDValue Cond = N->getOperand(3);
32184 // Try to simplify the EFLAGS and condition code operands.
32185 // We can't always do this as FCMOV only supports a subset of X86 cond.
32186 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
32187 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
32188 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
32190 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32194 // If this is a select between two integer constants, try to do some
32195 // optimizations. Note that the operands are ordered the opposite of SELECT
32197 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
32198 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
32199 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
32200 // larger than FalseC (the false value).
32201 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
32202 CC = X86::GetOppositeBranchCondition(CC);
32203 std::swap(TrueC, FalseC);
32204 std::swap(TrueOp, FalseOp);
32207 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
        // This is efficient for any integer data type (including i8/i16) and
        // any shift amount.
32210 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
32211 Cond = getSETCC(CC, Cond, DL, DAG);
32213 // Zero extend the condition if needed.
32214 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
32216 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
32217 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
32218 DAG.getConstant(ShAmt, DL, MVT::i8));
        // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
32223 // for any integer data type, including i8/i16.
32224 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
32225 Cond = getSETCC(CC, Cond, DL, DAG);
32227 // Zero extend the condition if needed.
32228 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
32229 FalseC->getValueType(0), Cond);
32230 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32231 SDValue(FalseC, 0));
32235 // Optimize cases that will turn into an LEA instruction. This requires
32236 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
32237 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
32238 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
32239 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
32241 bool isFastMultiplier = false;
32243 switch ((unsigned char)Diff) {
32245 case 1: // result = add base, cond
32246 case 2: // result = lea base( , cond*2)
32247 case 3: // result = lea base(cond, cond*2)
32248 case 4: // result = lea base( , cond*4)
32249 case 5: // result = lea base(cond, cond*4)
32250 case 8: // result = lea base( , cond*8)
32251 case 9: // result = lea base(cond, cond*8)
32252 isFastMultiplier = true;
32257 if (isFastMultiplier) {
32258 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
32259 Cond = getSETCC(CC, Cond, DL ,DAG);
32260 // Zero extend the condition if needed.
32261 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
32263 // Scale the condition by the difference.
32265 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
32266 DAG.getConstant(Diff, DL, Cond.getValueType()));
32268 // Add the base if non-zero.
32269 if (FalseC->getAPIntValue() != 0)
32270 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32271 SDValue(FalseC, 0));
32278 // Handle these cases:
32279 // (select (x != c), e, c) -> select (x != c), e, x),
32280 // (select (x == c), c, e) -> select (x == c), x, e)
32281 // where the c is an integer constant, and the "select" is the combination
32282 // of CMOV and CMP.
32284 // The rationale for this change is that the conditional-move from a constant
32285 // needs two instructions, however, conditional-move from a register needs
32286 // only one instruction.
32288 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
32289 // some instruction-combining opportunities. This opt needs to be
32290 // postponed as late as possible.
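  // For example (illustrative): in 'y = (x != 42) ? e : 42' the false arm is
  // reached only when x == 42, so it can be rewritten to use x itself and the
  // CMOV reads a register instead of requiring a separate 'mov $42, reg'.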
32292 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
32293 // the DCI.xxxx conditions are provided to postpone the optimization as
32294 // late as possible.
32296 ConstantSDNode *CmpAgainst = nullptr;
32297 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
32298 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
32299 !isa<ConstantSDNode>(Cond.getOperand(0))) {
32301 if (CC == X86::COND_NE &&
32302 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
32303 CC = X86::GetOppositeBranchCondition(CC);
32304 std::swap(TrueOp, FalseOp);
32307 if (CC == X86::COND_E &&
32308 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
32309 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
32310 DAG.getConstant(CC, DL, MVT::i8), Cond };
32311 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32316 // Fold and/or of setcc's to double CMOV:
32317 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
32318 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
  // This combine lets us generate:
  //   cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
  // When we can use CMOV, or when there is no mispredict, this improves
  // throughput and reduces register pressure.
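  // For example (illustrative), with both conditions keyed off one CMP:
  //   (CMOV F, T, ((COND_L | COND_E) != 0))
  // becomes
  //   t1 = CMOV F, T, COND_L
  //   t2 = CMOV t1, T, COND_E
  // i.e. two flag-reusing conditional moves rather than two SETCCs, an OR,
  // and a final test.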
32333 if (CC == X86::COND_NE) {
32335 X86::CondCode CC0, CC1;
32337 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
32339 std::swap(FalseOp, TrueOp);
32340 CC0 = X86::GetOppositeBranchCondition(CC0);
32341 CC1 = X86::GetOppositeBranchCondition(CC1);
32344 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32346 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32347 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32348 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32356 /// Different mul shrinking modes.
32357 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
32359 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32360 EVT VT = N->getOperand(0).getValueType();
32361 if (VT.getScalarSizeInBits() != 32)
32364 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
32365 unsigned SignBits[2] = {1, 1};
32366 bool IsPositive[2] = {false, false};
32367 for (unsigned i = 0; i < 2; i++) {
32368 SDValue Opd = N->getOperand(i);
32370 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
32371 // compute signbits for it separately.
    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
      // For anyextend, it is safe to assume an appropriate number of leading
      // sign/zero bits.
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
        SignBits[i] = 25;
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
               MVT::i16)
        SignBits[i] = 17;
      else
        return false;
      IsPositive[i] = true;
32383 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32384 // All the operands of BUILD_VECTOR need to be int constant.
32385 // Find the smallest value range which all the operands belong to.
      SignBits[i] = 32;
      IsPositive[i] = true;
32388 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32389 if (SubOp.isUndef())
32391 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32394 APInt IntVal = CN->getAPIntValue();
32395 if (IntVal.isNegative())
32396 IsPositive[i] = false;
32397 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32400 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32401 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32402 IsPositive[i] = true;
32406 bool AllPositive = IsPositive[0] && IsPositive[1];
32407 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
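  // Threshold arithmetic (illustrative): a 32-bit element with at least 25
  // sign bits spans at most 8 significant bits (32 - 25 + 1), i.e. the i8
  // range; a non-negative element with at least 24 sign bits has its top 24
  // bits clear, i.e. the u8 range. The 17/16 thresholds below are the
  // corresponding i16/u16 cases.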
  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25)
    Mode = MULS8;
  // When ranges are from 0 ~ 255, use MULU8 mode.
  else if (AllPositive && MinSignBits >= 24)
    Mode = MULU8;
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  else if (MinSignBits >= 17)
    Mode = MULS16;
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  else if (AllPositive && MinSignBits >= 16)
    Mode = MULU16;
  else
    return false;
  return true;
}
32425 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of the mul may be shrunk to generate more
32427 /// efficient code. Two typical patterns are handled:
32429 /// %2 = sext/zext <N x i8> %1 to <N x i32>
32430 /// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32432 /// %5 = mul <N x i32> %2, %4
32435 /// %2 = zext/sext <N x i16> %1 to <N x i32>
32436 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32437 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32438 /// %5 = mul <N x i32> %2, %4
32440 /// There are four mul shrinking modes:
/// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
/// -128 to 127, and the scalar value range of %4 is also -128 to 127,
32443 /// generate pmullw+sext32 for it (MULS8 mode).
32444 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32445 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32446 /// generate pmullw+zext32 for it (MULU8 mode).
32447 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32448 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32449 /// generate pmullw+pmulhw for it (MULS16 mode).
32450 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32451 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32452 /// generate pmullw+pmulhuw for it (MULU16 mode).
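/// For example (illustrative): a <8 x i32> mul whose operands were sign
/// extended from <8 x i8> has operands in [-128, 127], so each 32-bit product
/// fits in 16 bits; it can be computed with a single <8 x i16> pmullw and the
/// result sign extended back to <8 x i32> (MULS8 mode), avoiding pmulld.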
32453 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32454 const X86Subtarget &Subtarget) {
32455 // Check for legality
  // pmullw/pmulhw require SSE2.
32457 if (!Subtarget.hasSSE2())
32460 // Check for profitability
32461 // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower
  // than the pmullw+pmulhw sequence.
32464 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
32465 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32469 if (!canReduceVMulWidth(N, DAG, Mode))
32473 SDValue N0 = N->getOperand(0);
32474 SDValue N1 = N->getOperand(1);
32475 EVT VT = N->getOperand(0).getValueType();
32476 unsigned NumElts = VT.getVectorNumElements();
32477 if ((NumElts % 2) != 0)
32480 unsigned RegSize = 128;
32481 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32482 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32484 // Shrink the operands of mul.
32485 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32486 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
32488 if (NumElts >= OpsVT.getVectorNumElements()) {
32489 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32490 // lower part is needed.
32491 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32492 if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    }
32496 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32497 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32498 // the higher part is also needed.
32499 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32500 ReducedVT, NewN0, NewN1);
32502 // Repack the lower part and higher part result of mul into a wider
32504 // Generate shuffle functioning as punpcklwd.
32505 SmallVector<int, 16> ShuffleMask(NumElts);
32506 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32507 ShuffleMask[2 * i] = i;
32508 ShuffleMask[2 * i + 1] = i + NumElts;
    SDValue ResLo =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32512 ResLo = DAG.getBitcast(ResVT, ResLo);
32513 // Generate shuffle functioning as punpckhwd.
32514 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32515 ShuffleMask[2 * i] = i + NumElts / 2;
32516 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
    SDValue ResHi =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32520 ResHi = DAG.getBitcast(ResVT, ResHi);
32521 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
32524 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
32525 // to legalize the mul explicitly because implicit legalization for type
32526 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
32527 // instructions which will not exist when we explicitly legalize it by
32528 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
32529 // <4 x i16> undef).
32531 // Legalize the operands of mul.
32532 // FIXME: We may be able to handle non-concatenated vectors by insertion.
32533 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
32534 if ((RegSize % ReducedSizeInBits) != 0)
32537 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
32538 DAG.getUNDEF(ReducedVT));
  Ops[0] = NewN0;
  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
  Ops[0] = NewN1;
  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32544 if (Mode == MULU8 || Mode == MULS8) {
32545 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
32547 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32549 // convert the type of mul result to VT.
32550 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32551 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                  : ISD::SIGN_EXTEND_VECTOR_INREG,
                              DL, ResVT, Mul);
32554 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32555 DAG.getIntPtrConstant(0, DL));
32557 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
32558 // MULU16/MULS16, both parts are needed.
32559 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32560 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32561 OpsVT, NewN0, NewN1);
32563 // Repack the lower part and higher part result of mul into a wider
32564 // result. Make sure the type of mul result is VT.
32565 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32566 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
32567 Res = DAG.getBitcast(ResVT, Res);
32568 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32569 DAG.getIntPtrConstant(0, DL));
32574 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
32575 EVT VT, const SDLoc &DL) {
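  // For example (illustrative): for MulAmt == 11, combineMulShlAddOrSub(5, 1,
  // /*isAdd*/ true) below builds
  //   t1 = x * 5       (MUL_IMM, selectable as an LEA)
  //   t2 = t1 << 1     (x * 10)
  //   t3 = t2 + x      (x * 11)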
32577 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
32578 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32579 DAG.getConstant(Mult, DL, VT));
32580 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
32581 DAG.getConstant(Shift, DL, MVT::i8));
32582 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32587 auto combineMulMulAddOrSub = [&](bool isAdd) {
32588 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32589 DAG.getConstant(9, DL, VT));
32590 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
32591 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32600 // mul x, 11 => add ((shl (mul x, 5), 1), x)
32601 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
32603 // mul x, 21 => add ((shl (mul x, 5), 2), x)
32604 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
32606 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
32607 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32608 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
32610 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
32611 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
32613 // mul x, 13 => add ((shl (mul x, 3), 2), x)
32614 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
32616 // mul x, 13 => sub ((shl (mul x, 3), 3), x)
32617 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
32619 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
32620 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32621 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
32623 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
32624 return combineMulMulAddOrSub(/*isAdd*/ false);
32626 // mul x, 28 => add ((mul (mul x, 9), 3), x)
32627 return combineMulMulAddOrSub(/*isAdd*/ true);
32629 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
32630 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32631 combineMulMulAddOrSub(/*isAdd*/ true));
32633 // mul x, 30 => sub (sub ((shl x, 5), x), x)
32634 return DAG.getNode(
32636 DAG.getNode(ISD::SUB, DL, VT,
32637 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32638 DAG.getConstant(5, DL, MVT::i8)),
32645 /// Optimize a single multiply with constant into two operations in order to
32646 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
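/// For example (illustrative): 'mul x, 45' can be split as 45 = 9 * 5, giving
///   lea r, [x + 4*x]   ; x * 5
///   lea r, [r + 8*r]   ; (x * 5) * 9 = x * 45
/// and 'mul x, 40' as 40 = 5 * 8, i.e. an LEA followed by a 3-bit shift.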
32647 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
32648 TargetLowering::DAGCombinerInfo &DCI,
32649 const X86Subtarget &Subtarget) {
32650 EVT VT = N->getValueType(0);
32652 // If the upper 17 bits of each element are zero then we can use PMADDWD,
  // which is always at least as quick as PMULLD, except on KNL.
32654 if (Subtarget.getProcFamily() != X86Subtarget::IntelKNL &&
32655 ((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
32656 (VT == MVT::v8i32 && Subtarget.hasAVX2()) ||
32657 (VT == MVT::v16i32 && Subtarget.hasBWI()))) {
32658 SDValue N0 = N->getOperand(0);
32659 SDValue N1 = N->getOperand(1);
32660 APInt Mask17 = APInt::getHighBitsSet(32, 17);
32661 if (DAG.MaskedValueIsZero(N0, Mask17) &&
32662 DAG.MaskedValueIsZero(N1, Mask17)) {
32663 unsigned NumElts = VT.getVectorNumElements();
32664 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * NumElts);
32665 return DAG.getNode(X86ISD::VPMADDWD, SDLoc(N), VT,
32666 DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1));
32670 if (DCI.isBeforeLegalize() && VT.isVector())
32671 return reduceVMULWidth(N, DAG, Subtarget);
32673 if (!MulConstantOptimization)
32675 // An imul is usually smaller than the alternative sequence.
32676 if (DAG.getMachineFunction().getFunction().optForMinSize())
32679 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
32682 if (VT != MVT::i64 && VT != MVT::i32)
32685 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();
32692 uint64_t MulAmt1 = 0;
32693 uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

  SDLoc DL(N);
  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
32710 if (isPowerOf2_64(MulAmt2) &&
32711 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is a power of 2, issue it first. We want the
      // multiply by 3, 5, or 9 to be folded into the addressing mode unless
      // the lone use is an ADD.
32715 std::swap(MulAmt1, MulAmt2);
32717 if (isPowerOf2_64(MulAmt1))
32718 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32719 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
32721 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32722 DAG.getConstant(MulAmt1, DL, VT));
32724 if (isPowerOf2_64(MulAmt2))
32725 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
32726 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
32728 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
32729 DAG.getConstant(MulAmt2, DL, VT));
32730 } else if (!Subtarget.slowLEA())
32731 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
32734 assert(MulAmt != 0 &&
32735 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
32736 "Both cases that could cause potential overflows should have "
32737 "already been handled.");
32738 int64_t SignMulAmt = C->getSExtValue();
32739 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
32740 (SignMulAmt != -INT64_MAX)) {
32741 int NumSign = SignMulAmt > 0 ? 1 : -1;
32742 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
32743 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
32744 if (IsPowerOf2_64PlusOne) {
32745 // (mul x, 2^N + 1) => (add (shl x, N), x)
32746 NewMul = DAG.getNode(
32747 ISD::ADD, DL, VT, N->getOperand(0),
32748 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32749 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
32751 } else if (IsPowerOf2_64MinusOne) {
32752 // (mul x, 2^N - 1) => (sub (shl x, N), x)
32753 NewMul = DAG.getNode(
32755 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32756 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
32760 // To negate, subtract the number from zero
32761 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
32763 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
  if (NewMul)
    // Do not add new nodes to the DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);
32774 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
32775 SDValue N0 = N->getOperand(0);
32776 SDValue N1 = N->getOperand(1);
32777 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
32778 EVT VT = N0.getValueType();
32780 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
32781 // since the result of setcc_c is all zero's or all ones.
32782 if (VT.isInteger() && !VT.isVector() &&
32783 N1C && N0.getOpcode() == ISD::AND &&
32784 N0.getOperand(1).getOpcode() == ISD::Constant) {
32785 SDValue N00 = N0.getOperand(0);
32786 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
32787 Mask <<= N1C->getAPIntValue();
32788 bool MaskOK = false;
32789 // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
32792 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
32793 // of the underlying setcc_c operation if the setcc_c was zero extended.
32794 // Consider the following example:
32795 // zext(setcc_c) -> i32 0x0000FFFF
32796 // c1 -> i32 0x0000FFFF
32797 // c2 -> i32 0x00000001
32798 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
32799 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
32805 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
32806 N00.getOpcode() == ISD::ANY_EXTEND) &&
32807 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32808 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
32810 if (MaskOK && Mask != 0) {
32812 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
32816 // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
  // SHL.
32819 // (shl V, 1) -> add V,V
32820 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
32821 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
32822 assert(N0.getValueType().isVector() && "Invalid vector shift type");
32823 // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an
      // ADD of two values.
32826 if (N1SplatC->getAPIntValue() == 1)
32827 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
32833 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
32834 SDValue N0 = N->getOperand(0);
32835 SDValue N1 = N->getOperand(1);
32836 EVT VT = N0.getValueType();
32837 unsigned Size = VT.getSizeInBits();
32839 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
32840 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
32841 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
32842 // depending on sign of (SarConst - [56,48,32,24,16])
32844 // sexts in X86 are MOVs. The MOVs have the same code size
32845 // as above SHIFTs (only SHIFT on 1 has lower code size).
32846 // However the MOVs have 2 advantages to a SHIFT:
32847 // 1. MOVs can write to a register that differs from source
32848 // 2. MOVs accept memory operands
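  // For example (illustrative), for i32 x with ShlConst == 24 and SarConst == 26:
  //   (ashr (shl x, 24), 26) == (ashr (sext_inreg x, i8), 2)
  // so the shl/sar pair becomes a movsx of the low byte followed by a 2-bit
  // arithmetic shift (SarConst - 24 == 2 >= 0, the SRA case below).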
32850 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
32851 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
32852 N0.getOperand(1).getOpcode() != ISD::Constant)
32855 SDValue N00 = N0.getOperand(0);
32856 SDValue N01 = N0.getOperand(1);
32857 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
32858 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
32859 EVT CVT = N1.getValueType();
32861 if (SarConst.isNegative())
32864 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
32865 unsigned ShiftSize = SVT.getSizeInBits();
32866 // skipping types without corresponding sext/zext and
32867 // ShlConst that is not one of [56,48,32,24,16]
    if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
32877 return DAG.getNode(ISD::SHL, DL, VT, NN,
32878 DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
32881 DAG.getConstant(SarConst, DL, CVT));
32886 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
32887 SDValue N0 = N->getOperand(0);
32888 SDValue N1 = N->getOperand(1);
32889 EVT VT = N0.getValueType();
32891 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
32892 // TODO: This is a generic DAG combine that became an x86-only combine to
32893 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
32894 // and-not ('andn').
32895 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
32898 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
32899 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32900 if (!ShiftC || !AndC)
32903 // If we can shrink the constant mask below 8-bits or 32-bits, then this
32904 // transform should reduce code size. It may also enable secondary transforms
32905 // from improved known-bits analysis or instruction selection.
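  // For example (illustrative), for an i64 value X:
  //   srl (and X, 0xFF00000000), 32  -->  and (srl X, 32), 0xFF
  // turns a mask that would need a 64-bit immediate (movabs) into one that
  // fits in an 8-bit immediate.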
32906 APInt MaskVal = AndC->getAPIntValue();
32908 // If this can be matched by a zero extend, don't optimize.
32909 if (MaskVal.isMask()) {
32910 unsigned TO = MaskVal.countTrailingOnes();
32911 if (TO >= 8 && isPowerOf2_32(TO))
32915 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
32916 unsigned OldMaskSize = MaskVal.getMinSignedBits();
32917 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
32918 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
32919 (OldMaskSize > 32 && NewMaskSize <= 32)) {
32920 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
32922 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
32923 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
32924 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
32929 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
32930 TargetLowering::DAGCombinerInfo &DCI,
32931 const X86Subtarget &Subtarget) {
32932 if (N->getOpcode() == ISD::SHL)
32933 if (SDValue V = combineShiftLeft(N, DAG))
32936 if (N->getOpcode() == ISD::SRA)
32937 if (SDValue V = combineShiftRightArithmetic(N, DAG))
32940 if (N->getOpcode() == ISD::SRL)
32941 if (SDValue V = combineShiftRightLogical(N, DAG))
32947 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
32948 TargetLowering::DAGCombinerInfo &DCI,
32949 const X86Subtarget &Subtarget) {
32950 unsigned Opcode = N->getOpcode();
32951 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
         "Unexpected pack opcode");
32954 EVT VT = N->getValueType(0);
32955 SDValue N0 = N->getOperand(0);
32956 SDValue N1 = N->getOperand(1);
32957 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
32958 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
32959 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
32960 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
32961 "Unexpected PACKSS/PACKUS input type");
32963 // Constant Folding.
32964 APInt UndefElts0, UndefElts1;
32965 SmallVector<APInt, 32> EltBits0, EltBits1;
32966 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
32967 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
32968 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
32969 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
32970 unsigned NumLanes = VT.getSizeInBits() / 128;
32971 unsigned NumDstElts = VT.getVectorNumElements();
32972 unsigned NumSrcElts = NumDstElts / 2;
32973 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
32974 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
32975 bool IsSigned = (X86ISD::PACKSS == Opcode);
32977 APInt Undefs(NumDstElts, 0);
32978 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
32979 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
32980 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
32981 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
32982 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
32983 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
32985 if (UndefElts[SrcIdx]) {
32986 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
32990 APInt &Val = EltBits[SrcIdx];
        if (IsSigned) {
          // PACKSS: Truncate signed value with signed saturation.
32993 // Source values less than dst minint are saturated to minint.
32994 // Source values greater than dst maxint are saturated to maxint.
32995 if (Val.isSignedIntN(DstBitsPerElt))
32996 Val = Val.trunc(DstBitsPerElt);
32997 else if (Val.isNegative())
32998 Val = APInt::getSignedMinValue(DstBitsPerElt);
          else
            Val = APInt::getSignedMaxValue(DstBitsPerElt);
        } else {
          // PACKUS: Truncate signed value with unsigned saturation.
33003 // Source values less than zero are saturated to zero.
33004 // Source values greater than dst maxuint are saturated to maxuint.
33005 if (Val.isIntN(DstBitsPerElt))
33006 Val = Val.trunc(DstBitsPerElt);
33007 else if (Val.isNegative())
33008 Val = APInt::getNullValue(DstBitsPerElt);
          else
            Val = APInt::getAllOnesValue(DstBitsPerElt);
        }
        Bits[Lane * NumDstEltsPerLane + Elt] = Val;
33016 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
33019 // Attempt to combine as shuffle.
33021 if (SDValue Res = combineX86ShufflesRecursively(
33022 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33023 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33024 DCI.CombineTo(N, Res);
33031 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
33032 TargetLowering::DAGCombinerInfo &DCI,
33033 const X86Subtarget &Subtarget) {
33034 unsigned Opcode = N->getOpcode();
33035 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
33036 X86ISD::VSRLI == Opcode) &&
33037 "Unexpected shift opcode");
33038 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
33039 EVT VT = N->getValueType(0);
33040 SDValue N0 = N->getOperand(0);
33041 SDValue N1 = N->getOperand(1);
33042 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
33043 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
33044 "Unexpected value type");
33046 // Out of range logical bit shifts are guaranteed to be zero.
33047 // Out of range arithmetic bit shifts splat the sign bit.
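  // For example (illustrative): (VSRAI v4i32 X, 34) behaves like a shift by
  // 31, leaving only the replicated sign bit, while (VSRLI v4i32 X, 34) is
  // simply the zero vector.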
33048 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
33049 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N0;
33060 // Shift zero -> zero.
33061 if (ISD::isBuildVectorAllZeros(N0.getNode()))
33062 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
33064 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
33065 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
33066 // TODO - support other sra opcodes as needed.
33067 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
33068 N0.getOpcode() == X86ISD::VSRAI)
33069 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
33071 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
33072 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
33073 N1 == N0.getOperand(1)) {
33074 SDValue N00 = N0.getOperand(0);
33075 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
    if (ShiftVal.ult(NumSignBits))
      return N00;
  }
33080 // We can decode 'whole byte' logical bit shifts as shuffles.
33081 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
33083 if (SDValue Res = combineX86ShufflesRecursively(
33084 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33085 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33086 DCI.CombineTo(N, Res);
33091 // Constant Folding.
33093 SmallVector<APInt, 32> EltBits;
33094 if (N->isOnlyUserOf(N0.getNode()) &&
33095 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
33096 assert(EltBits.size() == VT.getVectorNumElements() &&
33097 "Unexpected shift value type");
33098 unsigned ShiftImm = ShiftVal.getZExtValue();
33099 for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
33102 else if (X86ISD::VSRAI == Opcode)
33103 Elt.ashrInPlace(ShiftImm);
33105 Elt.lshrInPlace(ShiftImm);
33107 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
33113 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
33114 TargetLowering::DAGCombinerInfo &DCI,
33115 const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
33118 (N->getOpcode() == X86ISD::PINSRW &&
33119 N->getValueType(0) == MVT::v8i16)) &&
33120 "Unexpected vector insertion");
33122 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
33124 if (SDValue Res = combineX86ShufflesRecursively(
33125 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33126 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33127 DCI.CombineTo(N, Res);
33134 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
33135 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
33136 /// OR -> CMPNEQSS.
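/// For example (illustrative): an ordered 'a == b' on f32 is lowered as
///   (and (setcc COND_E, (X86cmp a, b)) (setcc COND_NP, (X86cmp a, b)))
/// because ZF alone cannot distinguish 'equal' from 'unordered'; this combine
/// reselects the pair as CMPEQSS and keeps only bit 0 of the mask it produces.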
33137 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
33138 TargetLowering::DAGCombinerInfo &DCI,
33139 const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
33143 // we're requiring SSE2 for both.
33144 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
33145 SDValue N0 = N->getOperand(0);
33146 SDValue N1 = N->getOperand(1);
33147 SDValue CMP0 = N0->getOperand(1);
33148 SDValue CMP1 = N1->getOperand(1);
33151 // The SETCCs should both refer to the same CMP.
33152 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
33155 SDValue CMP00 = CMP0->getOperand(0);
33156 SDValue CMP01 = CMP0->getOperand(1);
33157 EVT VT = CMP00.getValueType();
33159 if (VT == MVT::f32 || VT == MVT::f64) {
33160 bool ExpectingFlags = false;
33161 // Check for any users that want flags:
33162 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
33163 !ExpectingFlags && UI != UE; ++UI)
33164 switch (UI->getOpcode()) {
33169 ExpectingFlags = true;
33171 case ISD::CopyToReg:
33172 case ISD::SIGN_EXTEND:
33173 case ISD::ZERO_EXTEND:
33174 case ISD::ANY_EXTEND:
33178 if (!ExpectingFlags) {
33179 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
33180 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
33182 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }
33188 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
33189 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
33190 // FIXME: need symbolic constants for these magic numbers.
33191 // See X86ATTInstPrinter.cpp:printSSECC().
33192 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
33193 if (Subtarget.hasAVX512()) {
          SDValue FSetCC =
              DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
33196 DAG.getConstant(x86cc, DL, MVT::i8));
33197 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
33198 N->getSimpleValueType(0), FSetCC,
33199 DAG.getIntPtrConstant(0, DL));
33201 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
33202 CMP00.getValueType(), CMP00, CMP01,
                                                    DAG.getConstant(x86cc, DL,
                                                                    MVT::i8));
33206 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
33207 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
33209 if (is64BitFP && !Subtarget.is64Bit()) {
33210 // On a 32-bit target, we cannot bitcast the 64-bit float to a
33211 // 64-bit integer, since that's not a legal type. Since
            // OnesOrZeroesF is all ones or all zeroes, we don't need all the
33213 // bits, but can do this little dance to extract the lowest 32 bits
33214 // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
33217 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
33218 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
33219 Vector32, DAG.getIntPtrConstant(0, DL));
33223 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
33224 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
33225 DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
33228 return OneBitOfTruth;
33236 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
33237 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
33238 assert(N->getOpcode() == ISD::AND);
33240 EVT VT = N->getValueType(0);
33241 SDValue N0 = N->getOperand(0);
33242 SDValue N1 = N->getOperand(1);
33245 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
33248 if (N0.getOpcode() == ISD::XOR &&
33249 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
33250 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
33252 if (N1.getOpcode() == ISD::XOR &&
33253 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
33254 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
33259 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
33260 // register. In most cases we actually compare or select YMM-sized registers
33261 // and mixing the two types creates horrible code. This method optimizes
33262 // some of the transition sequences.
33263 // Even with AVX-512 this is still useful for removing casts around logical
33264 // operations on vXi1 mask types.
33265 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
33266 const X86Subtarget &Subtarget) {
33267 EVT VT = N->getValueType(0);
33268 assert(VT.isVector() && "Expected vector type");
33270 assert((N->getOpcode() == ISD::ANY_EXTEND ||
33271 N->getOpcode() == ISD::ZERO_EXTEND ||
33272 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
33274 SDValue Narrow = N->getOperand(0);
33275 EVT NarrowVT = Narrow.getValueType();
33277 if (Narrow->getOpcode() != ISD::XOR &&
33278 Narrow->getOpcode() != ISD::AND &&
33279 Narrow->getOpcode() != ISD::OR)
33282 SDValue N0 = Narrow->getOperand(0);
33283 SDValue N1 = Narrow->getOperand(1);
33286 // The Left side has to be a trunc.
33287 if (N0.getOpcode() != ISD::TRUNCATE)
33290 // The type of the truncated inputs.
33291 if (N0->getOperand(0).getValueType() != VT)
33294 // The right side has to be a 'trunc' or a constant vector.
33295 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
33296 N1.getOperand(0).getValueType() == VT;
  if (!RHSTrunc &&
      !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
    return SDValue();
33301 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33303 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
33306 // Set N0 and N1 to hold the inputs to the new wide operation.
33307 N0 = N0->getOperand(0);
  if (RHSTrunc)
    N1 = N1->getOperand(0);
  else
    N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
33313 // Generate the wide operation.
33314 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
33315 unsigned Opcode = N->getOpcode();
33317 default: llvm_unreachable("Unexpected opcode");
  case ISD::ANY_EXTEND:
    return Op;
33320 case ISD::ZERO_EXTEND:
33321 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
33322 case ISD::SIGN_EXTEND:
33323 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
33324 Op, DAG.getValueType(NarrowVT));
33328 /// If both input operands of a logic op are being cast from floating point
33329 /// types, try to convert this into a floating point logic node to avoid
33330 /// unnecessary moves from SSE to integer registers.
33331 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
33332 const X86Subtarget &Subtarget) {
33333 unsigned FPOpcode = ISD::DELETED_NODE;
33334 if (N->getOpcode() == ISD::AND)
33335 FPOpcode = X86ISD::FAND;
33336 else if (N->getOpcode() == ISD::OR)
33337 FPOpcode = X86ISD::FOR;
33338 else if (N->getOpcode() == ISD::XOR)
33339 FPOpcode = X86ISD::FXOR;
33341 assert(FPOpcode != ISD::DELETED_NODE &&
33342 "Unexpected input node for FP logic conversion");
33344 EVT VT = N->getValueType(0);
33345 SDValue N0 = N->getOperand(0);
33346 SDValue N1 = N->getOperand(1);
33348 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33349 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
33350 (Subtarget.hasSSE2() && VT == MVT::i64))) {
33351 SDValue N00 = N0.getOperand(0);
33352 SDValue N10 = N1.getOperand(0);
33353 EVT N00Type = N00.getValueType();
33354 EVT N10Type = N10.getValueType();
33355 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
33356 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
33357 return DAG.getBitcast(VT, FPLogic);
33363 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
33364 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
33365 /// with a shift-right to eliminate loading the vector constant mask value.
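/// For example (illustrative): if each element of X is known to be all-ones
/// or all-zeros (e.g. a compare result), then per element
///   (and X, 1) == (srl X, EltBits - 1)
/// and the logical shift avoids loading the splat-of-1 mask from the
/// constant pool.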
33366 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
33367 const X86Subtarget &Subtarget) {
33368 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
33369 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
33370 EVT VT0 = Op0.getValueType();
33371 EVT VT1 = Op1.getValueType();
33373 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
33377 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
33378 !SplatVal.isMask())
33381 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
33384 unsigned EltBitWidth = VT0.getScalarSizeInBits();
33385 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
33389 unsigned ShiftVal = SplatVal.countTrailingOnes();
33390 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
33391 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
33392 return DAG.getBitcast(N->getValueType(0), Shift);
33395 // Get the index node from the lowered DAG of a GEP IR instruction with one
33396 // indexing dimension.
33397 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
33398 if (Ld->isIndexed())
33401 SDValue Base = Ld->getBasePtr();
33403 if (Base.getOpcode() != ISD::ADD)
33406 SDValue ShiftedIndex = Base.getOperand(0);
33408 if (ShiftedIndex.getOpcode() != ISD::SHL)
33411 return ShiftedIndex.getOperand(0);
33415 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
33416 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
33417 switch (VT.getSizeInBits()) {
33418 default: return false;
33419 case 64: return Subtarget.is64Bit() ? true : false;
33420 case 32: return true;
33426 // This function recognizes cases where X86 bzhi instruction can replace and
33427 // 'and-load' sequence.
33428 // In case of loading integer value from an array of constants which is defined
33431 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
33433 // then applying a bitwise and on the result with another input.
33434 // It's equivalent to performing bzhi (zero high bits) on the input, with the
33435 // same index of the load.
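// For example (illustrative), with
//   static const uint32_t arr[] = {0x0, 0x1, 0x3, 0x7, 0xF, ...};
//   y = arr[idx] & x;
// arr[idx] equals (1 << idx) - 1, a mask of the idx low bits, so the whole
// sequence zeroes the bits of x from position idx upward, which is exactly
// BZHI x, idx.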
33436 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
33437 const X86Subtarget &Subtarget) {
33438 MVT VT = Node->getSimpleValueType(0);
33441 // Check if subtarget has BZHI instruction for the node's type
33442 if (!hasBZHI(Subtarget, VT))
33445 // Try matching the pattern for both operands.
33446 for (unsigned i = 0; i < 2; i++) {
33447 SDValue N = Node->getOperand(i);
33448 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
33450 // continue if the operand is not a load instruction
33454 const Value *MemOp = Ld->getMemOperand()->getValue();
33459 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
33460 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
33461 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
33463 Constant *Init = GV->getInitializer();
33464 Type *Ty = Init->getType();
33465 if (!isa<ConstantDataArray>(Init) ||
33466 !Ty->getArrayElementType()->isIntegerTy() ||
33467 Ty->getArrayElementType()->getScalarSizeInBits() !=
33468 VT.getSizeInBits() ||
33469 Ty->getArrayNumElements() >
33470 Ty->getArrayElementType()->getScalarSizeInBits())
33473 // Check if the array's constant elements are suitable to our case.
33474 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
33475 bool ConstantsMatch = true;
33476 for (uint64_t j = 0; j < ArrayElementCount; j++) {
33477 ConstantInt *Elem =
33478 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
33479 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
33480 ConstantsMatch = false;
33484 if (!ConstantsMatch)
33487 // Do the transformation (For 32-bit type):
33488 // -> (and (load arr[idx]), inp)
33489 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
33490 // that will be replaced with one bzhi instruction.
33491 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
33492 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
33494 // Get the Node which indexes into the array.
33495 SDValue Index = getIndexFromUnindexedLoad(Ld);
33498 Index = DAG.getZExtOrTrunc(Index, dl, VT);
33500 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
33502 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
33503 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
33505 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
33513 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
33514 TargetLowering::DAGCombinerInfo &DCI,
33515 const X86Subtarget &Subtarget) {
33516 EVT VT = N->getValueType(0);
33518 // If this is SSE1 only convert to FAND to avoid scalarization.
33519 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33520 return DAG.getBitcast(
33521 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
33522 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
33523 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
33526 if (DCI.isBeforeLegalizeOps())
33529 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33532 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33535 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
33538 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
33541 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
33544 // Attempt to recursively combine a bitmask AND with shuffles.
33545 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33547 if (SDValue Res = combineX86ShufflesRecursively(
33548 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33549 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33550 DCI.CombineTo(N, Res);
33555 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
33556 if ((VT.getScalarSizeInBits() % 8) == 0 &&
33557 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33558 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
33559 SDValue BitMask = N->getOperand(1);
33560 SDValue SrcVec = N->getOperand(0).getOperand(0);
33561 EVT SrcVecVT = SrcVec.getValueType();
33563 // Check that the constant bitmask masks whole bytes.
33565 SmallVector<APInt, 64> EltBits;
33566 if (VT == SrcVecVT.getScalarType() &&
33567 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
33568 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
33569 llvm::all_of(EltBits, [](APInt M) {
33570 return M.isNullValue() || M.isAllOnesValue();
33572 unsigned NumElts = SrcVecVT.getVectorNumElements();
33573 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
33574 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
33576 // Create a root shuffle mask from the byte mask and the extracted index.
33577 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
33578 for (unsigned i = 0; i != Scale; ++i) {
33581 int VecIdx = Scale * Idx + i;
33582 ShuffleMask[VecIdx] =
33583 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
33586 if (SDValue Shuffle = combineX86ShufflesRecursively(
33587 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
33588 /*HasVarMask*/ false, DAG, DCI, Subtarget))
33589 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
33590 N->getOperand(0).getOperand(1));
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
// As a special case, try to fold:
//   (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
33605 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
33606 const X86Subtarget &Subtarget) {
33607 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
33609 SDValue N0 = N->getOperand(0);
33610 SDValue N1 = N->getOperand(1);
33611 EVT VT = N->getValueType(0);
33613 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
33614 (VT.is256BitVector() && Subtarget.hasInt256())))
33617 // Canonicalize AND to LHS.
33618 if (N1.getOpcode() == ISD::AND)
33621 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
33622 // ANDNP combine allows other combines to happen that prevent matching.
33623 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
33626 SDValue Mask = N1.getOperand(0);
33627 SDValue X = N1.getOperand(1);
33629 if (N0.getOperand(0) == Mask)
33630 Y = N0.getOperand(1);
33631 if (N0.getOperand(1) == Mask)
33632 Y = N0.getOperand(0);
33634 // Check to see if the mask appeared in both the AND and ANDNP.
33638 // Validate that X, Y, and Mask are bitcasts, and see through them.
33639 Mask = peekThroughBitcasts(Mask);
33640 X = peekThroughBitcasts(X);
33641 Y = peekThroughBitcasts(Y);
33643 EVT MaskVT = Mask.getValueType();
33644 unsigned EltBits = MaskVT.getScalarSizeInBits();
33646 // TODO: Attempt to handle floating point cases as well?
33647 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
33653 // (or (and (M, (sub 0, X)), (pandn M, X)))
33654 // which is a special case of vselect:
33655 // (vselect M, (sub 0, X), X)
33657 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
33658 // We know that, if fNegate is 0 or 1:
33659 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
33661 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
33662 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
33663 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
33664 // This lets us transform our vselect to:
33665 // (add (xor X, M), (and M, 1))
33667 // (sub (xor X, M), M)
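  // A quick per-element sanity check of the identity (illustrative):
  //   M == -1 (negate): (X ^ -1) - (-1) == ~X + 1 == -X
  //   M ==  0 (keep):   (X ^  0) -   0  ==  X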
33668 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
33669 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
    auto IsNegV = [](SDNode *N, SDValue V) {
      return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };
    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = Y;
    else if (IsNegV(X.getNode(), Y))
      V = X;
33681 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
33682 SDValue SubOp2 = Mask;
33684 // If the negate was on the false side of the select, then
33685 // the operands of the SUB need to be swapped. PR 27251.
33686 // This is because the pattern being matched above is
33687 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
33688 // but if the pattern matched was
33689 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
33690 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
33691 // pattern also needs to be a negation of the replacement pattern above.
33692 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
33693 // sub accomplishes the negation of the replacement pattern.
33695 std::swap(SubOp1, SubOp2);
33697 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
33698 return DAG.getBitcast(VT, Res);
33702 // PBLENDVB is only available on SSE 4.1.
33703 if (!Subtarget.hasSSE41())
33706 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
33708 X = DAG.getBitcast(BlendVT, X);
33709 Y = DAG.getBitcast(BlendVT, Y);
33710 Mask = DAG.getBitcast(BlendVT, Mask);
33711 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
33712 return DAG.getBitcast(VT, Mask);
33715 // Helper function for combineOrCmpEqZeroToCtlzSrl
33719 // srl(ctlz x), log2(bitsize(x))
33720 // Input pattern is checked by caller.
33721 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
33722 SelectionDAG &DAG) {
33723 SDValue Cmp = Op.getOperand(1);
33724 EVT VT = Cmp.getOperand(0).getValueType();
33725 unsigned Log2b = Log2_32(VT.getSizeInBits());
33727 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
33728 // The result of the shift is true or false, and on X86, the 32-bit
33729 // encoding of shr and lzcnt is more desirable.
33730 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
33731 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
33732 DAG.getConstant(Log2b, dl, VT));
33733 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
33736 // Try to transform:
33737 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
33739 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
33740 // Will also attempt to match more generic cases, eg:
33741 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
33742 // Only applies if the target supports the FastLZCNT feature.
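// For example (illustrative), for i32 x and y:
//   zext(or(x == 0, y == 0))
// becomes
//   or(srl(ctlz(x), 5), srl(ctlz(y), 5))
// since lzcnt returns 32 (bit 5 set) exactly when its input is zero.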
33743 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
33744 TargetLowering::DAGCombinerInfo &DCI,
33745 const X86Subtarget &Subtarget) {
33746 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
33749 auto isORCandidate = [](SDValue N) {
33750 return (N->getOpcode() == ISD::OR && N->hasOneUse());
33753 // Check the zero extend is extending to 32-bit or more. The code generated by
33754 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
33755 // instructions to clear the upper bits.
33756 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
33757 !isORCandidate(N->getOperand(0)))
33760 // Check the node matches: setcc(eq, cmp 0)
33761 auto isSetCCCandidate = [](SDValue N) {
33762 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
33763 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
33764 N->getOperand(1).getOpcode() == X86ISD::CMP &&
33765 isNullConstant(N->getOperand(1).getOperand(1)) &&
33766 N->getOperand(1).getValueType().bitsGE(MVT::i32);
33769 SDNode *OR = N->getOperand(0).getNode();
33770 SDValue LHS = OR->getOperand(0);
33771 SDValue RHS = OR->getOperand(1);
33773 // Save nodes matching or(or, setcc(eq, cmp 0)).
33774 SmallVector<SDNode *, 2> ORNodes;
33775 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
33776 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
33777 ORNodes.push_back(OR);
33778 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
33779 LHS = OR->getOperand(0);
33780 RHS = OR->getOperand(1);
33783 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
33784 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
33785 !isORCandidate(SDValue(OR, 0)))
33788 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
33790 // or(srl(ctlz),srl(ctlz)).
33791 // The dag combiner can then fold it into:
33792 // srl(or(ctlz, ctlz)).
33793 EVT VT = OR->getValueType(0);
33794 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
33795 SDValue Ret, NewRHS;
33796 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
33797 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
33802 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
33803 while (ORNodes.size() > 0) {
33804 OR = ORNodes.pop_back_val();
33805 LHS = OR->getOperand(0);
33806 RHS = OR->getOperand(1);
33807 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
33808 if (RHS->getOpcode() == ISD::OR)
33809 std::swap(LHS, RHS);
33810 EVT VT = OR->getValueType(0);
33811 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
33814 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
33818 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
33823 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
33824 TargetLowering::DAGCombinerInfo &DCI,
33825 const X86Subtarget &Subtarget) {
33826 SDValue N0 = N->getOperand(0);
33827 SDValue N1 = N->getOperand(1);
33828 EVT VT = N->getValueType(0);
33830 // If this is SSE1 only convert to FOR to avoid scalarization.
33831 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33832 return DAG.getBitcast(MVT::v4i32,
33833 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
33834 DAG.getBitcast(MVT::v4f32, N0),
33835 DAG.getBitcast(MVT::v4f32, N1)));
33838 if (DCI.isBeforeLegalizeOps())
33841 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33844 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33847 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
33850 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
33853 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
33854 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
33856 // SHLD/SHRD instructions have lower register pressure, but on some
33857 // platforms they have higher latency than the equivalent
33858 // series of shifts/or that would otherwise be generated.
33859 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
33860 // have higher latencies and we are not optimizing for size.
33861 if (!OptForSize && Subtarget.isSHLDSlow())
33864 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
33866 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
33868 if (!N0.hasOneUse() || !N1.hasOneUse())
33871 SDValue ShAmt0 = N0.getOperand(1);
33872 if (ShAmt0.getValueType() != MVT::i8)
33874 SDValue ShAmt1 = N1.getOperand(1);
33875 if (ShAmt1.getValueType() != MVT::i8)
33877 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
33878 ShAmt0 = ShAmt0.getOperand(0);
33879 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
33880 ShAmt1 = ShAmt1.getOperand(0);
33883 unsigned Opc = X86ISD::SHLD;
33884 SDValue Op0 = N0.getOperand(0);
33885 SDValue Op1 = N1.getOperand(0);
33886 if (ShAmt0.getOpcode() == ISD::SUB ||
33887 ShAmt0.getOpcode() == ISD::XOR) {
33888 Opc = X86ISD::SHRD;
33889 std::swap(Op0, Op1);
33890 std::swap(ShAmt0, ShAmt1);
33893 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
33894 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
33895 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
33896 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
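  // For instance (illustrative), with 32-bit operands and C = 7:
  //   OR( SHL( X, 7 ), SRL( Y, 25 ) ) -> SHLD( X, Y, 7 )
  // and for the XOR form, (Y >> 1) >> (7 ^ 31) == Y >> 25, so it matches the
  // same SHLD. The code below checks exactly these shift-amount identities.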
33897 unsigned Bits = VT.getSizeInBits();
33898 if (ShAmt1.getOpcode() == ISD::SUB) {
33899 SDValue Sum = ShAmt1.getOperand(0);
33900 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
33901 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
33902 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
33903 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
33904 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
33905 return DAG.getNode(Opc, DL, VT,
33907 DAG.getNode(ISD::TRUNCATE, DL,
33910 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
33911 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
33912 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
33913 return DAG.getNode(Opc, DL, VT,
33914 N0.getOperand(0), N1.getOperand(0),
33915 DAG.getNode(ISD::TRUNCATE, DL,
33917 } else if (ShAmt1.getOpcode() == ISD::XOR) {
33918 SDValue Mask = ShAmt1.getOperand(1);
33919 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
33920 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
33921 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
33922 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
33923 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
33924 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
33925 if (Op1.getOpcode() == InnerShift &&
33926 isa<ConstantSDNode>(Op1.getOperand(1)) &&
33927 Op1.getConstantOperandVal(1) == 1) {
33928 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33929 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33931 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
33932 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
33933 Op1.getOperand(0) == Op1.getOperand(1)) {
33934 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33935 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33944 /// Try to turn tests against the signbit in the form of:
33945 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
33948 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
33949 // This is only worth doing if the output type is i8 or i1.
33950 EVT ResultType = N->getValueType(0);
33951 if (ResultType != MVT::i8 && ResultType != MVT::i1)
33954 SDValue N0 = N->getOperand(0);
33955 SDValue N1 = N->getOperand(1);
33957 // We should be performing an xor against a truncated shift.
33958 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
33961 // Make sure we are performing an xor against one.
33962 if (!isOneConstant(N1))
33965 // SetCC on x86 zero extends so only act on this if it's a logical shift.
33966 SDValue Shift = N0.getOperand(0);
33967 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
33970 // Make sure we are truncating from one of i16, i32 or i64.
33971 EVT ShiftTy = Shift.getValueType();
33972 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
33975 // Make sure the shift amount extracts the sign bit.
33976 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
33977 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
33980 // Create a greater-than comparison against -1.
  // N.B. Using SETGE against 0 works but we want a canonical-looking
  // comparison; using SETGT matches up with what TranslateX86CC expects.
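  // For example (illustrative), for a 32-bit X:
  //   xor(trunc(srl(X, 31)), 1)  -->  setcc(X, -1, setgt)
  // Both sides compute "1 if X is non-negative, 0 otherwise".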
33984 SDValue ShiftOp = Shift.getOperand(0);
33985 EVT ShiftOpTy = ShiftOp.getValueType();
33986 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33987 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
33988 *DAG.getContext(), ResultType);
33989 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
33990 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
33991 if (SetCCResultType != ResultType)
33992 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
33996 /// Turn vector tests of the signbit in the form of:
33997 /// xor (sra X, elt_size(X)-1), -1
34001 /// This should be called before type legalization because the pattern may not
34002 /// persist after that.
34003 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
34004 const X86Subtarget &Subtarget) {
34005 EVT VT = N->getValueType(0);
34006 if (!VT.isSimple())
34009 switch (VT.getSimpleVT().SimpleTy) {
34010 default: return SDValue();
34013 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
34014 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
34018 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
34021 // There must be a shift right algebraic before the xor, and the xor must be a
34022 // 'not' operation.
34023 SDValue Shift = N->getOperand(0);
34024 SDValue Ones = N->getOperand(1);
34025 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
34026 !ISD::isBuildVectorAllOnes(Ones.getNode()))
34029 // The shift should be smearing the sign bit across each vector element.
34030 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
34034 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
34035 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
34036 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
34039 // Create a greater-than comparison against -1. We don't use the more obvious
34040 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
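  // For example (illustrative), for v4i32:
  //   xor(sra(X, 31), splat(-1))  -->  pcmpgt(X, splat(-1))
  // The arithmetic shift smears each lane's sign bit, and the 'not' then
  // yields all-ones exactly for the non-negative lanes, which is what the
  // signed compare against -1 produces directly.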
34041 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
/// Check if truncation with saturation from type \p SrcVT to \p DstVT
/// is valid for the given \p Subtarget.
34046 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
34047 const X86Subtarget &Subtarget) {
34048 if (!Subtarget.hasAVX512())
34051 // FIXME: Scalar type may be supported if we move it to vector register.
34052 if (!SrcVT.isVector())
34055 EVT SrcElVT = SrcVT.getScalarType();
34056 EVT DstElVT = DstVT.getScalarType();
34057 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
34059 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
34060 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
34064 /// Detect a pattern of truncation with unsigned saturation:
34065 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
34068 static SDValue detectUSatPattern(SDValue In, EVT VT) {
34069 if (In.getOpcode() != ISD::UMIN)
34072 // Saturation with truncation. We truncate from InVT to VT.
34073 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
34074 "Unexpected types for truncate operation");
34077 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
    // the element size of the destination type.
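    // For instance (illustrative), when truncating v8i32 to v8i16:
    //   trunc(umin(x, splat(65535)))
    // matches because 65535 is a mask of the low 16 bits, and x is returned
    // as the value to truncate with unsigned saturation.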
34080 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
34085 /// Detect patterns of truncation with signed saturation:
34086 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
34087 /// signed_max_of_dest_type)) to dest_type)
34089 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
34090 /// signed_min_of_dest_type)) to dest_type).
34091 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
34094 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
34095 unsigned NumDstBits = VT.getScalarSizeInBits();
34096 unsigned NumSrcBits = In.getScalarValueSizeInBits();
34097 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
34099 auto MatchMinMax = [](SDValue V, unsigned Opcode,
34100 const APInt &Limit) -> SDValue {
34102 if (V.getOpcode() == Opcode &&
34103 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
34104 return V.getOperand(0);
  APInt SignedMax, SignedMin;
  if (MatchPackUS) {
    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
    SignedMin = APInt(NumSrcBits, 0);
  } else {
    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
  }
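  // For example (illustrative), when truncating to i8 lanes: SignedMax = 127
  // and SignedMin = -128, so either smin(smax(x, -128), 127) or
  // smax(smin(x, 127), -128) matches and x is returned; with MatchPackUS the
  // range checked is [0, 255] instead.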
34117 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
34118 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
34121 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
34122 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
34128 /// Detect a pattern of truncation with saturation:
34129 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow use of the VPMOVUS* instructions on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
34133 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
34134 const X86Subtarget &Subtarget,
34135 const TargetLowering &TLI) {
34136 if (!TLI.isTypeLegal(In.getValueType()))
34138 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
34140 return detectUSatPattern(In, VT);
34143 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
34145 const X86Subtarget &Subtarget) {
34146 EVT InVT = In.getValueType();
34147 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34148 if (!TLI.isTypeLegal(InVT) || !TLI.isTypeLegal(VT))
34150 if (isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
34151 if (auto SSatVal = detectSSatPattern(In, VT))
34152 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
34153 if (auto USatVal = detectUSatPattern(In, VT))
34154 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
34156 if ((VT.getScalarType() == MVT::i8 && InVT.getScalarType() == MVT::i16) ||
34157 (VT.getScalarType() == MVT::i16 && InVT.getScalarType() == MVT::i32)) {
34158 if (auto SSatVal = detectSSatPattern(In, VT))
34159 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
34161 if (Subtarget.hasSSE41() || VT.getScalarType() == MVT::i8)
34162 if (auto USatVal = detectSSatPattern(In, VT, true))
34163 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
// Helper for splitting operands of a binary operation into legal-sized pieces
// and applying a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
// 256-bit and on AVX512BW in 512-bit.
// The argument VT is the type used for deciding if/how to split the operands
// Op0 and Op1. Op0 and Op1 do *not* have to be of type VT.
// The argument Builder is a function that will be applied on each split part:
//   SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, SDValue, SDValue)
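// Illustrative use (a sketch; detectAVGPattern below does essentially this,
// with A and B standing in for the two input vectors):
//   auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue A,
//                        SDValue B) {
//     return DAG.getNode(X86ISD::AVG, DL, A.getValueType(), A, B);
//   };
//   SplitBinaryOpsAndApply(DAG, Subtarget, DL, VT, A, B, AVGBuilder);
// On an SSE2-only target a v32i8 AVG would be emitted as two v16i8 AVG nodes
// that are concatenated back into a v32i8 result.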
34177 template <typename F>
34178 SDValue SplitBinaryOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
34179 const SDLoc &DL, EVT VT, SDValue Op0,
34180 SDValue Op1, F Builder) {
34181 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
34182 unsigned NumSubs = 1;
34183 if (Subtarget.hasBWI()) {
34184 if (VT.getSizeInBits() > 512) {
34185 NumSubs = VT.getSizeInBits() / 512;
34186 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
34188 } else if (Subtarget.hasAVX2()) {
34189 if (VT.getSizeInBits() > 256) {
34190 NumSubs = VT.getSizeInBits() / 256;
34191 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
34194 if (VT.getSizeInBits() > 128) {
34195 NumSubs = VT.getSizeInBits() / 128;
34196 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
34201 return Builder(DAG, DL, Op0, Op1);
34203 SmallVector<SDValue, 4> Subs;
34204 EVT InVT = Op0.getValueType();
34205 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34206 InVT.getVectorNumElements() / NumSubs);
34207 for (unsigned i = 0; i != NumSubs; ++i) {
34208 unsigned Idx = i * SubVT.getVectorNumElements();
34209 SDValue LHS = extractSubVector(Op0, Idx, DAG, DL, SubVT.getSizeInBits());
34210 SDValue RHS = extractSubVector(Op1, Idx, DAG, DL, SubVT.getSizeInBits());
34211 Subs.push_back(Builder(DAG, DL, LHS, RHS));
34213 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the more
/// efficient X86ISD::AVG instruction.
34219 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
34220 const X86Subtarget &Subtarget,
34222 if (!VT.isVector() || !VT.isSimple())
34224 EVT InVT = In.getValueType();
34225 unsigned NumElems = VT.getVectorNumElements();
34227 EVT ScalarVT = VT.getVectorElementType();
34228 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
34229 isPowerOf2_32(NumElems)))
34232 // InScalarVT is the intermediate type in AVG pattern and it should be greater
34233 // than the original input type (i8/i16).
34234 EVT InScalarVT = InVT.getVectorElementType();
34235 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
34238 if (!Subtarget.hasSSE2())
  // Detect the following pattern:
  //
  //   %1 = zext <N x i8> %a to <N x i32>
  //   %2 = zext <N x i8> %b to <N x i32>
  //   %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
  //   %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
  //   %6 = trunc <N x i32> %5 to <N x i8>
34250 // In AVX512, the last instruction can also be a trunc store.
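  // Illustrative arithmetic: with i8 lanes a = 250 and b = 3, the wider math
  // computes (250 + 3 + 1) >> 1 = 127, which fits back into i8; this is
  // exactly what PAVGB/PAVGW produce, rounding the average up.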
34251 if (In.getOpcode() != ISD::SRL)
34254 // A lambda checking the given SDValue is a constant vector and each element
34255 // is in the range [Min, Max].
34256 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
34257 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
34258 if (!BV || !BV->isConstant())
34260 for (SDValue Op : V->ops()) {
34261 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
34264 uint64_t Val = C->getZExtValue();
34265 if (Val < Min || Val > Max)
34271 // Check if each element of the vector is left-shifted by one.
34272 auto LHS = In.getOperand(0);
34273 auto RHS = In.getOperand(1);
34274 if (!IsConstVectorInRange(RHS, 1, 1))
34276 if (LHS.getOpcode() != ISD::ADD)
34279 // Detect a pattern of a + b + 1 where the order doesn't matter.
34280 SDValue Operands[3];
34281 Operands[0] = LHS.getOperand(0);
34282 Operands[1] = LHS.getOperand(1);
34284 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
34286 return DAG.getNode(X86ISD::AVG, DL, Op0.getValueType(), Op0, Op1);
34289 // Take care of the case when one of the operands is a constant vector whose
34290 // element is in the range [1, 256].
34291 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
34292 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
34293 Operands[0].getOperand(0).getValueType() == VT) {
34294 // The pattern is detected. Subtract one from the constant vector, then
34295 // demote it and emit X86ISD::AVG instruction.
34296 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
34297 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
34298 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
34299 return SplitBinaryOpsAndApply(DAG, Subtarget, DL, VT,
34300 Operands[0].getOperand(0), Operands[1],
34304 if (Operands[0].getOpcode() == ISD::ADD)
34305 std::swap(Operands[0], Operands[1]);
34306 else if (Operands[1].getOpcode() != ISD::ADD)
34308 Operands[2] = Operands[1].getOperand(0);
34309 Operands[1] = Operands[1].getOperand(1);
34311 // Now we have three operands of two additions. Check that one of them is a
34312 // constant vector with ones, and the other two are promoted from i8/i16.
34313 for (int i = 0; i < 3; ++i) {
34314 if (!IsConstVectorInRange(Operands[i], 1, 1))
34316 std::swap(Operands[i], Operands[2]);
34318 // Check if Operands[0] and Operands[1] are results of type promotion.
34319 for (int j = 0; j < 2; ++j)
34320 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
34321 Operands[j].getOperand(0).getValueType() != VT)
34324 // The pattern is detected, emit X86ISD::AVG instruction(s).
34325 return SplitBinaryOpsAndApply(DAG, Subtarget, DL, VT,
34326 Operands[0].getOperand(0),
34327 Operands[1].getOperand(0), AVGBuilder);
34333 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
34334 TargetLowering::DAGCombinerInfo &DCI,
34335 const X86Subtarget &Subtarget) {
34336 LoadSDNode *Ld = cast<LoadSDNode>(N);
34337 EVT RegVT = Ld->getValueType(0);
34338 EVT MemVT = Ld->getMemoryVT();
34340 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34342 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
34343 // into two 16-byte operations. Also split non-temporal aligned loads on
34344 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
34345 ISD::LoadExtType Ext = Ld->getExtensionType();
34347 unsigned AddressSpace = Ld->getAddressSpace();
34348 unsigned Alignment = Ld->getAlignment();
34349 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
34350 Ext == ISD::NON_EXTLOAD &&
34351 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
34352 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
34353 AddressSpace, Alignment, &Fast) && !Fast))) {
34354 unsigned NumElems = RegVT.getVectorNumElements();
34358 SDValue Ptr = Ld->getBasePtr();
34360 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
34363 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34364 Alignment, Ld->getMemOperand()->getFlags());
34366 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
34368 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
34369 Ld->getPointerInfo().getWithOffset(16),
34370 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
34371 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34373 Load2.getValue(1));
34375 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
34376 return DCI.CombineTo(N, NewVec, TF, true);
34382 /// If V is a build vector of boolean constants and exactly one of those
34383 /// constants is true, return the operand index of that true element.
34384 /// Otherwise, return -1.
34385 static int getOneTrueElt(SDValue V) {
34386 // This needs to be a build vector of booleans.
  // TODO: Checking for the i1 type matches the IR definition for the mask,
  // but the mask check could be loosened to i8 or other types. That might
  // also require checking more than 'allOnesValue'; eg, the x86 HW
  // instructions only require that the MSB is set for each mask element.
  // The ISD::MSTORE comments/definition do not specify how the mask operand
  // is formatted.
34393 auto *BV = dyn_cast<BuildVectorSDNode>(V);
34394 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
34397 int TrueIndex = -1;
34398 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
34399 for (unsigned i = 0; i < NumElts; ++i) {
34400 const SDValue &Op = BV->getOperand(i);
34403 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
34406 if (ConstNode->getAPIntValue().isAllOnesValue()) {
34407 // If we already found a one, this is too many.
34408 if (TrueIndex >= 0)
34416 /// Given a masked memory load/store operation, return true if it has one mask
34417 /// bit set. If it has one mask bit set, then also return the memory address of
34418 /// the scalar element to load/store, the vector index to insert/extract that
34419 /// scalar element, and the alignment for the scalar memory access.
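/// For example (illustrative): for a v4f32 masked operation with mask
/// <0,0,1,0>, this computes Addr = BasePtr + 8 (element 2 of 4-byte
/// elements), Index = 2, and Alignment = min(original alignment, 4).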
34420 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34421 SelectionDAG &DAG, SDValue &Addr,
34422 SDValue &Index, unsigned &Alignment) {
34423 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34424 if (TrueMaskElt < 0)
34427 // Get the address of the one scalar element that is specified by the mask
34428 // using the appropriate offset from the base pointer.
34429 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34430 Addr = MaskedOp->getBasePtr();
34431 if (TrueMaskElt != 0) {
34432 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34433 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34436 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
34437 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
34441 /// If exactly one element of the mask is set for a non-extending masked load,
34442 /// it is a scalar load and vector insert.
34443 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34444 /// mask have already been optimized in IR, so we don't bother with those here.
34446 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34447 TargetLowering::DAGCombinerInfo &DCI) {
34448 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34449 // However, some target hooks may need to be added to know when the transform
34450 // is profitable. Endianness would also have to be considered.
34452 SDValue Addr, VecIndex;
34453 unsigned Alignment;
34454 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34457 // Load the one scalar element that is specified by the mask using the
34458 // appropriate offset from the base pointer.
34460 EVT VT = ML->getValueType(0);
34461 EVT EltVT = VT.getVectorElementType();
34463 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34464 Alignment, ML->getMemOperand()->getFlags());
34466 // Insert the loaded element into the appropriate place in the vector.
34467 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
34469 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
34473 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34474 TargetLowering::DAGCombinerInfo &DCI) {
34475 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
34479 EVT VT = ML->getValueType(0);
34481 // If we are loading the first and last elements of a vector, it is safe and
34482 // always faster to load the whole vector. Replace the masked load with a
34483 // vector load and select.
34484 unsigned NumElts = VT.getVectorNumElements();
34485 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
34486 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
34487 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
34488 if (LoadFirstElt && LoadLastElt) {
34489 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34490 ML->getMemOperand());
34491 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
34492 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
34495 // Convert a masked load with a constant mask into a masked load and a select.
34496 // This allows the select operation to use a faster kind of select instruction
34497 // (for example, vblendvps -> vblendps).
34499 // Don't try this if the pass-through operand is already undefined. That would
34500 // cause an infinite loop because that's what we're about to create.
34501 if (ML->getSrc0().isUndef())
34504 // The new masked load has an undef pass-through operand. The select uses the
34505 // original pass-through operand.
34506 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
34507 ML->getMask(), DAG.getUNDEF(VT),
34508 ML->getMemoryVT(), ML->getMemOperand(),
34509 ML->getExtensionType());
34510 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
34512 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
34515 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
34516 TargetLowering::DAGCombinerInfo &DCI,
34517 const X86Subtarget &Subtarget) {
34518 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
34520 // TODO: Expanding load with constant mask may be optimized as well.
34521 if (Mld->isExpandingLoad())
34524 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
34525 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
34527 // TODO: Do some AVX512 subsets benefit from this transform?
34528 if (!Subtarget.hasAVX512())
34529 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
34533 if (Mld->getExtensionType() != ISD::SEXTLOAD)
34536 // Resolve extending loads.
34537 EVT VT = Mld->getValueType(0);
34538 unsigned NumElems = VT.getVectorNumElements();
34539 EVT LdVT = Mld->getMemoryVT();
34542 assert(LdVT != VT && "Cannot extend to the same type");
34543 unsigned ToSz = VT.getScalarSizeInBits();
34544 unsigned FromSz = LdVT.getScalarSizeInBits();
34545 // From/To sizes and ElemCount must be pow of two.
34546 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34547 "Unexpected size for extending masked load");
34549 unsigned SizeRatio = ToSz / FromSz;
34550 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
34552 // Create a type on which we perform the shuffle.
34553 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34554 LdVT.getScalarType(), NumElems*SizeRatio);
34555 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34557 // Convert Src0 value.
34558 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
34559 if (!Mld->getSrc0().isUndef()) {
34560 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34561 for (unsigned i = 0; i != NumElems; ++i)
34562 ShuffleVec[i] = i * SizeRatio;
34564 // Can't shuffle using an illegal type.
34565 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34566 "WideVecVT should be legal");
34567 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
34568 DAG.getUNDEF(WideVecVT), ShuffleVec);
  // Prepare the new mask.
  SDValue NewMask;
  SDValue Mask = Mld->getMask();
34574 if (Mask.getValueType() == VT) {
34575 // Mask and original value have the same type.
34576 NewMask = DAG.getBitcast(WideVecVT, Mask);
34577 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34578 for (unsigned i = 0; i != NumElems; ++i)
34579 ShuffleVec[i] = i * SizeRatio;
34580 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
34581 ShuffleVec[i] = NumElems * SizeRatio;
34582 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34583 DAG.getConstant(0, dl, WideVecVT),
34586 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34587 unsigned WidenNumElts = NumElems*SizeRatio;
34588 unsigned MaskNumElts = VT.getVectorNumElements();
34589 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34592 unsigned NumConcat = WidenNumElts / MaskNumElts;
34593 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34594 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34596 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34599 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
34600 Mld->getBasePtr(), NewMask, WideSrc0,
34601 Mld->getMemoryVT(), Mld->getMemOperand(),
34603 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
34604 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
34607 /// If exactly one element of the mask is set for a non-truncating masked store,
34608 /// it is a vector extract and scalar store.
34609 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34610 /// mask have already been optimized in IR, so we don't bother with those here.
34611 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
34612 SelectionDAG &DAG) {
34613 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34614 // However, some target hooks may need to be added to know when the transform
34615 // is profitable. Endianness would also have to be considered.
34617 SDValue Addr, VecIndex;
34618 unsigned Alignment;
34619 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
34622 // Extract the one scalar element that is actually being stored.
34624 EVT VT = MS->getValue().getValueType();
34625 EVT EltVT = VT.getVectorElementType();
34626 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
34627 MS->getValue(), VecIndex);
34629 // Store that element at the appropriate offset from the base pointer.
34630 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
34631 Alignment, MS->getMemOperand()->getFlags());
34634 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
34635 const X86Subtarget &Subtarget) {
34636 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
34638 if (Mst->isCompressingStore())
34641 if (!Mst->isTruncatingStore()) {
34642 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
34643 return ScalarStore;
34645 // If the mask is checking (0 > X), we're creating a vector with all-zeros
34646 // or all-ones elements based on the sign bits of X. AVX1 masked store only
34647 // cares about the sign bit of each mask element, so eliminate the compare:
34648 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
34649 // Note that by waiting to match an x86-specific PCMPGT node, we're
34650 // eliminating potentially more complex matching of a setcc node which has
34651 // a full range of predicates.
34652 SDValue Mask = Mst->getMask();
34653 if (Mask.getOpcode() == X86ISD::PCMPGT &&
34654 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
34655 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
34656 "Unexpected type for PCMPGT");
34657 return DAG.getMaskedStore(
34658 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
34659 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
34662 // TODO: AVX512 targets should also be able to simplify something like the
34663 // pattern above, but that pattern will be different. It will either need to
34664 // match setcc more generally or match PCMPGTM later (in tablegen?).
34669 // Resolve truncating stores.
34670 EVT VT = Mst->getValue().getValueType();
34671 unsigned NumElems = VT.getVectorNumElements();
34672 EVT StVT = Mst->getMemoryVT();
34675 assert(StVT != VT && "Cannot truncate to the same type");
34676 unsigned FromSz = VT.getScalarSizeInBits();
34677 unsigned ToSz = StVT.getScalarSizeInBits();
34679 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34681 // The truncating store is legal in some cases. For example
34682 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34683 // are designated for truncate store.
34684 // In this case we don't need any further transformations.
34685 if (TLI.isTruncStoreLegal(VT, StVT))
34688 // From/To sizes and ElemCount must be pow of two.
34689 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34690 "Unexpected size for truncating masked store");
34691 // We are going to use the original vector elt for storing.
34692 // Accumulated smaller vector elements must be a multiple of the store size.
34693 assert (((NumElems * FromSz) % ToSz) == 0 &&
34694 "Unexpected ratio for truncating masked store");
34696 unsigned SizeRatio = FromSz / ToSz;
34697 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34699 // Create a type on which we perform the shuffle.
34700 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34701 StVT.getScalarType(), NumElems*SizeRatio);
34703 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34705 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
34706 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34707 for (unsigned i = 0; i != NumElems; ++i)
34708 ShuffleVec[i] = i * SizeRatio;
34710 // Can't shuffle using an illegal type.
34711 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34712 "WideVecVT should be legal");
34714 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34715 DAG.getUNDEF(WideVecVT),
34719 SDValue Mask = Mst->getMask();
34720 if (Mask.getValueType() == VT) {
34721 // Mask and original value have the same type.
34722 NewMask = DAG.getBitcast(WideVecVT, Mask);
34723 for (unsigned i = 0; i != NumElems; ++i)
34724 ShuffleVec[i] = i * SizeRatio;
34725 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
34726 ShuffleVec[i] = NumElems*SizeRatio;
34727 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34728 DAG.getConstant(0, dl, WideVecVT),
34731 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34732 unsigned WidenNumElts = NumElems*SizeRatio;
34733 unsigned MaskNumElts = VT.getVectorNumElements();
34734 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34737 unsigned NumConcat = WidenNumElts / MaskNumElts;
34738 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34739 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34741 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34744 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
34745 Mst->getBasePtr(), NewMask, StVT,
34746 Mst->getMemOperand(), false);
34749 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
34750 const X86Subtarget &Subtarget) {
34751 StoreSDNode *St = cast<StoreSDNode>(N);
34752 EVT VT = St->getValue().getValueType();
34753 EVT StVT = St->getMemoryVT();
34755 SDValue StoredVal = St->getOperand(1);
34756 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34758 // If we are saving a concatenation of two XMM registers and 32-byte stores
34759 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
34761 unsigned AddressSpace = St->getAddressSpace();
34762 unsigned Alignment = St->getAlignment();
34763 if (VT.is256BitVector() && StVT == VT &&
34764 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
34765 AddressSpace, Alignment, &Fast) &&
34767 unsigned NumElems = VT.getVectorNumElements();
34771 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
34772 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
34774 SDValue Ptr0 = St->getBasePtr();
34775 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
34778 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
34779 Alignment, St->getMemOperand()->getFlags());
34781 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
34782 St->getPointerInfo().getWithOffset(16),
34783 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
34784 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
34787 // Optimize trunc store (of multiple scalars) to shuffle and store.
34788 // First, pack all of the elements in one place. Next, store to memory
34789 // in fewer chunks.
34790 if (St->isTruncatingStore() && VT.isVector()) {
34791 // Check if we can detect an AVG pattern from the truncation. If yes,
34792 // replace the trunc store by a normal store with the result of X86ISD::AVG
34794 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
34796 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
34797 St->getPointerInfo(), St->getAlignment(),
34798 St->getMemOperand()->getFlags());
34800 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34802 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
34804 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
34805 dl, Val, St->getBasePtr(),
34806 St->getMemoryVT(), St->getMemOperand(), DAG);
34808 unsigned NumElems = VT.getVectorNumElements();
34809 assert(StVT != VT && "Cannot truncate to the same type");
34810 unsigned FromSz = VT.getScalarSizeInBits();
34811 unsigned ToSz = StVT.getScalarSizeInBits();
34813 // The truncating store is legal in some cases. For example
34814 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
34815 // are designated for truncate store.
34816 // In this case we don't need any further transformations.
34817 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
34820 // From, To sizes and ElemCount must be pow of two
34821 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
34822 // We are going to use the original vector elt for storing.
34823 // Accumulated smaller vector elements must be a multiple of the store size.
34824 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
34826 unsigned SizeRatio = FromSz / ToSz;
34828 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34830 // Create a type on which we perform the shuffle
34831 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34832 StVT.getScalarType(), NumElems*SizeRatio);
34834 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34836 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
34837 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
34838 for (unsigned i = 0; i != NumElems; ++i)
34839 ShuffleVec[i] = i * SizeRatio;
34841 // Can't shuffle using an illegal type.
34842 if (!TLI.isTypeLegal(WideVecVT))
34845 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34846 DAG.getUNDEF(WideVecVT),
    // At this point all of the data is stored at the bottom of the
    // register. We now need to save it to memory.
34851 // Find the largest store unit
34852 MVT StoreType = MVT::i8;
34853 for (MVT Tp : MVT::integer_valuetypes()) {
34854 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
34858 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
34859 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
34860 (64 <= NumElems * ToSz))
34861 StoreType = MVT::f64;
34863 // Bitcast the original vector into a vector of store-size units
34864 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
34865 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
34866 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
34867 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
34868 SmallVector<SDValue, 8> Chains;
34869 SDValue Ptr = St->getBasePtr();
34871 // Perform one or more big stores into memory.
34872 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
34873 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
34874 StoreType, ShuffWide,
34875 DAG.getIntPtrConstant(i, dl));
34877 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
34878 St->getAlignment(), St->getMemOperand()->getFlags());
34879 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
34880 Chains.push_back(Ch);
34883 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
34886 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
34887 // the FP state in cases where an emms may be missing.
34888 // A preferable solution to the general problem is to figure out the right
34889 // places to insert EMMS. This qualifies as a quick hack.
34891 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
34892 if (VT.getSizeInBits() != 64)
34895 const Function &F = DAG.getMachineFunction().getFunction();
  bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
  bool F64IsLegal =
      !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
34899 if ((VT.isVector() ||
34900 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
34901 isa<LoadSDNode>(St->getValue()) &&
34902 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
34903 St->getChain().hasOneUse() && !St->isVolatile()) {
34904 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
34905 SmallVector<SDValue, 8> Ops;
34907 if (!ISD::isNormalLoad(Ld))
34910 // If this is not the MMX case, i.e. we are just turning i64 load/store
34911 // into f64 load/store, avoid the transformation if there are multiple
34912 // uses of the loaded value.
34913 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
34918 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
34919 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
34921 if (Subtarget.is64Bit() || F64IsLegal) {
34922 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
34923 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
34924 Ld->getMemOperand());
34926 // Make sure new load is placed in same chain order.
34927 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
34928 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
34929 St->getMemOperand());
34932 // Otherwise, lower to two pairs of 32-bit loads / stores.
34933 SDValue LoAddr = Ld->getBasePtr();
34934 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
34936 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
34937 Ld->getPointerInfo(), Ld->getAlignment(),
34938 Ld->getMemOperand()->getFlags());
34939 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
34940 Ld->getPointerInfo().getWithOffset(4),
34941 MinAlign(Ld->getAlignment(), 4),
34942 Ld->getMemOperand()->getFlags());
34943 // Make sure new loads are placed in same chain order.
34944 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
34945 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
34947 LoAddr = St->getBasePtr();
34948 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
34951 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
34952 St->getAlignment(), St->getMemOperand()->getFlags());
34953 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
34954 St->getPointerInfo().getWithOffset(4),
34955 MinAlign(St->getAlignment(), 4),
34956 St->getMemOperand()->getFlags());
34957 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
34960 // This is similar to the above case, but here we handle a scalar 64-bit
34961 // integer store that is extracted from a vector on a 32-bit target.
34962 // If we have SSE2, then we can treat it like a floating-point double
34963 // to get past legalization. The execution dependencies fixup pass will
34964 // choose the optimal machine instruction for the store if this really is
34965 // an integer or v2f32 rather than an f64.
34966 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
34967 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
34968 SDValue OldExtract = St->getOperand(1);
34969 SDValue ExtOp0 = OldExtract.getOperand(0);
34970 unsigned VecSize = ExtOp0.getValueSizeInBits();
34971 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
34972 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
34973 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
34974 BitCast, OldExtract.getOperand(1));
34975 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
34976 St->getPointerInfo(), St->getAlignment(),
34977 St->getMemOperand()->getFlags());
34983 /// Return 'true' if this vector operation is "horizontal"
34984 /// and return the operands for the horizontal operation in LHS and RHS. A
34985 /// horizontal operation performs the binary operation on successive elements
34986 /// of its first operand, then on successive elements of its second operand,
34987 /// returning the resulting values in a vector. For example, if
34988 /// A = < float a0, float a1, float a2, float a3 >
34990 /// B = < float b0, float b1, float b2, float b3 >
34991 /// then the result of doing a horizontal operation on A and B is
34992 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
34993 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
34994 /// A horizontal-op B, for some already available A and B, and if so then LHS is
34995 /// set to A, RHS to B, and the routine returns 'true'.
34996 /// Note that the binary operation should have the property that if one of the
34997 /// operands is UNDEF then the result is UNDEF.
34998 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
34999 // Look for the following pattern: if
35000 // A = < float a0, float a1, float a2, float a3 >
35001 // B = < float b0, float b1, float b2, float b3 >
35003 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
35004 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
35005 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
35006 // which is A horizontal-op B.
35008 // At least one of the operands should be a vector shuffle.
35009 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
35010 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
35013 MVT VT = LHS.getSimpleValueType();
35015 assert((VT.is128BitVector() || VT.is256BitVector()) &&
35016 "Unsupported vector type for horizontal add/sub");
35018 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
35019 // operate independently on 128-bit lanes.
35020 unsigned NumElts = VT.getVectorNumElements();
35021 unsigned NumLanes = VT.getSizeInBits()/128;
35022 unsigned NumLaneElts = NumElts / NumLanes;
35023 assert((NumLaneElts % 2 == 0) &&
35024 "Vector type should have an even number of elements in each lane");
35025 unsigned HalfLaneElts = NumLaneElts/2;
35027 // View LHS in the form
35028 // LHS = VECTOR_SHUFFLE A, B, LMask
35029 // If LHS is not a shuffle then pretend it is the shuffle
35030 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
35031 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
35034 SmallVector<int, 16> LMask(NumElts);
35035 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35036 if (!LHS.getOperand(0).isUndef())
35037 A = LHS.getOperand(0);
35038 if (!LHS.getOperand(1).isUndef())
35039 B = LHS.getOperand(1);
35040 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
35041 std::copy(Mask.begin(), Mask.end(), LMask.begin());
35043 if (!LHS.isUndef())
35045 for (unsigned i = 0; i != NumElts; ++i)
35049 // Likewise, view RHS in the form
35050 // RHS = VECTOR_SHUFFLE C, D, RMask
35052 SmallVector<int, 16> RMask(NumElts);
35053 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35054 if (!RHS.getOperand(0).isUndef())
35055 C = RHS.getOperand(0);
35056 if (!RHS.getOperand(1).isUndef())
35057 D = RHS.getOperand(1);
35058 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
35059 std::copy(Mask.begin(), Mask.end(), RMask.begin());
35061 if (!RHS.isUndef())
35063 for (unsigned i = 0; i != NumElts; ++i)
35067 // Check that the shuffles are both shuffling the same vectors.
35068 if (!(A == C && B == D) && !(A == D && B == C))
35071 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
35072 if (!A.getNode() && !B.getNode())
35075 // If A and B occur in reverse order in RHS, then "swap" them (which means
35076 // rewriting the mask).
35078 ShuffleVectorSDNode::commuteMask(RMask);
35080 // At this point LHS and RHS are equivalent to
35081 // LHS = VECTOR_SHUFFLE A, B, LMask
35082 // RHS = VECTOR_SHUFFLE A, B, RMask
35083 // Check that the masks correspond to performing a horizontal operation.
35084 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
35085 for (unsigned i = 0; i != NumLaneElts; ++i) {
35086 int LIdx = LMask[i+l], RIdx = RMask[i+l];
35088 // Ignore any UNDEF components.
35089 if (LIdx < 0 || RIdx < 0 ||
35090 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
35091 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
35094 // Check that successive elements are being operated on. If not, this is
35095 // not a horizontal operation.
35096 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
35097 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
35098 if (!(LIdx == Index && RIdx == Index + 1) &&
35099 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
35104 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
35105 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
35109 /// Do target-specific dag combines on floating-point adds/subs.
35110 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
35111 const X86Subtarget &Subtarget) {
35112 EVT VT = N->getValueType(0);
35113 SDValue LHS = N->getOperand(0);
35114 SDValue RHS = N->getOperand(1);
35115 bool IsFadd = N->getOpcode() == ISD::FADD;
35116 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
35118 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
35119 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
35120 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
35121 isHorizontalBinOp(LHS, RHS, IsFadd)) {
35122 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
35123 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
35128 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
35130 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
35131 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
35132 const X86Subtarget &Subtarget,
35134 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
35135 SDValue Src = N->getOperand(0);
35136 unsigned Opcode = Src.getOpcode();
35137 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35139 EVT VT = N->getValueType(0);
35140 EVT SrcVT = Src.getValueType();
35142 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
35143 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
35145 // Repeated operand, so we are only trading one output truncation for
35146 // one input truncation.
35150 // See if either operand has been extended from a smaller/equal size to
35151 // the truncation size, allowing a truncation to combine with the extend.
35152 unsigned Opcode0 = Op0.getOpcode();
35153 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
35154 Opcode0 == ISD::ZERO_EXTEND) &&
35155 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35158 unsigned Opcode1 = Op1.getOpcode();
35159 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
35160 Opcode1 == ISD::ZERO_EXTEND) &&
35161 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35164 // See if either operand is a single use constant which can be constant
35166 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
35167 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
35168 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
35169 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
35172 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
35173 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
35174 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
35175 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
35178 // Don't combine if the operation has other uses.
35179 if (!N->isOnlyUserOf(Src.getNode()))
35182 // Only support vector truncation for now.
35183 // TODO: i64 scalar math would benefit as well.
35184 if (!VT.isVector())
35187 // In most cases its only worth pre-truncating if we're only facing the cost
35188 // of one truncation.
35189 // i.e. if one of the inputs will constant fold or the input is repeated.
35194 SDValue Op0 = Src.getOperand(0);
35195 SDValue Op1 = Src.getOperand(1);
35196 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
35197 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35198 return TruncateArithmetic(Op0, Op1);
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) -
    // it's better to truncate if we have the chance.
35205 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
35206 !Subtarget.hasDQI())
35207 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
35210 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
35211 SDValue Op0 = Src.getOperand(0);
35212 SDValue Op1 = Src.getOperand(1);
35213 if (TLI.isOperationLegal(Opcode, VT) &&
35214 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35215 return TruncateArithmetic(Op0, Op1);
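    // For example (illustrative):
    //   trunc(add(zext(v8i16 X to v8i32), zext(v8i16 Y to v8i32))) to v8i16
    // becomes add(trunc(zext X), trunc(zext Y)); the trunc-of-zext pairs then
    // fold away, leaving a single v8i16 add of X and Y.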
35223 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
35225 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
35226 SmallVector<SDValue, 8> &Regs) {
35227 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
35228 Regs[0].getValueType() == MVT::v2i64));
35229 EVT OutVT = N->getValueType(0);
35230 EVT OutSVT = OutVT.getVectorElementType();
35231 EVT InVT = Regs[0].getValueType();
35232 EVT InSVT = InVT.getVectorElementType();
35235 // First, use mask to unset all bits that won't appear in the result.
35236 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
35237 "OutSVT can only be either i8 or i16.");
35239 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
35240 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
35241 for (auto &Reg : Regs)
35242 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
35244 MVT UnpackedVT, PackedVT;
35245 if (OutSVT == MVT::i8) {
35246 UnpackedVT = MVT::v8i16;
35247 PackedVT = MVT::v16i8;
35249 UnpackedVT = MVT::v4i32;
35250 PackedVT = MVT::v8i16;
  // In each iteration, halve the element size of the type.
35254 auto RegNum = Regs.size();
35255 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
35256 j < e; j *= 2, RegNum /= 2) {
35257 for (unsigned i = 0; i < RegNum; i++)
35258 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
35259 for (unsigned i = 0; i < RegNum / 2; i++)
35260 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
  // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
  // and then extract a subvector as the result since v8i8 is not a legal type.
35266 if (OutVT == MVT::v8i8) {
35267 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
35268 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
35269 DAG.getIntPtrConstant(0, DL));
35271 } else if (RegNum > 1) {
35272 Regs.resize(RegNum);
35273 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
35278 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
35280 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
35282 SmallVector<SDValue, 8> &Regs) {
35283 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
35284 EVT OutVT = N->getValueType(0);
35287 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
35288 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
35289 for (auto &Reg : Regs) {
35290 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
35292 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
35296 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
35297 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
35300 if (Regs.size() > 2) {
35301 Regs.resize(Regs.size() / 2);
35302 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
35307 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
35308 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
35309 /// legalization the truncation will be translated into a BUILD_VECTOR whose
35310 /// elements are each extracted from a vector and then truncated, and it is
35311 /// difficult to perform this optimization in that form.
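// Illustrative sketch (assumed types, not taken from this file): on SSE2
// without AVX2,
//   trunc (v16i32 X) to v16i8
// is rebuilt here as a small tree of X86ISD::PACKUS nodes over the 128-bit
// pieces of X instead of sixteen scalar extract+truncate operations.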
35312 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
35313 const X86Subtarget &Subtarget) {
35314 EVT OutVT = N->getValueType(0);
35315 if (!OutVT.isVector())
35318 SDValue In = N->getOperand(0);
35319 if (!In.getValueType().isSimple())
35322 EVT InVT = In.getValueType();
35323 unsigned NumElems = OutVT.getVectorNumElements();
35325 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
35326 // SSE2, and we need to take care of it specially.
35327 // AVX512 provides vpmovdb.
35328 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
35331 EVT OutSVT = OutVT.getVectorElementType();
35332 EVT InSVT = InVT.getVectorElementType();
35333 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
35334 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
35338 // SSSE3's pshufb results in fewer instructions in the cases below.
35339 if (Subtarget.hasSSSE3() && NumElems == 8 &&
35340 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
35341 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
35346 // Split a long vector into vectors of legal type.
35347 unsigned RegNum = InVT.getSizeInBits() / 128;
35348 SmallVector<SDValue, 8> SubVec(RegNum);
35349 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
35350 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
35352 for (unsigned i = 0; i < RegNum; i++)
35353 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
35354 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
35356 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
35357 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
35358 // truncate 2 x v4i32 to v8i16.
35359 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
35360 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
35361 else if (InSVT == MVT::i32)
35362 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
35367 /// This function transforms a vector truncation of 'extended sign-bits' or
35368 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
35369 /// X86ISD::PACKSS/PACKUS operations.
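// Illustrative sketch (assumed pattern, not taken from this file): a compare
// result such as
//   trunc (v8i32 (setcc ...)) to v8i16
// has 32 sign bits per element, so it can be emitted as X86ISD::PACKSS over
// the two v4i32 halves instead of a generic truncation.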
35370 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
35372 const X86Subtarget &Subtarget) {
35373 // Requires SSE2 but AVX512 has fast truncate.
35374 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
35377 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
35380 SDValue In = N->getOperand(0);
35381 if (!In.getValueType().isSimple())
35384 MVT VT = N->getValueType(0).getSimpleVT();
35385 MVT SVT = VT.getScalarType();
35387 MVT InVT = In.getValueType().getSimpleVT();
35388 MVT InSVT = InVT.getScalarType();
35390 // Check we have a truncation suited for PACKSS/PACKUS.
35391 if (!VT.is128BitVector() && !VT.is256BitVector())
35393 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
35395 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
35398 // Use PACKSS if the input has sign-bits that extend all the way to the
35399 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
35400 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
35401 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
35402 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
35403 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
35405 // Use PACKUS if the input has zero-bits that extend all the way to the
35406 // packed/truncated value. e.g. masks, zext_in_reg, etc.
35408 DAG.computeKnownBits(In, Known);
35409 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
35410 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
35411 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
35412 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
35417 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
35418 const X86Subtarget &Subtarget) {
35419 EVT VT = N->getValueType(0);
35420 SDValue Src = N->getOperand(0);
35423 // Attempt to pre-truncate inputs to arithmetic ops instead.
35424 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
35427 // Try to detect AVG pattern first.
35428 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
35431 // Try to combine truncation with signed/unsigned saturation.
35432 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
35435 // The bitcast source is a direct mmx result.
35436 // Detect a truncation to i32 of a bitcast from x86mmx.
35437 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
35438 SDValue BCSrc = Src.getOperand(0);
35439 if (BCSrc.getValueType() == MVT::x86mmx)
35440 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
35443 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
35444 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
35447 return combineVectorTruncation(N, DAG, Subtarget);
35450 /// Returns the negated value if the node \p N flips the sign of an FP value.
35452 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
35453 /// AVX512F does not have FXOR, so FNEG is lowered as
35454 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
35455 /// In this case we go through all bitcasts.
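// Illustrative sketch (assumed constants, not taken from this file): the
// AVX512F form
//   (v4f32 bitcast (xor (v4i32 bitcast X), splat 0x80000000))
// is recognized here as FNEG(X), since xor with the sign mask flips only the
// sign bit of each element.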
35456 static SDValue isFNEG(SDNode *N) {
35457 if (N->getOpcode() == ISD::FNEG)
35458 return N->getOperand(0);
35460 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
35461 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
35464 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
35465 if (!Op1.getValueType().isFloatingPoint())
35468 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
35470 unsigned EltBits = Op1.getScalarValueSizeInBits();
35471 auto isSignMask = [&](const ConstantFP *C) {
35472 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
35475 // There is more than one way to represent the same constant on
35476 // different X86 targets. The type of the node may also depend on its size.
35477 // - load scalar value and broadcast
35478 // - BUILD_VECTOR node
35479 // - load from a constant pool.
35480 // We check all variants here.
35481 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
35482 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
35483 if (isSignMask(cast<ConstantFP>(C)))
35486 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
35487 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
35488 if (isSignMask(CN->getConstantFPValue()))
35491 } else if (auto *C = getTargetConstantFromNode(Op1)) {
35492 if (C->getType()->isVectorTy()) {
35493 if (auto *SplatV = C->getSplatValue())
35494 if (isSignMask(cast<ConstantFP>(SplatV)))
35496 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
35497 if (isSignMask(FPConst))
35503 /// Do target-specific dag combines on floating point negations.
35504 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
35505 const X86Subtarget &Subtarget) {
35506 EVT OrigVT = N->getValueType(0);
35507 SDValue Arg = isFNEG(N);
35508 assert(Arg.getNode() && "N is expected to be an FNEG node");
35510 EVT VT = Arg.getValueType();
35511 EVT SVT = VT.getScalarType();
35514 // Let legalize expand this if it isn't a legal type yet.
35515 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
35518 // If we're negating a FMUL node on a target with FMA, then we can avoid the
35519 // use of a constant by performing (-0 - A*B) instead.
35520 // FIXME: Check rounding control flags as well once it becomes available.
35521 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
35522 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
35523 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
35524 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
35525 Arg.getOperand(1), Zero);
35526 return DAG.getBitcast(OrigVT, NewNode);
35529 // If we're negating an FMA node, then we can adjust the
35530 // instruction to include the extra negation.
35531 unsigned NewOpcode = 0;
35532 if (Arg.hasOneUse()) {
35533 switch (Arg.getOpcode()) {
35534 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
35535 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
35536 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
35537 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
35538 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
35539 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
35540 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
35541 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
35542 // We can't handle a scalar intrinsic node here because it would only
35543 // invert one element and not the whole vector. But we could try to handle
35544 // a negation of the lower element only.
35548 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
35549 Arg.getNode()->ops()));
35554 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
35555 const X86Subtarget &Subtarget) {
35556 MVT VT = N->getSimpleValueType(0);
35557 // If we have integer vector types available, use the integer opcodes.
35558 if (VT.isVector() && Subtarget.hasSSE2()) {
35561 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
35563 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
35564 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
35565 unsigned IntOpcode;
35566 switch (N->getOpcode()) {
35567 default: llvm_unreachable("Unexpected FP logic op");
35568 case X86ISD::FOR: IntOpcode = ISD::OR; break;
35569 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
35570 case X86ISD::FAND: IntOpcode = ISD::AND; break;
35571 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
35573 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
35574 return DAG.getBitcast(VT, IntOp);
35580 /// Fold xor (setcc cond, val), 1 --> setcc (inverted(cond), val)
35581 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
35582 if (N->getOpcode() != ISD::XOR)
35585 SDValue LHS = N->getOperand(0);
35586 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
35587 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
35590 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
35591 X86::CondCode(LHS->getConstantOperandVal(0)));
35593 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
35596 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
35597 TargetLowering::DAGCombinerInfo &DCI,
35598 const X86Subtarget &Subtarget) {
35599 // If this is SSE1 only convert to FXOR to avoid scalarization.
35600 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
35601 N->getValueType(0) == MVT::v4i32) {
35602 return DAG.getBitcast(
35603 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
35604 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
35605 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
35608 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
35611 if (DCI.isBeforeLegalizeOps())
35614 if (SDValue SetCC = foldXor1SetCC(N, DAG))
35617 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
35620 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35624 return combineFneg(N, DAG, Subtarget);
35629 static bool isNullFPScalarOrVectorConst(SDValue V) {
35630 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
35633 /// If a value is a scalar FP zero or a vector FP zero (potentially including
35634 /// undefined elements), return a zero constant that may be used to fold away
35635 /// that value. In the case of a vector, the returned constant will not contain
35636 /// undefined elements even if the input parameter does. This makes it suitable
35637 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
35638 /// an undef should not propagate.
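// Illustrative sketch (assumed values, not taken from this file): for
//   FAND (v4f32 <0.0, undef, 0.0, 0.0>), X
// this returns a fully-defined all-zeros v4f32, so the fold to zero does not
// let the undef lane leak into the result.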
35639 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
35640 const X86Subtarget &Subtarget) {
35641 if (!isNullFPScalarOrVectorConst(V))
35644 if (V.getValueType().isVector())
35645 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
35650 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
35651 const X86Subtarget &Subtarget) {
35652 SDValue N0 = N->getOperand(0);
35653 SDValue N1 = N->getOperand(1);
35654 EVT VT = N->getValueType(0);
35657 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
35658 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
35659 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
35660 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
35663 auto isAllOnesConstantFP = [](SDValue V) {
35664 if (V.getSimpleValueType().isVector())
35665 return ISD::isBuildVectorAllOnes(V.getNode());
35666 auto *C = dyn_cast<ConstantFPSDNode>(V);
35667 return C && C->getConstantFPValue()->isAllOnesValue();
35670 // fand (fxor X, -1), Y --> fandn X, Y
35671 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
35672 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
35674 // fand X, (fxor Y, -1) --> fandn Y, X
35675 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
35676 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
35681 /// Do target-specific dag combines on X86ISD::FAND nodes.
35682 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
35683 const X86Subtarget &Subtarget) {
35684 // FAND(0.0, x) -> 0.0
35685 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
35688 // FAND(x, 0.0) -> 0.0
35689 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35692 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
35695 return lowerX86FPLogicOp(N, DAG, Subtarget);
35698 /// Do target-specific dag combines on X86ISD::FANDN nodes.
35699 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
35700 const X86Subtarget &Subtarget) {
35701 // FANDN(0.0, x) -> x
35702 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35703 return N->getOperand(1);
35705 // FANDN(x, 0.0) -> 0.0
35706 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35709 return lowerX86FPLogicOp(N, DAG, Subtarget);
35712 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
35713 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
35714 const X86Subtarget &Subtarget) {
35715 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
35717 // F[X]OR(0.0, x) -> x
35718 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35719 return N->getOperand(1);
35721 // F[X]OR(x, 0.0) -> x
35722 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
35723 return N->getOperand(0);
35726 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
35729 return lowerX86FPLogicOp(N, DAG, Subtarget);
35732 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
35733 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
35734 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
35736 // Only perform optimizations if UnsafeMath is used.
35737 if (!DAG.getTarget().Options.UnsafeFPMath)
35740 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
35741 // into FMAXC and FMINC, which are commutative operations.
35742 unsigned NewOp = 0;
35743 switch (N->getOpcode()) {
35744 default: llvm_unreachable("unknown opcode");
35745 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
35746 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
35749 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
35750 N->getOperand(0), N->getOperand(1));
35753 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
35754 const X86Subtarget &Subtarget) {
35755 if (Subtarget.useSoftFloat())
35758 // TODO: Check for global or instruction-level "nnan". In that case, we
35759 // should be able to lower to FMAX/FMIN alone.
35760 // TODO: If an operand is already known to be a NaN or not a NaN, this
35761 // should be an optional swap and FMAX/FMIN.
35763 EVT VT = N->getValueType(0);
35764 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
35765 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
35766 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
35769 // This takes at least 3 instructions, so favor a library call when operating
35770 // on a scalar and minimizing code size.
35771 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
35774 SDValue Op0 = N->getOperand(0);
35775 SDValue Op1 = N->getOperand(1);
35777 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
35778 DAG.getDataLayout(), *DAG.getContext(), VT);
35780 // There are 4 possibilities involving NaN inputs, and these are the required
35781 // outputs:
35782 //                  Op1
35783 //              Num     NaN
35784 //           ----------------
35785 //     Num   |  Max  |  Op0 |
35786 // Op0       ----------------
35787 //     NaN   |  Op1  |  NaN |
35788 //           ----------------
35790 // The SSE FP max/min instructions were not designed for this case, but rather
35791 // to implement:
35792 // Min = Op1 < Op0 ? Op1 : Op0
35793 // Max = Op1 > Op0 ? Op1 : Op0
35795 // So they always return Op0 if either input is a NaN. However, we can still
35796 // use those instructions for fmaxnum by selecting away a NaN input.
35798 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
35799 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
35800 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
35801 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
35803 // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both operands
35804 // are NaN, the NaN value of Op1 is the result.
35805 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
35808 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
35809 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
35810 TargetLowering::DAGCombinerInfo &DCI,
35811 const X86Subtarget &Subtarget) {
35812 // ANDNP(0, x) -> x
35813 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
35814 return N->getOperand(1);
35816 // ANDNP(x, 0) -> 0
35817 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
35818 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
35820 EVT VT = N->getValueType(0);
35822 // Attempt to recursively combine a bitmask ANDNP with shuffles.
35823 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
35825 if (SDValue Res = combineX86ShufflesRecursively(
35826 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35827 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
35828 DCI.CombineTo(N, Res);
35836 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
35837 TargetLowering::DAGCombinerInfo &DCI) {
35838 SDValue N0 = N->getOperand(0);
35839 SDValue N1 = N->getOperand(1);
35841 // BT ignores high bits in the bit index operand.
35842 unsigned BitWidth = N1.getValueSizeInBits();
35843 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
35844 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
35845 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
35850 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
35851 const X86Subtarget &Subtarget) {
35852 EVT VT = N->getValueType(0);
35853 if (!VT.isVector())
35856 SDValue N0 = N->getOperand(0);
35857 SDValue N1 = N->getOperand(1);
35858 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
35861 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
35862 // both SSE and AVX2 since there is no sign-extended shift right
35863 // operation on a vector with 64-bit elements.
35864 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
35865 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
35866 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
35867 N0.getOpcode() == ISD::SIGN_EXTEND)) {
35868 SDValue N00 = N0.getOperand(0);
35870 // EXTLOAD has a better solution on AVX2:
35871 // it may be replaced with an X86ISD::VSEXT node.
35872 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
35873 if (!ISD::isNormalLoad(N00.getNode()))
35876 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
35877 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
35879 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
35885 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
35886 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
35887 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
35888 /// opportunities to combine math ops, use an LEA, or use a complex addressing
35889 /// mode. This can eliminate extend, add, and shift instructions.
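// Illustrative sketch (assumed values, not taken from this file): for
//   sext (add nsw i32 %x, 40) to i64
// whose result feeds further add/shl nodes (e.g. an address computation), the
// combine produces add (sext %x), 40, letting the extend, add and shift fold
// into an LEA or a complex addressing mode.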
35890 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
35891 const X86Subtarget &Subtarget) {
35892 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
35893 Ext->getOpcode() != ISD::ZERO_EXTEND)
35896 // TODO: This should be valid for other integer types.
35897 EVT VT = Ext->getValueType(0);
35898 if (VT != MVT::i64)
35901 SDValue Add = Ext->getOperand(0);
35902 if (Add.getOpcode() != ISD::ADD)
35905 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
35906 bool NSW = Add->getFlags().hasNoSignedWrap();
35907 bool NUW = Add->getFlags().hasNoUnsignedWrap();
35909 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
35910 // into the 'zext'.
35911 if ((Sext && !NSW) || (!Sext && !NUW))
35914 // Having a constant operand to the 'add' ensures that we are not increasing
35915 // the instruction count because the constant is extended for free below.
35916 // A constant operand can also become the displacement field of an LEA.
35917 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
35921 // Don't make the 'add' bigger if there's no hope of combining it with some
35922 // other 'add' or 'shl' instruction.
35923 // TODO: It may be profitable to generate simpler LEA instructions in place
35924 // of single 'add' instructions, but the cost model for selecting an LEA
35925 // currently has a high threshold.
35926 bool HasLEAPotential = false;
35927 for (auto *User : Ext->uses()) {
35928 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
35929 HasLEAPotential = true;
35933 if (!HasLEAPotential)
35936 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
35937 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
35938 SDValue AddOp0 = Add.getOperand(0);
35939 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
35940 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
35942 // The wider add is guaranteed to not wrap because both operands are
35943 // sign/zero-extended from a narrower type, so the wide sum cannot overflow.
35945 Flags.setNoSignedWrap(NSW);
35946 Flags.setNoUnsignedWrap(NUW);
35947 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
35950 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
35951 /// (i8,i32 ({s/u}divrem_{s/z}ext_hreg (i8 x, i8 y)))
35952 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
35953 /// extends from AH (which we otherwise need to do contortions to access).
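// Illustrative sketch (assumed IR, not taken from this file): for
//   %r = srem i8 %x, %y
//   %e = sext i8 %r to i32
// the remainder already lives in AH after the 8-bit divide, so the merged
// SDIVREM8_SEXT_HREG node sign-extends straight out of AH instead of first
// moving AH into a byte register and extending separately.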
35954 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
35955 SDValue N0 = N->getOperand(0);
35956 auto OpcodeN = N->getOpcode();
35957 auto OpcodeN0 = N0.getOpcode();
35958 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
35959 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
35962 EVT VT = N->getValueType(0);
35963 EVT InVT = N0.getValueType();
35964 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
35965 !(VT == MVT::i32 || VT == MVT::i64))
35968 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
35969 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
35970 : X86ISD::UDIVREM8_ZEXT_HREG;
35971 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
35973 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
35974 // If this was a 64-bit extend, complete it.
35975 if (VT == MVT::i64)
35976 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
35977 return R.getValue(1);
35980 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
35981 // operands and the result of CMOV is not used anywhere else - promote CMOV
35982 // itself instead of promoting its result. This could be beneficial, because:
35983 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
35984 // (or more) pseudo-CMOVs only when they go one-after-another and
35985 // getting rid of result extension code after CMOV will help that.
35986 // 2) Promotion of constant CMOV arguments is free, hence the
35987 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
35988 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
35989 // promotion is also good in terms of code-size.
35990 // (64-bit CMOV is 4 bytes, which is why we don't do a 32-bit => 64-bit
35991 // promotion.)
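// Illustrative sketch (assumed values, not taken from this file): for
//   zext (i16 (X86ISD::CMOV 7, 42, cond, eflags)) to i32
// the CMOV itself is widened to i32 with the constants 7 and 42 extended for
// free, and the zext disappears.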
35992 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
35993 SDValue CMovN = Extend->getOperand(0);
35994 if (CMovN.getOpcode() != X86ISD::CMOV)
35997 EVT TargetVT = Extend->getValueType(0);
35998 unsigned ExtendOpcode = Extend->getOpcode();
36001 EVT VT = CMovN.getValueType();
36002 SDValue CMovOp0 = CMovN.getOperand(0);
36003 SDValue CMovOp1 = CMovN.getOperand(1);
36005 bool DoPromoteCMOV =
36006 (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
36007 CMovN.hasOneUse() &&
36008 (isa<ConstantSDNode>(CMovOp0.getNode()) &&
36009 isa<ConstantSDNode>(CMovOp1.getNode()));
36011 if (!DoPromoteCMOV)
36014 CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
36015 CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
36017 return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
36018 CMovN.getOperand(2), CMovN.getOperand(3));
36021 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
36022 // This is more or less the reverse of combineBitcastvxi1.
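// Illustrative sketch (assumed types, not taken from this file): for
//   sext (v8i1 bitcast (i8 %mask)) to v8i16
// the i8 mask is any-extended and broadcast to every v8i16 lane, each lane is
// ANDed with its bit (1 << lane), compared for equality against that bit, and
// the compare result is sign-extended, avoiding a v8i1 value entirely.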
36024 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
36025 TargetLowering::DAGCombinerInfo &DCI,
36026 const X86Subtarget &Subtarget) {
36027 unsigned Opcode = N->getOpcode();
36028 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
36029 Opcode != ISD::ANY_EXTEND)
36031 if (!DCI.isBeforeLegalizeOps())
36033 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36036 SDValue N0 = N->getOperand(0);
36037 EVT VT = N->getValueType(0);
36038 EVT SVT = VT.getScalarType();
36039 EVT InSVT = N0.getValueType().getScalarType();
36040 unsigned EltSizeInBits = SVT.getSizeInBits();
36042 // Input type must be extending a bool vector (bit-casted from a scalar
36043 // integer) to legal integer types.
36044 if (!VT.isVector())
36046 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
36048 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
36051 SDValue N00 = N0.getOperand(0);
36052 EVT SclVT = N0.getOperand(0).getValueType();
36053 if (!SclVT.isScalarInteger())
36058 SmallVector<int, 32> ShuffleMask;
36059 unsigned NumElts = VT.getVectorNumElements();
36060 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
36062 // Broadcast the scalar integer to the vector elements.
36063 if (NumElts > EltSizeInBits) {
36064 // If the scalar integer is greater than the vector element size, then we
36065 // must split it down into sub-sections for broadcasting. For example:
36066 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
36067 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
36068 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
36069 unsigned Scale = NumElts / EltSizeInBits;
36071 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
36072 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
36073 Vec = DAG.getBitcast(VT, Vec);
36075 for (unsigned i = 0; i != Scale; ++i)
36076 ShuffleMask.append(EltSizeInBits, i);
36078 // For a smaller scalar integer, we can simply any-extend it to the vector
36079 // element size (we don't care about the upper bits) and broadcast it to all
36080 // elements.
36081 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
36082 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
36083 ShuffleMask.append(NumElts, 0);
36085 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
36087 // Now, mask the relevant bit in each element.
36088 SmallVector<SDValue, 32> Bits;
36089 for (unsigned i = 0; i != NumElts; ++i) {
36090 int BitIdx = (i % EltSizeInBits);
36091 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
36092 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
36094 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
36095 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
36097 // Compare against the bitmask and extend the result.
36098 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
36099 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
36100 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
36102 // For SEXT, this is now done; otherwise shift the result down for
36103 // zero extension.
36104 if (Opcode == ISD::SIGN_EXTEND)
36106 return DAG.getNode(ISD::SRL, DL, VT, Vec,
36107 DAG.getConstant(EltSizeInBits - 1, DL, VT));
36110 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
36111 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
36112 /// UNDEFs) the input into vectors of the same size as the target type, which
36113 /// then extend the lowest elements.
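// Illustrative sketch (assumed types, not taken from this file): for
//   sext (v4i16 X) to v4i32
// X is concatenated with UNDEF up to 128 bits and extended with
// SIGN_EXTEND_VECTOR_INREG, which SSE4.1 can select as a single PMOVSXWD.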
36114 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
36115 TargetLowering::DAGCombinerInfo &DCI,
36116 const X86Subtarget &Subtarget) {
36117 unsigned Opcode = N->getOpcode();
36118 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
36120 if (!DCI.isBeforeLegalizeOps())
36122 if (!Subtarget.hasSSE2())
36125 SDValue N0 = N->getOperand(0);
36126 EVT VT = N->getValueType(0);
36127 EVT SVT = VT.getScalarType();
36128 EVT InVT = N0.getValueType();
36129 EVT InSVT = InVT.getScalarType();
36131 // Input type must be a vector and we must be extending legal integer types.
36132 if (!VT.isVector())
36134 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
36136 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
36139 // On AVX2+ targets, if the input/output types are both legal then we will be
36140 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
36141 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
36142 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
36147 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
36148 EVT InVT = N.getValueType();
36149 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
36150 Size / InVT.getScalarSizeInBits());
36151 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
36152 DAG.getUNDEF(InVT));
36154 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
36157 // If the target size is less than 128 bits, widen the input so that its
36158 // extension is 128 bits, extend that, and extract the original target vector.
36159 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
36160 unsigned Scale = 128 / VT.getSizeInBits();
36162 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
36163 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
36164 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
36165 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
36166 DAG.getIntPtrConstant(0, DL));
36169 // If the target size is 128 bits (or 256 bits on an AVX2 target), then convert
36170 // to ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
36171 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
36172 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
36173 (VT.is256BitVector() && Subtarget.hasInt256()) ||
36174 (VT.is512BitVector() && Subtarget.hasAVX512())) {
36175 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
36176 return Opcode == ISD::SIGN_EXTEND
36177 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
36178 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
36181 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
36182 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
36183 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
36184 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
36185 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
36187 SmallVector<SDValue, 8> Opnds;
36188 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
36189 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
36190 DAG.getIntPtrConstant(Offset, DL));
36191 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
36192 SrcVec = Opcode == ISD::SIGN_EXTEND
36193 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
36194 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
36195 Opnds.push_back(SrcVec);
36197 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
36200 // On pre-AVX2 targets, split into 128-bit nodes of
36201 // ISD::*_EXTEND_VECTOR_INREG.
36202 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
36203 return SplitAndExtendInReg(128);
36205 // On pre-AVX512 targets, split into 256-bit nodes of
36206 // ISD::*_EXTEND_VECTOR_INREG.
36207 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
36208 return SplitAndExtendInReg(256);
36213 // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm
36214 // result type.
36215 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
36216 const X86Subtarget &Subtarget) {
36217 SDValue N0 = N->getOperand(0);
36218 EVT VT = N->getValueType(0);
36221 // Only handle sext/aext for now.
36222 if (N->getOpcode() != ISD::SIGN_EXTEND && N->getOpcode() != ISD::ANY_EXTEND)
36225 // Only do this combine with AVX512 for vector extends.
36226 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
36229 // Only combine legal element types.
36230 EVT SVT = VT.getVectorElementType();
36231 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
36232 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
36235 // We can only do this if the vector size is 256 bits or less.
36236 unsigned Size = VT.getSizeInBits();
36240 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
36241 // those are the only integer compares we have.
36242 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
36243 if (ISD::isUnsignedIntSetCC(CC))
36246 // Only do this combine if the extension will be fully consumed by the setcc.
36247 EVT N00VT = N0.getOperand(0).getValueType();
36248 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
36249 if (Size != MatchingVecType.getSizeInBits())
36252 return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
36255 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
36256 TargetLowering::DAGCombinerInfo &DCI,
36257 const X86Subtarget &Subtarget) {
36258 SDValue N0 = N->getOperand(0);
36259 EVT VT = N->getValueType(0);
36260 EVT InVT = N0.getValueType();
36263 if (SDValue DivRem8 = getDivRem8(N, DAG))
36266 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36269 if (!DCI.isBeforeLegalizeOps())
36272 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
36275 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
36276 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
36277 // Inverting and sign-extending a boolean is the same as zero-extending and
36278 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
36279 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
36280 // sext (xor Bool, -1) --> sub (zext Bool), 1
36281 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
36282 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
36285 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36288 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36292 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
36295 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
36301 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
36302 const X86Subtarget &Subtarget) {
36303 // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
36305 EVT VT = N->getValueType(0);
36307 // Let legalize expand this if it isn't a legal type yet.
36308 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36311 EVT ScalarVT = VT.getScalarType();
36312 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
36315 SDValue A = N->getOperand(0);
36316 SDValue B = N->getOperand(1);
36317 SDValue C = N->getOperand(2);
36319 auto invertIfNegative = [](SDValue &V) {
36320 if (SDValue NegVal = isFNEG(V.getNode())) {
36327 // Do not convert the passthru input of scalar intrinsics.
36328 // FIXME: We could allow negations of the lower element only.
36329 bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
36330 N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
36331 bool NegB = invertIfNegative(B);
36332 bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
36333 N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
36335 // Negative multiplication when NegA xor NegB
36336 bool NegMul = (NegA != NegB);
36337 bool HasNeg = NegA || NegB || NegC;
36339 unsigned NewOpcode;
36341 NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
36343 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
36345 // For FMA, we risk reconstructing the node we started with.
36346 // In order to avoid this, we check for negation or opcode change. If
36347 // one of the two happened, then it is a new node and we return it.
36348 if (N->getOpcode() == ISD::FMA) {
36349 if (HasNeg || NewOpcode != N->getOpcode())
36350 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
36354 if (N->getOpcode() == X86ISD::FMADD_RND) {
36355 switch (NewOpcode) {
36356 case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
36357 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
36358 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
36359 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
36361 } else if (N->getOpcode() == X86ISD::FMADDS1) {
36362 switch (NewOpcode) {
36363 case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
36364 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
36365 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
36366 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
36368 } else if (N->getOpcode() == X86ISD::FMADDS3) {
36369 switch (NewOpcode) {
36370 case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
36371 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
36372 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
36373 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
36375 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
36376 switch (NewOpcode) {
36377 case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
36378 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
36379 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
36380 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
36382 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
36383 switch (NewOpcode) {
36384 case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
36385 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
36386 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
36387 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
36389 } else if (N->getOpcode() == X86ISD::FMADD4S) {
36390 switch (NewOpcode) {
36391 case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
36392 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
36393 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
36394 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
36397 llvm_unreachable("Unexpected opcode!");
36400 // Only return the node if the opcode was changed or one of the
36401 // operands was negated. If not, we'll just recreate the same node.
36402 if (HasNeg || NewOpcode != N->getOpcode()) {
36403 if (N->getNumOperands() == 4)
36404 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
36405 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
36411 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
36412 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
36413 const X86Subtarget &Subtarget) {
36415 EVT VT = N->getValueType(0);
36417 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
36421 unsigned NewOpcode;
36422 switch (N->getOpcode()) {
36423 default: llvm_unreachable("Unexpected opcode!");
36424 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
36425 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
36426 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
36427 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
36430 if (N->getNumOperands() == 4)
36431 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
36432 NegVal, N->getOperand(3));
36433 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
36437 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
36438 TargetLowering::DAGCombinerInfo &DCI,
36439 const X86Subtarget &Subtarget) {
36440 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
36441 // (and (i32 x86isd::setcc_carry), 1)
36442 // This eliminates the zext. This transformation is necessary because
36443 // ISD::SETCC is always legalized to i8.
36445 SDValue N0 = N->getOperand(0);
36446 EVT VT = N->getValueType(0);
36448 if (N0.getOpcode() == ISD::AND &&
36450 N0.getOperand(0).hasOneUse()) {
36451 SDValue N00 = N0.getOperand(0);
36452 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
36453 if (!isOneConstant(N0.getOperand(1)))
36455 return DAG.getNode(ISD::AND, dl, VT,
36456 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
36457 N00.getOperand(0), N00.getOperand(1)),
36458 DAG.getConstant(1, dl, VT));
36462 if (N0.getOpcode() == ISD::TRUNCATE &&
36464 N0.getOperand(0).hasOneUse()) {
36465 SDValue N00 = N0.getOperand(0);
36466 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
36467 return DAG.getNode(ISD::AND, dl, VT,
36468 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
36469 N00.getOperand(0), N00.getOperand(1)),
36470 DAG.getConstant(1, dl, VT));
36474 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36477 if (DCI.isBeforeLegalizeOps())
36478 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
36481 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36484 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36488 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
36491 if (SDValue DivRem8 = getDivRem8(N, DAG))
36494 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
36497 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
36503 /// Try to map a 128-bit or larger integer comparison to vector instructions
36504 /// before type legalization splits it up into chunks.
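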
36505 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
36506 const X86Subtarget &Subtarget) {
36507 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
36508 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
36510 // We're looking for an oversized integer equality comparison.
36511 SDValue X = SetCC->getOperand(0);
36512 SDValue Y = SetCC->getOperand(1);
36513 EVT OpVT = X.getValueType();
36514 unsigned OpSize = OpVT.getSizeInBits();
36515 if (!OpVT.isScalarInteger() || OpSize < 128)
36518 // Ignore a comparison with zero because that gets special treatment in
36519 // EmitTest(). But make an exception for the special case of a pair of
36520 // logically-combined vector-sized operands compared to zero. This pattern may
36521 // be generated by the memcmp expansion pass with oversized integer compares
36523 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
36524 X.getOperand(0).getOpcode() == ISD::XOR &&
36525 X.getOperand(1).getOpcode() == ISD::XOR;
36526 if (isNullConstant(Y) && !IsOrXorXorCCZero)
36529 // Bail out if we know that this is not really just an oversized integer.
36530 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
36531 peekThroughBitcasts(Y).getValueType() == MVT::f128)
36534 // TODO: Use PXOR + PTEST for SSE4.1 or later?
36535 // TODO: Add support for AVX-512.
36536 EVT VT = SetCC->getValueType(0);
36538 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
36539 (OpSize == 256 && Subtarget.hasAVX2())) {
36540 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
36542 if (IsOrXorXorCCZero) {
36543 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
36544 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
36545 // Use 2 vector equality compares and 'and' the results before doing a
36546 // MOVMSK.
36547 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
36548 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
36549 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
36550 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
36551 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
36552 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
36553 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
36555 SDValue VecX = DAG.getBitcast(VecVT, X);
36556 SDValue VecY = DAG.getBitcast(VecVT, Y);
36557 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
36559 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
36560 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
36561 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
36562 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
36563 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
36564 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
36565 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
36567 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
36573 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
36574 const X86Subtarget &Subtarget) {
36575 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
36576 SDValue LHS = N->getOperand(0);
36577 SDValue RHS = N->getOperand(1);
36578 EVT VT = N->getValueType(0);
36579 EVT OpVT = LHS.getValueType();
36582 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
36583 // 0-x == y --> x+y == 0
36584 // 0-x != y --> x+y != 0
36585 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
36587 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
36588 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36590 // x == 0-y --> x+y == 0
36591 // x != 0-y --> x+y != 0
36592 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
36594 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
36595 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
36598 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
36602 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
36603 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
36604 // Put build_vectors on the right.
36605 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
36606 std::swap(LHS, RHS);
36607 CC = ISD::getSetCCSwappedOperands(CC);
36611 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
36612 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
36613 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
36615 if (IsSEXT0 && IsVZero1) {
36616 assert(VT == LHS.getOperand(0).getValueType() &&
36617 "Unexpected operand type");
36618 if (CC == ISD::SETGT)
36619 return DAG.getConstant(0, DL, VT);
36620 if (CC == ISD::SETLE)
36621 return DAG.getConstant(1, DL, VT);
36622 if (CC == ISD::SETEQ || CC == ISD::SETGE)
36623 return DAG.getNOT(DL, LHS.getOperand(0), VT);
36625 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
36626 "Unexpected condition code!");
36627 return LHS.getOperand(0);
36631 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
36632 // pre-promote its result type since vXi1 vectors don't get promoted
36633 // during type legalization.
36634 // NOTE: The element count check is to ignore operand types that need to
36635 // go through type promotion to a 128-bit vector.
36636 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
36637 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
36638 (OpVT.getVectorElementType() == MVT::i8 ||
36639 OpVT.getVectorElementType() == MVT::i16)) {
36640 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
36642 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
36645 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
36646 // to avoid scalarization via legalization because v4i32 is not a legal type.
36647 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
36648 LHS.getValueType() == MVT::v4f32)
36649 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
36654 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
36655 TargetLowering::DAGCombinerInfo &DCI) {
36656 SDValue Src = N->getOperand(0);
36657 MVT SrcVT = Src.getSimpleValueType();
36659 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36660 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36661 !DCI.isBeforeLegalizeOps());
36663 // MOVMSK only uses the MSB from each vector element.
36665 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
36666 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
36667 DCI.AddToWorklist(Src.getNode());
36668 DCI.CommitTargetLoweringOpt(TLO);
36669 return SDValue(N, 0);
36675 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
36676 TargetLowering::DAGCombinerInfo &DCI,
36677 const X86Subtarget &Subtarget) {
36680 if (DCI.isBeforeLegalizeOps()) {
36681 SDValue Index = N->getOperand(4);
36682 // Remove any sign extends from 32 bits or smaller to larger than 32 bits.
36683 // Only do this before LegalizeOps in case we need the sign extend for
36684 // legalization.
36685 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
36686 if (Index.getScalarValueSizeInBits() > 32 &&
36687 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
36688 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36689 NewOps[4] = Index.getOperand(0);
36690 DAG.UpdateNodeOperands(N, NewOps);
36691 // The original sign extend now has fewer users; add it back to the worklist
36692 // in case it needs to be removed.
36693 DCI.AddToWorklist(Index.getNode());
36694 DCI.AddToWorklist(N);
36695 return SDValue(N, 0);
36699 // Make sure the index is either i32 or i64
36700 unsigned ScalarSize = Index.getScalarValueSizeInBits();
36701 if (ScalarSize != 32 && ScalarSize != 64) {
36702 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
36703 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
36704 Index.getValueType().getVectorNumElements());
36705 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
36706 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36708 DAG.UpdateNodeOperands(N, NewOps);
36709 DCI.AddToWorklist(N);
36710 return SDValue(N, 0);
36713 // Try to remove zero extends from 32->64 if we know the sign bit of
36714 // the input is zero.
36715 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
36716 Index.getScalarValueSizeInBits() == 64 &&
36717 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
36718 if (DAG.SignBitIsZero(Index.getOperand(0))) {
36719 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
36720 NewOps[4] = Index.getOperand(0);
36721 DAG.UpdateNodeOperands(N, NewOps);
36722 // The original zero extend now has fewer users; add it back to the worklist
36723 // in case it needs to be removed.
36724 DCI.AddToWorklist(Index.getNode());
36725 DCI.AddToWorklist(N);
36726 return SDValue(N, 0);
36731 // With AVX2 we only demand the upper bit of the mask.
36732 if (!Subtarget.hasAVX512()) {
36733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36734 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36735 !DCI.isBeforeLegalizeOps());
36736 SDValue Mask = N->getOperand(2);
36738 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
36739 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
36740 DCI.AddToWorklist(Mask.getNode());
36741 DCI.CommitTargetLoweringOpt(TLO);
36742 return SDValue(N, 0);
36749 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
36750 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
36751 const X86Subtarget &Subtarget) {
36753 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
36754 SDValue EFLAGS = N->getOperand(1);
36756 // Try to simplify the EFLAGS and condition code operands.
36757 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
36758 return getSETCC(CC, Flags, DL, DAG);
36763 /// Optimize branch condition evaluation.
36764 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
36765 const X86Subtarget &Subtarget) {
36767 SDValue EFLAGS = N->getOperand(3);
36768 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
36770 // Try to simplify the EFLAGS and condition code operands.
36771 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
36772 // RAUW them under us.
36773 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
36774 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
36775 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
36776 N->getOperand(1), Cond, Flags);
36782 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
36783 SelectionDAG &DAG) {
36784 // Take advantage of vector comparisons producing 0 or -1 in each lane to
36785 // optimize away the operation when it is applied to a constant.
36787 // The general transformation is:
36788 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
36789 // AND(VECTOR_CMP(x,y), constant2)
36790 // constant2 = UNARYOP(constant)
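// Illustrative sketch (assumed values, not taken from this file):
//   sint_to_fp (and (setcc v4i32 A, B), <4 x i32> <1,1,1,1>)
// becomes
//   bitcast to v4f32 (and (setcc A, B), bitcast <4 x float> <1.0,1.0,1.0,1.0>)
// i.e. the conversion is folded into the constant operand and only the cheap
// vector AND remains.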
36792 // Early exit if this isn't a vector operation, the operand of the
36793 // unary operation isn't a bitwise AND, or if the sizes of the operations
36794 // aren't the same.
36795 EVT VT = N->getValueType(0);
36796 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
36797 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
36798 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
36801 // Now check that the other operand of the AND is a constant. We could
36802 // make the transformation for non-constant splats as well, but it's unclear
36803 // that would be a benefit as it would not eliminate any operations, just
36804 // perform one more step in scalar code before moving to the vector unit.
36805 if (BuildVectorSDNode *BV =
36806 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
36807 // Bail out if the vector isn't a constant.
36808 if (!BV->isConstant())
36811 // Everything checks out. Build up the new and improved node.
36813 EVT IntVT = BV->getValueType(0);
36814 // Create a new constant of the appropriate type for the transformed
36816 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
36817 // The AND node needs bitcasts to/from an integer vector type around it.
36818 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
36819 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
36820 N->getOperand(0)->getOperand(0), MaskConst);
36821 SDValue Res = DAG.getBitcast(VT, NewAnd);
36828 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
36829 const X86Subtarget &Subtarget) {
36830 SDValue Op0 = N->getOperand(0);
36831 EVT VT = N->getValueType(0);
36832 EVT InVT = Op0.getValueType();
36833 EVT InSVT = InVT.getScalarType();
36835 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
36836 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
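// For example, (v4f32 uint_to_fp (v4i8 x)) becomes
// (v4f32 sint_to_fp (v4i32 zero_extend x)); the zero-extended value fits in
// 31 bits, so the signed conversion yields the same result.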
36837 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
36839 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36840 InVT.getVectorNumElements());
36841 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
36843 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
36844 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36847 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
36848 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
36849 // the optimization here.
36850 if (DAG.SignBitIsZero(Op0))
36851 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
36856 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
36857 const X86Subtarget &Subtarget) {
36858 // First try to optimize away the conversion entirely when it's
36859 // conditionally from a constant. Vectors only.
36860 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
36863 // Now move on to more general possibilities.
36864 SDValue Op0 = N->getOperand(0);
36865 EVT VT = N->getValueType(0);
36866 EVT InVT = Op0.getValueType();
36867 EVT InSVT = InVT.getScalarType();
36869 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
36870 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
36871 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
36872 if (InVT.isVector() &&
36873 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
36874 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
36876 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
36877 InVT.getVectorNumElements());
36878 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
36879 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
36882 // Without AVX512DQ we only support i64 to float scalar conversion. For both
36883 // vectors and scalars, see if we know that the upper bits are all the sign
36884 // bit, in which case we can truncate the input to i32 and convert from that.
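// For example, a v2i64 input whose elements all have at least 33 sign bits
// (i.e. are sign-extended from i32 or narrower) can be truncated to v2i32 and
// converted with the narrower SINT_TO_FP without changing the result.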
36885 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
36886 unsigned BitWidth = InVT.getScalarSizeInBits();
36887 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
36888 if (NumSignBits >= (BitWidth - 31)) {
36889 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
36890 if (InVT.isVector())
36891 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
36892 InVT.getVectorNumElements());
36894 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
36895 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
36899 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
36900 // a 32-bit target where SSE doesn't support i64->FP operations.
36901 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
36902 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
36903 EVT LdVT = Ld->getValueType(0);
36905 // This transformation is not supported if the result type is f16 or f128.
36906 if (VT == MVT::f16 || VT == MVT::f128)
36909 if (!Ld->isVolatile() && !VT.isVector() &&
36910 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
36911 !Subtarget.is64Bit() && LdVT == MVT::i64) {
36912 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
36913 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
36914 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
36921 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
36922 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36923 MVT VT = N->getSimpleValueType(0);
36924 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36925 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
36926 N->getOperand(0), N->getOperand(1),
36933 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
36934 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
36935 TargetLowering::DAGCombinerInfo &DCI) {
36936 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
36937 // the result is either zero or one (depending on the input carry bit).
36938 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
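// For example, (adc 0, 0, EFLAGS) computes exactly the input carry bit, so it
// becomes (and (setcc_carry COND_B, EFLAGS), 1).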
36939 if (X86::isZeroNode(N->getOperand(0)) &&
36940 X86::isZeroNode(N->getOperand(1)) &&
36941 // We don't have a good way to replace an EFLAGS use, so only do this when the flags result is unused.
36943 SDValue(N, 1).use_empty()) {
36945 EVT VT = N->getValueType(0);
36946 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
36947 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
36948 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
36949 DAG.getConstant(X86::COND_B, DL,
36952 DAG.getConstant(1, DL, VT));
36953 return DCI.CombineTo(N, Res1, CarryOut);
36956 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
36957 MVT VT = N->getSimpleValueType(0);
36958 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
36959 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
36960 N->getOperand(0), N->getOperand(1),
36967 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
36968 /// which is more useful than 0/1 in some cases.
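/// SETCC_CARRY computes 0 - CF across the whole register (i.e. "sbb reg,reg"),
/// which is all-ones when the carry flag is set; masking with 1 (or truncating
/// to i1) recovers the plain 0/1 value of SETB when that is what is needed.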
36969 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
36971 // "Condition code B" is also known as "the carry flag" (CF).
36972 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
36973 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
36974 MVT VT = N->getSimpleValueType(0);
36976 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
36978 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
36979 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
36982 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
36983 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
36984 /// with CMP+{ADC, SBB}.
36985 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
36986 bool IsSub = N->getOpcode() == ISD::SUB;
36987 SDValue X = N->getOperand(0);
36988 SDValue Y = N->getOperand(1);
36990 // If this is an add, canonicalize a zext operand to the RHS.
36991 // TODO: Incomplete? What if both sides are zexts?
36992 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
36993 Y.getOpcode() != ISD::ZERO_EXTEND)
36996 // Look through a one-use zext.
36997 bool PeekedThroughZext = false;
36998 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
36999 Y = Y.getOperand(0);
37000 PeekedThroughZext = true;
37003 // If this is an add, canonicalize a setcc operand to the RHS.
37004 // TODO: Incomplete? What if both sides are setcc?
37005 // TODO: Should we allow peeking through a zext of the other operand?
37006 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
37007 Y.getOpcode() != X86ISD::SETCC)
37010 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
37014 EVT VT = N->getValueType(0);
37015 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
37017 // If X is -1 or 0, then we have an opportunity to avoid constants required in
37018 // the general case below.
37019 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
37021 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
37022 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
37023 // This is a complicated way to get -1 or 0 from the carry flag:
37024 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
37025 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
37026 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37027 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37031 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
37032 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
37033 SDValue EFLAGS = Y->getOperand(1);
37034 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37035 EFLAGS.getValueType().isInteger() &&
37036 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37037 // Swap the operands of a SUB, and we have the same pattern as above.
37038 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
37039 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
37040 SDValue NewSub = DAG.getNode(
37041 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
37042 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37043 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37044 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37045 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37051 if (CC == X86::COND_B) {
37052 // X + SETB Z --> X + (mask SBB Z, Z)
37053 // X - SETB Z --> X - (mask SBB Z, Z)
37054 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
37055 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
37056 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37057 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37058 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37061 if (CC == X86::COND_A) {
37062 SDValue EFLAGS = Y->getOperand(1);
37063 // Try to convert COND_A into COND_B in an attempt to facilitate
37064 // materializing "setb reg".
37066 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
37067 // cannot take an immediate as its first operand.
37069 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37070 EFLAGS.getValueType().isInteger() &&
37071 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37072 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
37073 EFLAGS.getNode()->getVTList(),
37074 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37075 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37076 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
37077 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37078 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37079 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37083 if (CC != X86::COND_E && CC != X86::COND_NE)
37086 SDValue Cmp = Y.getOperand(1);
37087 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
37088 !X86::isZeroNode(Cmp.getOperand(1)) ||
37089 !Cmp.getOperand(0).getValueType().isInteger())
37092 SDValue Z = Cmp.getOperand(0);
37093 EVT ZVT = Z.getValueType();
37095 // If X is -1 or 0, then we have an opportunity to avoid constants required in
37096 // the general case below.
37098 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with fake operands:
37100 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
37101 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
37102 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
37103 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
37104 SDValue Zero = DAG.getConstant(0, DL, ZVT);
37105 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
37106 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
37107 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37108 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37109 SDValue(Neg.getNode(), 1));
37112 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
37113 // with fake operands:
37114 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
37115 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
37116 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
37117 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
37118 SDValue One = DAG.getConstant(1, DL, ZVT);
37119 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37120 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37121 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
37125 // (cmp Z, 1) sets the carry flag if Z is 0.
37126 SDValue One = DAG.getConstant(1, DL, ZVT);
37127 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37129 // Add the flags type for ADC/SBB nodes.
37130 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37132 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
37133 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
37134 if (CC == X86::COND_NE)
37135 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
37136 DAG.getConstant(-1ULL, DL, VT), Cmp1);
37138 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
37139 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
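// In these four cases, (cmp Z, 1) computes Z - 1 and sets CF exactly when
// Z == 0 (unsigned borrow). So adc X, -1 gives X - 1 + CF == X - (Z != 0),
// sbb X, -1 gives X + 1 - CF == X + (Z != 0), sbb X, 0 gives X - CF ==
// X - (Z == 0), and adc X, 0 gives X + CF == X + (Z == 0).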
37140 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
37141 DAG.getConstant(0, DL, VT), Cmp1);
37144 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
37145 const X86Subtarget &Subtarget) {
37146 if (!Subtarget.hasSSE2())
37149 SDValue MulOp = N->getOperand(0);
37150 SDValue Phi = N->getOperand(1);
37152 if (MulOp.getOpcode() != ISD::MUL)
37153 std::swap(MulOp, Phi);
37154 if (MulOp.getOpcode() != ISD::MUL)
37158 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
37161 EVT VT = N->getValueType(0);
37163 unsigned RegSize = 128;
37164 if (Subtarget.hasBWI())
37166 else if (Subtarget.hasAVX2())
37168 unsigned VectorSize = VT.getVectorNumElements() * 16;
37169 // If the vector size is less than 128, or greater than the supported RegSize,
37170 // do not use PMADD.
37171 if (VectorSize < 128 || VectorSize > RegSize)
37175 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37176 VT.getVectorNumElements());
37177 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37178 VT.getVectorNumElements() / 2);
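// For example, with VT == v8i32 the mul operands are truncated to v8i16 and
// VPMADDWD produces a v4i32 (it multiplies i16 pairs and adds adjacent
// products), so the result is padded back to v8i32 with zeros before being
// added into the accumulator phi.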
37180 // Shrink the operands of mul.
37181 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
37182 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
37184 // Madd vector size is half of the original vector size
37185 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
37186 // Fill the rest of the output with 0
37187 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
37188 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
37189 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
37192 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
37193 const X86Subtarget &Subtarget) {
37194 if (!Subtarget.hasSSE2())
37198 EVT VT = N->getValueType(0);
37199 SDValue Op0 = N->getOperand(0);
37200 SDValue Op1 = N->getOperand(1);
37202 // TODO: There's nothing special about i32, any integer type above i16 should
37203 // work just as well.
37204 if (!VT.isVector() || !VT.isSimple() ||
37205 !(VT.getVectorElementType() == MVT::i32))
37208 unsigned RegSize = 128;
37209 if (Subtarget.hasBWI())
37211 else if (Subtarget.hasAVX2())
37214 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
37215 // TODO: We should be able to handle larger vectors by splitting them before
37216 // feeding them into several SADs, and then reducing over those.
37217 if (VT.getSizeInBits() / 4 > RegSize)
37220 // We know N is a reduction add, which means one of its operands is a phi.
37221 // To match SAD, we need the other operand to be a vector select.
37222 SDValue SelectOp, Phi;
37223 if (Op0.getOpcode() == ISD::VSELECT) {
37226 } else if (Op1.getOpcode() == ISD::VSELECT) {
37232 // Check whether we have an abs-diff pattern feeding into the select.
37233 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
37236 // SAD pattern detected. Now build a SAD instruction and an addition for
37237 // reduction. Note that the number of elements of the result of SAD is less
37238 // than the number of elements of its input. Therefore, we can only update
37239 // part of the elements in the reduction vector.
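// For example, psadbw on v16i8 inputs produces a v2i64 in which each 64-bit
// lane holds the sum of absolute differences of one 8-byte half, so only the
// low lanes of the wider i32 reduction vector receive new data.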
37240 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
37242 // The output of PSADBW is a vector of i64.
37243 // We need to turn the vector of i64 into a vector of i32.
37244 // If the reduction vector is at least as wide as the psadbw result, just
37245 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero anyway.
37247 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
37248 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
37249 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
37251 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
37253 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
37254 // Fill the upper elements with zero to match the add width.
37255 SDValue Zero = DAG.getConstant(0, DL, VT);
37256 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
37257 DAG.getIntPtrConstant(0, DL));
37260 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
37263 /// Convert vector increment or decrement to sub/add with an all-ones constant:
37264 /// add X, <1, 1...> --> sub X, <-1, -1...>
37265 /// sub X, <1, 1...> --> add X, <-1, -1...>
37266 /// The all-ones vector constant can be materialized using a pcmpeq instruction
37267 /// that is commonly recognized as an idiom (has no register dependency), so
37268 /// that's better/smaller than loading a splat 1 constant.
37269 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
37270 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
37271 "Unexpected opcode for increment/decrement transform");
37273 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
37274 // out and wait for legalization if we have an unsupported vector length.
37275 EVT VT = N->getValueType(0);
37276 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
37279 SDNode *N1 = N->getOperand(1).getNode();
37281 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
37282 !SplatVal.isOneValue())
37285 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
37286 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
37287 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
37290 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
37291 const SDLoc &DL, EVT VT,
37292 const X86Subtarget &Subtarget) {
37293 // Example of pattern we try to detect:
37294 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
37295 //(add (build_vector (extract_elt t, 0),
37296 // (extract_elt t, 2),
37297 // (extract_elt t, 4),
37298 // (extract_elt t, 6)),
37299 // (build_vector (extract_elt t, 1),
37300 // (extract_elt t, 3),
37301 // (extract_elt t, 5),
37302 // (extract_elt t, 7)))
37304 if (!Subtarget.hasSSE2())
37307 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
37308 Op1.getOpcode() != ISD::BUILD_VECTOR)
37311 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
37312 VT.getVectorNumElements() < 4 ||
37313 !isPowerOf2_32(VT.getVectorNumElements()))
37316 // Check if one of Op0,Op1 is of the form:
37317 // (build_vector (extract_elt Mul, 0),
37318 // (extract_elt Mul, 2),
37319 // (extract_elt Mul, 4),
37321 // the other is of the form:
37322 // (build_vector (extract_elt Mul, 1),
37323 // (extract_elt Mul, 3),
37324 // (extract_elt Mul, 5),
37326 // and identify Mul.
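// In other words, the even-indexed and odd-indexed products of the wide MUL
// are split across the two BUILD_VECTORs, which is exactly the adjacent-pair
// sum that VPMADDWD computes.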
37328 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
37329 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
37330 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
37331 // TODO: Be more tolerant to undefs.
37332 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37333 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37334 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37335 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
37337 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
37338 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
37339 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
37340 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
37341 if (!Const0L || !Const1L || !Const0H || !Const1H)
37343 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
37344 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
37345 // Commutativity of mul allows factors of a product to reorder.
37347 std::swap(Idx0L, Idx1L);
37349 std::swap(Idx0H, Idx1H);
37350 // Commutativity of add allows pairs of factors to reorder.
37351 if (Idx0L > Idx0H) {
37352 std::swap(Idx0L, Idx0H);
37353 std::swap(Idx1L, Idx1H);
37355 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
37356 Idx1H != 2 * i + 3)
37359 // First time an extract_elt's source vector is visited. Must be a MUL
37360 // with twice as many vector elements as the BUILD_VECTOR.
37361 // Both extracts must be from same MUL.
37362 Mul = Op0L->getOperand(0);
37363 if (Mul->getOpcode() != ISD::MUL ||
37364 Mul.getValueType().getVectorNumElements() != 2 * e)
37367 // Check that the extract is from the same MUL previously seen.
37368 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
37369 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
37373 // Check if the Mul source can be safely shrunk.
37375 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
37378 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, SDValue Op0,
37380 // Shrink by adding truncate nodes and let DAGCombine fold with the sources.
37382 EVT InVT = Op0.getValueType();
37383 assert(InVT.getScalarType() == MVT::i32 &&
37384 "Unexpected scalar element type");
37385 assert(InVT == Op1.getValueType() && "Operands' types mismatch");
37386 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37387 InVT.getVectorNumElements() / 2);
37388 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37389 InVT.getVectorNumElements());
37390 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
37391 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op0),
37392 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Op1));
37394 return SplitBinaryOpsAndApply(DAG, Subtarget, DL, VT, Mul.getOperand(0),
37395 Mul.getOperand(1), PMADDBuilder);
37398 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
37399 const X86Subtarget &Subtarget) {
37400 const SDNodeFlags Flags = N->getFlags();
37401 if (Flags.hasVectorReduction()) {
37402 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
37404 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
37407 EVT VT = N->getValueType(0);
37408 SDValue Op0 = N->getOperand(0);
37409 SDValue Op1 = N->getOperand(1);
37411 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
37414 // Try to synthesize horizontal adds from adds of shuffles.
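// For example, v4i32 (add (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>))
// sums each pair of adjacent elements of A and B, which is exactly
// (X86ISD::HADD A, B).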
37415 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37416 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37417 isHorizontalBinOp(Op0, Op1, true))
37418 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
37420 if (SDValue V = combineIncDecVector(N, DAG))
37423 return combineAddOrSubToADCOrSBB(N, DAG);
37426 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
37427 const X86Subtarget &Subtarget) {
37428 SDValue Op0 = N->getOperand(0);
37429 SDValue Op1 = N->getOperand(1);
37430 EVT VT = N->getValueType(0);
37432 // PSUBUS is supported starting from SSE2, but the special preprocessing
37433 // for v8i32 requires umin, which first appears in SSE4.1.
37434 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
37435 !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
37436 !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
37437 !(Subtarget.hasBWI() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
37438 VT == MVT::v16i32 || VT == MVT::v8i64)))
37441 SDValue SubusLHS, SubusRHS;
37442 // Try to find umax(a,b) - b or a - umin(a,b) patterns
37443 // they may be converted to subus(a,b).
37444 // TODO: Need to add IR canonicalization for this code.
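// For example, (sub (umax a, b), b): if a >= b the result is a - b, otherwise
// the umax picks b and the result is 0, which is precisely the unsigned
// saturating subtract subus(a, b). The a - umin(a, b) form works the same way.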
37445 if (Op0.getOpcode() == ISD::UMAX) {
37447 SDValue MaxLHS = Op0.getOperand(0);
37448 SDValue MaxRHS = Op0.getOperand(1);
37451 else if (MaxRHS == Op1)
37455 } else if (Op1.getOpcode() == ISD::UMIN) {
37457 SDValue MinLHS = Op1.getOperand(0);
37458 SDValue MinRHS = Op1.getOperand(1);
37461 else if (MinRHS == Op0)
37468 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
37469 // special preprocessing in some cases.
37470 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
37471 return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
37473 // The special preprocessing case can only be applied
37474 // if the value was zero-extended from 16 bits,
37475 // so we require the upper 16 bits to be zero for 32-bit
37476 // values, or the upper 48 bits for 64-bit values.
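// For example, a v8i32 subtraction whose left operand is known to be
// zero-extended from 16 bits can be narrowed: the right operand is clamped
// with umin to the same range, the operation becomes a v8i16 PSUBUS, and the
// result is zero-extended back to v8i32.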
37478 DAG.computeKnownBits(SubusLHS, Known);
37479 unsigned NumZeros = Known.countMinLeadingZeros();
37480 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
37483 EVT ExtType = SubusLHS.getValueType();
37485 if (VT == MVT::v8i32 || VT == MVT::v8i64)
37486 ShrinkedType = MVT::v8i16;
37488 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
37490 // Since SubusLHS is zero-extended, clamp SubusRHS to the narrower
37491 // width first: SubusRHS = umin(0xFFF.., SubusRHS).
37492 SDValue SaturationConst =
37493 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
37494 ShrinkedType.getScalarSizeInBits()),
37495 SDLoc(SubusLHS), ExtType);
37496 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
37498 SDValue NewSubusLHS =
37499 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
37500 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
37501 SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
37502 NewSubusLHS, NewSubusRHS);
37503 // Zero-extend the result; it may be used elsewhere as a 32-bit value.
37504 // If not, the zext and the following trunc will fold away.
37505 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
37508 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
37509 const X86Subtarget &Subtarget) {
37510 SDValue Op0 = N->getOperand(0);
37511 SDValue Op1 = N->getOperand(1);
37513 // X86 can't encode an immediate LHS of a sub. See if we can push the
37514 // negation into a preceding instruction.
37515 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
37516 // If the RHS of the sub is an XOR with one use and a constant, invert the
37517 // immediate. Then add one to the LHS of the sub so we can turn
37518 // X-Y -> X+~Y+1, saving one register.
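// For example, C - (A ^ Imm) == C + ~(A ^ Imm) + 1 == (A ^ ~Imm) + (C + 1),
// so both constants fold and the explicit subtraction disappears.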
37519 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
37520 isa<ConstantSDNode>(Op1.getOperand(1))) {
37521 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
37522 EVT VT = Op0.getValueType();
37523 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
37525 DAG.getConstant(~XorC, SDLoc(Op1), VT));
37526 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
37527 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
37531 // Try to synthesize horizontal subs from subs of shuffles.
37532 EVT VT = N->getValueType(0);
37533 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
37534 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
37535 isHorizontalBinOp(Op0, Op1, false))
37536 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
37538 if (SDValue V = combineIncDecVector(N, DAG))
37541 // Try to create PSUBUS if SUB's argument is max/min
37542 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
37545 return combineAddOrSubToADCOrSBB(N, DAG);
37548 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
37549 TargetLowering::DAGCombinerInfo &DCI,
37550 const X86Subtarget &Subtarget) {
37551 if (DCI.isBeforeLegalize())
37555 unsigned Opcode = N->getOpcode();
37556 MVT VT = N->getSimpleValueType(0);
37557 MVT SVT = VT.getVectorElementType();
37558 unsigned NumElts = VT.getVectorNumElements();
37559 unsigned EltSizeInBits = SVT.getSizeInBits();
37561 SDValue Op = N->getOperand(0);
37562 MVT OpVT = Op.getSimpleValueType();
37563 MVT OpEltVT = OpVT.getVectorElementType();
37564 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
37565 unsigned InputBits = OpEltSizeInBits * NumElts;
37567 // Perform any constant folding.
37568 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
37570 SmallVector<APInt, 64> EltBits;
37571 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
37572 APInt Undefs(NumElts, 0);
37573 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
37575 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
37576 for (unsigned i = 0; i != NumElts; ++i) {
37577 if (UndefElts[i]) {
37581 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
37582 : EltBits[i].sextOrTrunc(EltSizeInBits);
37584 return getConstVector(Vals, Undefs, VT, DAG, DL);
37587 // (vzext (bitcast (vzext (x)) -> (vzext x)
37588 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
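// Zero-extending a value that is already zero-extended is idempotent, so when
// the element types line up, the two vzexts collapse into a single wider
// vzext of the original source.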
37589 SDValue V = peekThroughBitcasts(Op);
37590 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
37591 MVT InnerVT = V.getSimpleValueType();
37592 MVT InnerEltVT = InnerVT.getVectorElementType();
37594 // If the element sizes match exactly, we can just do one larger vzext. This
37595 // is always an exact type match as vzext operates on integer types.
37596 if (OpEltVT == InnerEltVT) {
37597 assert(OpVT == InnerVT && "Types must match for vzext!");
37598 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
37601 // The only other way we can combine them is if only a single element of the
37602 // inner vzext is used in the input to the outer vzext.
37603 if (InnerEltVT.getSizeInBits() < InputBits)
37606 // In this case, the inner vzext is completely dead because we're going to
37607 // only look at bits inside of the low element. Just do the outer vzext on
37608 // a bitcast of the input to the inner.
37609 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
37612 // Check if we can bypass extracting and re-inserting an element of an input
37613 // vector. Essentially:
37614 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
37615 // TODO: Add X86ISD::VSEXT support
37616 if (Opcode == X86ISD::VZEXT &&
37617 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
37618 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37619 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
37620 SDValue ExtractedV = V.getOperand(0);
37621 SDValue OrigV = ExtractedV.getOperand(0);
37622 if (isNullConstant(ExtractedV.getOperand(1))) {
37623 MVT OrigVT = OrigV.getSimpleValueType();
37624 // Extract a subvector if necessary...
37625 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
37626 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
37627 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
37628 OrigVT.getVectorNumElements() / Ratio);
37629 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
37630 DAG.getIntPtrConstant(0, DL));
37632 Op = DAG.getBitcast(OpVT, OrigV);
37633 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
37640 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
37641 const X86Subtarget &Subtarget) {
37642 MVT VT = N->getSimpleValueType(0);
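// Comparing a value against itself folds immediately: x == x is all-ones in
// every lane, and x > x (signed) is all-zeros.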
37645 if (N->getOperand(0) == N->getOperand(1)) {
37646 if (N->getOpcode() == X86ISD::PCMPEQ)
37647 return getOnesVector(VT, DAG, DL);
37648 if (N->getOpcode() == X86ISD::PCMPGT)
37649 return getZeroVector(VT, Subtarget, DAG, DL);
37655 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
37656 TargetLowering::DAGCombinerInfo &DCI,
37657 const X86Subtarget &Subtarget) {
37658 if (DCI.isBeforeLegalizeOps())
37661 MVT OpVT = N->getSimpleValueType(0);
37663 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
37666 SDValue Vec = N->getOperand(0);
37667 SDValue SubVec = N->getOperand(1);
37669 unsigned IdxVal = N->getConstantOperandVal(2);
37670 MVT SubVecVT = SubVec.getSimpleValueType();
37672 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
37673 // Inserting zeros into zeros is a nop.
37674 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37675 return getZeroVector(OpVT, Subtarget, DAG, dl);
37677 // If we're inserting into a zero vector and then into a larger zero vector,
37678 // just insert into the larger zero vector directly.
37679 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37680 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
37681 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
37682 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37683 getZeroVector(OpVT, Subtarget, DAG, dl),
37684 SubVec.getOperand(1),
37685 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
37688 // If we're inserting into a zero vector and our input was extracted from an
37689 // insert into a zero vector of the same type and the extraction was at
37690 // least as large as the original insertion, just insert the original
37691 // subvector into a zero vector.
37692 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
37693 SubVec.getConstantOperandVal(1) == 0 &&
37694 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
37695 SDValue Ins = SubVec.getOperand(0);
37696 if (Ins.getConstantOperandVal(2) == 0 &&
37697 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
37698 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
37699 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37700 getZeroVector(OpVT, Subtarget, DAG, dl),
37701 Ins.getOperand(1), N->getOperand(2));
37704 // If we're inserting a bitcast into zeros, rewrite the insert and move the
37705 // bitcast to the other side. This helps with detecting zero extension during isel.
37707 // TODO: Is this useful for indices other than 0?
37708 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
37709 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
37710 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
37711 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
37712 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
37713 DAG.getBitcast(NewVT, Vec),
37714 SubVec.getOperand(0), N->getOperand(2));
37715 return DAG.getBitcast(OpVT, Insert);
37719 // Stop here if this is an i1 vector.
37723 // If this is an insert of an extract, combine to a shuffle. Don't do this
37724 // if the insert or extract can be represented with a subregister operation.
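// For example, extracting the upper v4i32 half of one v8i32 and inserting it
// into the lower half of another v8i32 becomes a single two-input
// VECTOR_SHUFFLE over the original wide vectors.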
37725 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
37726 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
37727 (IdxVal != 0 || !Vec.isUndef())) {
37728 int ExtIdxVal = SubVec.getConstantOperandVal(1);
37729 if (ExtIdxVal != 0) {
37730 int VecNumElts = OpVT.getVectorNumElements();
37731 int SubVecNumElts = SubVecVT.getVectorNumElements();
37732 SmallVector<int, 64> Mask(VecNumElts);
37733 // First create an identity shuffle mask.
37734 for (int i = 0; i != VecNumElts; ++i)
37736 // Now insert the extracted portion.
37737 for (int i = 0; i != SubVecNumElts; ++i)
37738 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
37740 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
37744 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte load:
37746 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37747 // (load16 addr + 16), Elts/2)
37750 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37751 // (load32 addr + 32), Elts/2)
37753 // or a 16-byte or 32-byte broadcast:
37754 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
37755 // (load16 addr), Elts/2)
37756 // --> X86SubVBroadcast(load16 addr)
37758 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
37759 // (load32 addr), Elts/2)
37760 // --> X86SubVBroadcast(load32 addr)
37761 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
37762 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
37763 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
37764 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
37765 if (Idx2 && Idx2->getZExtValue() == 0) {
37766 SDValue SubVec2 = Vec.getOperand(1);
37767 // If needed, look through bitcasts to get to the load.
37768 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
37770 unsigned Alignment = FirstLd->getAlignment();
37771 unsigned AS = FirstLd->getAddressSpace();
37772 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
37773 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
37774 OpVT, AS, Alignment, &Fast) && Fast) {
37775 SDValue Ops[] = {SubVec2, SubVec};
37776 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
37781 // If lower/upper loads are the same and the only users of the load, then
37782 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
37783 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
37784 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
37785 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
37786 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
37788 // If this is subv_broadcast insert into both halves, use a larger subv_broadcast.
37790 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
37791 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
37792 SubVec.getOperand(0));
37794 // If we're inserting all zeros into the upper half, change this to
37795 // an insert into an all zeros vector. We will match this to a move
37796 // with implicit upper bit zeroing during isel.
37797 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
37798 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
37799 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
37800 Vec.getOperand(2));
37802 // If we are inserting into both halves of the vector, the starting
37803 // vector should be undef. If it isn't, make it so. Only do this if the
37804 // early insert has no other uses.
37805 // TODO: Should this be a generic DAG combine?
37806 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
37807 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
37808 SubVec2, Vec.getOperand(2));
37809 DCI.AddToWorklist(Vec.getNode());
37810 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
37820 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
37821 TargetLowering::DAGCombinerInfo &DCI,
37822 const X86Subtarget &Subtarget) {
37823 if (DCI.isBeforeLegalizeOps())
37826 MVT OpVT = N->getSimpleValueType(0);
37827 SDValue InVec = N->getOperand(0);
37828 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
37830 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
37831 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
37833 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
37834 if (OpVT.getScalarType() == MVT::i1)
37835 return DAG.getConstant(1, SDLoc(N), OpVT);
37836 return getOnesVector(OpVT, DAG, SDLoc(N));
37839 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
37840 return DAG.getBuildVector(
37842 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
37847 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
37848 DAGCombinerInfo &DCI) const {
37849 SelectionDAG &DAG = DCI.DAG;
37850 switch (N->getOpcode()) {
37852 case ISD::EXTRACT_VECTOR_ELT:
37853 case X86ISD::PEXTRW:
37854 case X86ISD::PEXTRB:
37855 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
37856 case ISD::INSERT_SUBVECTOR:
37857 return combineInsertSubvector(N, DAG, DCI, Subtarget);
37858 case ISD::EXTRACT_SUBVECTOR:
37859 return combineExtractSubvector(N, DAG, DCI, Subtarget);
37862 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
37863 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
37864 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
37865 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
37866 case ISD::SUB: return combineSub(N, DAG, Subtarget);
37867 case X86ISD::SBB: return combineSBB(N, DAG);
37868 case X86ISD::ADC: return combineADC(N, DAG, DCI);
37869 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
37872 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
37873 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
37874 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
37875 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
37876 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
37877 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
37878 case ISD::STORE: return combineStore(N, DAG, Subtarget);
37879 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
37880 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
37881 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
37883 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
37884 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
37885 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
37886 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
37887 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
37888 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
37890 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
37892 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
37894 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
37895 case X86ISD::BT: return combineBT(N, DAG, DCI);
37896 case ISD::ANY_EXTEND:
37897 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
37898 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
37899 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
37900 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
37901 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
37902 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
37903 case X86ISD::PACKSS:
37904 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
37905 case X86ISD::VSHLI:
37906 case X86ISD::VSRAI:
37907 case X86ISD::VSRLI:
37908 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
37909 case ISD::SIGN_EXTEND_VECTOR_INREG:
37910 case ISD::ZERO_EXTEND_VECTOR_INREG:
37911 case X86ISD::VSEXT:
37912 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
37913 case X86ISD::PINSRB:
37914 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
37915 case X86ISD::SHUFP: // Handle all target specific shuffles
37916 case X86ISD::INSERTPS:
37917 case X86ISD::EXTRQI:
37918 case X86ISD::INSERTQI:
37919 case X86ISD::PALIGNR:
37920 case X86ISD::VSHLDQ:
37921 case X86ISD::VSRLDQ:
37922 case X86ISD::BLENDI:
37923 case X86ISD::UNPCKH:
37924 case X86ISD::UNPCKL:
37925 case X86ISD::MOVHLPS:
37926 case X86ISD::MOVLHPS:
37927 case X86ISD::PSHUFB:
37928 case X86ISD::PSHUFD:
37929 case X86ISD::PSHUFHW:
37930 case X86ISD::PSHUFLW:
37931 case X86ISD::MOVSHDUP:
37932 case X86ISD::MOVSLDUP:
37933 case X86ISD::MOVDDUP:
37934 case X86ISD::MOVSS:
37935 case X86ISD::MOVSD:
37936 case X86ISD::VBROADCAST:
37937 case X86ISD::VPPERM:
37938 case X86ISD::VPERMI:
37939 case X86ISD::VPERMV:
37940 case X86ISD::VPERMV3:
37941 case X86ISD::VPERMIV3:
37942 case X86ISD::VPERMIL2:
37943 case X86ISD::VPERMILPI:
37944 case X86ISD::VPERMILPV:
37945 case X86ISD::VPERM2X128:
37946 case X86ISD::VZEXT_MOVL:
37947 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
37948 case X86ISD::FMADD_RND:
37949 case X86ISD::FMADDS1_RND:
37950 case X86ISD::FMADDS3_RND:
37951 case X86ISD::FMADDS1:
37952 case X86ISD::FMADDS3:
37953 case X86ISD::FMADD4S:
37954 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
37955 case X86ISD::FMADDSUB_RND:
37956 case X86ISD::FMSUBADD_RND:
37957 case X86ISD::FMADDSUB:
37958 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
37959 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
37960 case X86ISD::MGATHER:
37961 case X86ISD::MSCATTER:
37963 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
37964 case X86ISD::PCMPEQ:
37965 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
37971 /// Return true if the target has native support for the specified value type
37972 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
37973 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
37974 /// some i16 instructions are slow.
37975 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
37976 if (!isTypeLegal(VT))
37979 // There are no vXi8 shifts.
37980 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
37983 if (VT != MVT::i16)
37990 case ISD::SIGN_EXTEND:
37991 case ISD::ZERO_EXTEND:
37992 case ISD::ANY_EXTEND:
38005 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
38006 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
38007 /// we don't adjust the stack we clobber the first frame index.
38008 /// See X86InstrInfo::copyPhysReg.
38009 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
38010 const MachineRegisterInfo &MRI = MF.getRegInfo();
38011 return any_of(MRI.reg_instructions(X86::EFLAGS),
38012 [](const MachineInstr &RI) { return RI.isCopy(); });
38015 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
38016 if (hasCopyImplyingStackAdjustment(MF)) {
38017 MachineFrameInfo &MFI = MF.getFrameInfo();
38018 MFI.setHasCopyImplyingStackAdjustment(true);
38021 TargetLoweringBase::finalizeLowering(MF);
38024 /// This method queries the target whether it is beneficial for the DAG combiner to
38025 /// promote the specified node. If true, it should return the desired promotion
38026 /// type by reference.
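/// For example, an i16 'add' is normally promoted to i32 to avoid the 66h
/// operand-size prefix, unless doing so would break a profitable load-folding
/// or store-folding opportunity (checked per opcode below).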
38027 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
38028 EVT VT = Op.getValueType();
38029 if (VT != MVT::i16)
38032 bool Promote = false;
38033 bool Commute = false;
38034 switch (Op.getOpcode()) {
38036 case ISD::SIGN_EXTEND:
38037 case ISD::ZERO_EXTEND:
38038 case ISD::ANY_EXTEND:
38043 SDValue N0 = Op.getOperand(0);
38044 // Look out for (store (shl (load), x)).
38045 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
38058 SDValue N0 = Op.getOperand(0);
38059 SDValue N1 = Op.getOperand(1);
38060 if (!Commute && MayFoldLoad(N1))
38062 // Avoid disabling potential load folding opportunities.
38063 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
38065 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
38075 bool X86TargetLowering::
38076 isDesirableToCombineBuildVectorToShuffleTruncate(
38077 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
38079 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
38080 "Element count mismatch");
38082 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
38083 "Shuffle Mask expected to be legal");
38085 // For 32-bit elements VPERMD is better than shuffle+truncate.
38086 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
38087 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
38090 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
38096 //===----------------------------------------------------------------------===//
38097 // X86 Inline Assembly Support
38098 //===----------------------------------------------------------------------===//
38100 // Helper to match an asm string against a list of pieces that must appear in order, separated by whitespace.
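// For example, matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while
// "bswap$0" does not, because each matched piece must be followed by
// whitespace (or the end of the string).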
38101 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
38102 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
38104 for (StringRef Piece : Pieces) {
38105 if (!S.startswith(Piece)) // Check if the piece matches.
38108 S = S.substr(Piece.size());
38109 StringRef::size_type Pos = S.find_first_not_of(" \t");
38110 if (Pos == 0) // We matched a prefix.
38119 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
38121 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
38122 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
38123 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
38124 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
38126 if (AsmPieces.size() == 3)
38128 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
38135 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
38136 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
38138 const std::string &AsmStr = IA->getAsmString();
38140 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
38141 if (!Ty || Ty->getBitWidth() % 16 != 0)
38144 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
38145 SmallVector<StringRef, 4> AsmPieces;
38146 SplitString(AsmStr, AsmPieces, ";\n");
38148 switch (AsmPieces.size()) {
38149 default: return false;
38151 // FIXME: this should verify that we are targeting a 486 or better. If not,
38152 // we will turn this bswap into something that will be lowered to logical
38153 // ops instead of emitting the bswap asm. For now, we don't support 486 or
38154 // lower so don't worry about this.
38156 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
38157 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
38158 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
38159 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
38160 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
38161 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
38162 // No need to check constraints, nothing other than the equivalent of
38163 // "=r,0" would be valid here.
38164 return IntrinsicLowering::LowerToByteSwap(CI);
38167 // rorw $$8, ${0:w} --> llvm.bswap.i16
38168 if (CI->getType()->isIntegerTy(16) &&
38169 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
38170 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
38171 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
38173 StringRef ConstraintsStr = IA->getConstraintString();
38174 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38175 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38176 if (clobbersFlagRegisters(AsmPieces))
38177 return IntrinsicLowering::LowerToByteSwap(CI);
38181 if (CI->getType()->isIntegerTy(32) &&
38182 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
38183 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
38184 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
38185 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
38187 StringRef ConstraintsStr = IA->getConstraintString();
38188 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38189 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38190 if (clobbersFlagRegisters(AsmPieces))
38191 return IntrinsicLowering::LowerToByteSwap(CI);
38194 if (CI->getType()->isIntegerTy(64)) {
38195 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
38196 if (Constraints.size() >= 2 &&
38197 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
38198 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
38199 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
38200 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
38201 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
38202 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
38203 return IntrinsicLowering::LowerToByteSwap(CI);
38211 /// Given a constraint letter, return the type of constraint for this target.
38212 X86TargetLowering::ConstraintType
38213 X86TargetLowering::getConstraintType(StringRef Constraint) const {
38214 if (Constraint.size() == 1) {
38215 switch (Constraint[0]) {
38227 case 'k': // AVX512 masking registers.
38228 return C_RegisterClass;
38252 else if (Constraint.size() == 2) {
38253 switch (Constraint[0]) {
38257 switch (Constraint[1]) {
38268 return C_RegisterClass;
38272 return TargetLowering::getConstraintType(Constraint);
38275 /// Examine constraint type and operand type and determine a weight value.
38276 /// This object must already have been set up with the operand type
38277 /// and the current alternative constraint selected.
38278 TargetLowering::ConstraintWeight
38279 X86TargetLowering::getSingleConstraintMatchWeight(
38280 AsmOperandInfo &info, const char *constraint) const {
38281 ConstraintWeight weight = CW_Invalid;
38282 Value *CallOperandVal = info.CallOperandVal;
38283 // If we don't have a value, we can't do a match,
38284 // but allow it at the lowest weight.
38285 if (!CallOperandVal)
38287 Type *type = CallOperandVal->getType();
38288 // Look at the constraint type.
38289 switch (*constraint) {
38291 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
38303 if (CallOperandVal->getType()->isIntegerTy())
38304 weight = CW_SpecificReg;
38309 if (type->isFloatingPointTy())
38310 weight = CW_SpecificReg;
38313 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38314 weight = CW_SpecificReg;
38317 unsigned Size = StringRef(constraint).size();
38318 // Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when matching 'Y'.
38319 char NextChar = Size == 2 ? constraint[1] : 'i';
38322 switch (NextChar) {
38328 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
38329 return CW_SpecificReg;
38331 // Conditional OpMask regs (AVX512)
38333 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38334 return CW_Register;
38338 if (type->isX86_MMXTy() && Subtarget.hasMMX())
38341 // Any SSE reg when ISA >= SSE2, same as 'Y'
38345 if (!Subtarget.hasSSE2())
38349 // Fall through (handle "Y" constraint).
38353 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
38354 weight = CW_Register;
38357 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
38358 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
38359 weight = CW_Register;
38362 // Enable conditional vector operations using %k<#> registers.
38363 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
38364 weight = CW_Register;
38367 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
38368 if (C->getZExtValue() <= 31)
38369 weight = CW_Constant;
38373 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38374 if (C->getZExtValue() <= 63)
38375 weight = CW_Constant;
38379 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38380 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
38381 weight = CW_Constant;
38385 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38386 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
38387 weight = CW_Constant;
38391 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38392 if (C->getZExtValue() <= 3)
38393 weight = CW_Constant;
38397 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38398 if (C->getZExtValue() <= 0xff)
38399 weight = CW_Constant;
38404 if (isa<ConstantFP>(CallOperandVal)) {
38405 weight = CW_Constant;
38409 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38410 if ((C->getSExtValue() >= -0x80000000LL) &&
38411 (C->getSExtValue() <= 0x7fffffffLL))
38412 weight = CW_Constant;
38416 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
38417 if (C->getZExtValue() <= 0xffffffff)
38418 weight = CW_Constant;
38425 /// Try to replace an X constraint, which matches anything, with another that
38426 /// has more specific requirements based on the type of the corresponding
38428 const char *X86TargetLowering::
38429 LowerXConstraint(EVT ConstraintVT) const {
38430 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
38431 // 'f' like normal targets.
38432 if (ConstraintVT.isFloatingPoint()) {
38433 if (Subtarget.hasSSE2())
38435 if (Subtarget.hasSSE1())
38439 return TargetLowering::LowerXConstraint(ConstraintVT);
38442 /// Lower the specified operand into the Ops vector.
38443 /// If it is invalid, don't add anything to Ops.
38444 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
38445 std::string &Constraint,
38446 std::vector<SDValue>&Ops,
38447 SelectionDAG &DAG) const {
38448 SDValue Result;
38450 // Only support length 1 constraints for now.
38451 if (Constraint.length() > 1) return;
38453 char ConstraintLetter = Constraint[0];
38454 switch (ConstraintLetter) {
38455 default: break;
38456 case 'I':
38457 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38458 if (C->getZExtValue() <= 31) {
38459 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38460 Op.getValueType());
38461 break;
38462 }
38463 }
38464 return;
38465 case 'J':
38466 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38467 if (C->getZExtValue() <= 63) {
38468 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38469 Op.getValueType());
38470 break;
38471 }
38472 }
38473 return;
38474 case 'K':
38475 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38476 if (isInt<8>(C->getSExtValue())) {
38477 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38478 Op.getValueType());
38479 break;
38480 }
38481 }
38482 return;
38483 case 'L':
38484 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38485 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
38486 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
38487 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
38488 Op.getValueType());
38489 break;
38490 }
38491 }
38492 return;
38493 case 'M':
38494 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38495 if (C->getZExtValue() <= 3) {
38496 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38497 Op.getValueType());
38498 break;
38499 }
38500 }
38501 return;
38502 case 'N':
38503 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38504 if (C->getZExtValue() <= 255) {
38505 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38506 Op.getValueType());
38507 break;
38508 }
38509 }
38510 return;
38511 case 'O':
38512 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38513 if (C->getZExtValue() <= 127) {
38514 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38515 Op.getValueType());
38516 break;
38517 }
38518 }
38519 return;
38520 case 'e': {
38521 // 32-bit signed value
38522 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38523 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38524 C->getSExtValue())) {
38525 // Widen to 64 bits here to get it sign extended.
38526 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
38527 break;
38528 }
38529 // FIXME gcc accepts some relocatable values here too, but only in certain
38530 // memory models; it's complicated.
38531 }
38532 return;
38533 }
38534 case 'Z': {
38535 // 32-bit unsigned value
38536 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
38537 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
38538 C->getZExtValue())) {
38539 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
38540 Op.getValueType());
38541 break;
38542 }
38543 }
38544 // FIXME gcc accepts some relocatable values here too, but only in certain
38545 // memory models; it's complicated.
38546 return;
38547 }
38548 case 'i': {
38549 // Literal immediates are always ok.
38550 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
38551 // Widen to 64 bits here to get it sign extended.
38552 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
38553 break;
38554 }
38556 // In any sort of PIC mode addresses need to be computed at runtime by
38557 // adding in a register or some sort of table lookup. These can't
38558 // be used as immediates.
38559 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
38560 return;
38562 // If we are in non-pic codegen mode, we allow the address of a global (with
38563 // an optional displacement) to be used with 'i'.
38564 GlobalAddressSDNode *GA = nullptr;
38565 int64_t Offset = 0;
38567 // Match either (GA), (GA+C), (GA+C1+C2), etc.
38568 while (1) {
38569 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
38570 Offset += GA->getOffset();
38571 break;
38572 } else if (Op.getOpcode() == ISD::ADD) {
38573 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38574 Offset += C->getZExtValue();
38575 Op = Op.getOperand(0);
38576 continue;
38577 }
38578 } else if (Op.getOpcode() == ISD::SUB) {
38579 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
38580 Offset += -C->getZExtValue();
38581 Op = Op.getOperand(0);
38582 continue;
38583 }
38584 }
38586 // Otherwise, this isn't something we can handle, reject it.
38587 return;
38588 }
38590 const GlobalValue *GV = GA->getGlobal();
38591 // If we require an extra load to get this address, as in PIC mode, we
38592 // can't accept it.
38593 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
38594 return;
38596 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
38597 GA->getValueType(0), Offset);
38598 break;
38599 }
38600 }
38602 if (Result.getNode()) {
38603 Ops.push_back(Result);
38604 return;
38605 }
38606 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
38607 }
38609 /// Check if \p RC is a general purpose register class.
38610 /// I.e., GR* or one of their variant.
38611 static bool isGRClass(const TargetRegisterClass &RC) {
38612 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
38613 RC.hasSuperClassEq(&X86::GR16RegClass) ||
38614 RC.hasSuperClassEq(&X86::GR32RegClass) ||
38615 RC.hasSuperClassEq(&X86::GR64RegClass) ||
38616 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
38617 }
38619 /// Check if \p RC is a vector register class.
38620 /// I.e., FR* / VR* or one of their variant.
38621 static bool isFRClass(const TargetRegisterClass &RC) {
38622 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
38623 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
38624 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
38625 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
38626 RC.hasSuperClassEq(&X86::VR512RegClass);
38627 }
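/// Map an inline-asm register constraint to a register class (and, for
/// explicit register names such as "{ax}" or "{st(0)}", a physical register)
/// that can hold a value of type VT.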
38629 std::pair<unsigned, const TargetRegisterClass *>
38630 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
38631 StringRef Constraint,
38632 MVT VT) const {
38633 // First, see if this is a constraint that directly corresponds to an LLVM
38634 // register class.
38635 if (Constraint.size() == 1) {
38636 // GCC Constraint Letters
38637 switch (Constraint[0]) {
38638 default: break;
38639 // TODO: Slight differences here in allocation order and leaving
38640 // RIP in the class. Do they matter any more here than they do
38641 // in the normal allocation?
38642 case 'k':
38643 if (Subtarget.hasAVX512()) {
38644 // Only supported in AVX512 or later.
38645 switch (VT.SimpleTy) {
38646 default: break;
38647 case MVT::i32:
38648 return std::make_pair(0U, &X86::VK32RegClass);
38649 case MVT::i16:
38650 return std::make_pair(0U, &X86::VK16RegClass);
38651 case MVT::i8:
38652 return std::make_pair(0U, &X86::VK8RegClass);
38653 case MVT::i1:
38654 return std::make_pair(0U, &X86::VK1RegClass);
38655 case MVT::i64:
38656 return std::make_pair(0U, &X86::VK64RegClass);
38657 }
38658 }
38659 break;
38660 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
38661 if (Subtarget.is64Bit()) {
38662 if (VT == MVT::i32 || VT == MVT::f32)
38663 return std::make_pair(0U, &X86::GR32RegClass);
38664 if (VT == MVT::i16)
38665 return std::make_pair(0U, &X86::GR16RegClass);
38666 if (VT == MVT::i8 || VT == MVT::i1)
38667 return std::make_pair(0U, &X86::GR8RegClass);
38668 if (VT == MVT::i64 || VT == MVT::f64)
38669 return std::make_pair(0U, &X86::GR64RegClass);
38670 break;
38671 }
38672 LLVM_FALLTHROUGH;
38673 // 32-bit fallthrough
38674 case 'Q': // Q_REGS
38675 if (VT == MVT::i32 || VT == MVT::f32)
38676 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
38677 if (VT == MVT::i16)
38678 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
38679 if (VT == MVT::i8 || VT == MVT::i1)
38680 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
38681 if (VT == MVT::i64)
38682 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
38683 break;
38684 case 'r': // GENERAL_REGS
38685 case 'l': // INDEX_REGS
38686 if (VT == MVT::i8 || VT == MVT::i1)
38687 return std::make_pair(0U, &X86::GR8RegClass);
38688 if (VT == MVT::i16)
38689 return std::make_pair(0U, &X86::GR16RegClass);
38690 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
38691 return std::make_pair(0U, &X86::GR32RegClass);
38692 return std::make_pair(0U, &X86::GR64RegClass);
38693 case 'R': // LEGACY_REGS
38694 if (VT == MVT::i8 || VT == MVT::i1)
38695 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
38696 if (VT == MVT::i16)
38697 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
38698 if (VT == MVT::i32 || !Subtarget.is64Bit())
38699 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
38700 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
38701 case 'f': // FP Stack registers.
38702 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
38703 // value to the correct fpstack register class.
38704 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
38705 return std::make_pair(0U, &X86::RFP32RegClass);
38706 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
38707 return std::make_pair(0U, &X86::RFP64RegClass);
38708 return std::make_pair(0U, &X86::RFP80RegClass);
38709 case 'y': // MMX_REGS if MMX allowed.
38710 if (!Subtarget.hasMMX()) break;
38711 return std::make_pair(0U, &X86::VR64RegClass);
38712 case 'Y': // SSE_REGS if SSE2 allowed
38713 if (!Subtarget.hasSSE2()) break;
38714 LLVM_FALLTHROUGH;
38715 case 'v':
38716 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
38717 if (!Subtarget.hasSSE1()) break;
38718 bool VConstraint = (Constraint[0] == 'v');
38720 switch (VT.SimpleTy) {
38721 default: break;
38722 // Scalar SSE types.
38723 case MVT::f32:
38724 case MVT::i32:
38725 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
38726 return std::make_pair(0U, &X86::FR32XRegClass);
38727 return std::make_pair(0U, &X86::FR32RegClass);
38728 case MVT::f64:
38729 case MVT::i64:
38730 if (VConstraint && Subtarget.hasVLX())
38731 return std::make_pair(0U, &X86::FR64XRegClass);
38732 return std::make_pair(0U, &X86::FR64RegClass);
38733 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
38734 // Vector types.
38735 case MVT::v16i8:
38736 case MVT::v8i16:
38737 case MVT::v4i32:
38738 case MVT::v2i64:
38739 case MVT::v4f32:
38740 case MVT::v2f64:
38741 if (VConstraint && Subtarget.hasVLX())
38742 return std::make_pair(0U, &X86::VR128XRegClass);
38743 return std::make_pair(0U, &X86::VR128RegClass);
38744 // AVX types.
38745 case MVT::v32i8:
38746 case MVT::v16i16:
38747 case MVT::v8i32:
38748 case MVT::v4i64:
38749 case MVT::v8f32:
38750 case MVT::v4f64:
38751 if (VConstraint && Subtarget.hasVLX())
38752 return std::make_pair(0U, &X86::VR256XRegClass);
38753 return std::make_pair(0U, &X86::VR256RegClass);
38754 case MVT::v8f64:
38755 case MVT::v16f32:
38756 case MVT::v16i32:
38757 case MVT::v8i64:
38758 return std::make_pair(0U, &X86::VR512RegClass);
38759 }
38760 break;
38761 }
38762 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
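// Two-letter "Y<x>" constraints: "Yi", "Yt" and "Y2" are treated like plain
// 'Y' (SSE2 registers), "Ym" selects MMX registers, "Yz"/"Y0" pins the
// operand to %xmm0, and "Yk" selects AVX-512 mask registers other than %k0.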
38763 switch (Constraint[1]) {
38764 default:
38765 break;
38766 case 'i':
38767 case 't':
38768 case '2':
38769 return getRegForInlineAsmConstraint(TRI, "Y", VT);
38770 case 'm':
38771 if (!Subtarget.hasMMX()) break;
38772 return std::make_pair(0U, &X86::VR64RegClass);
38773 case 'z':
38774 case '0':
38775 if (!Subtarget.hasSSE1()) break;
38776 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
38777 case 'k':
38778 // This register class doesn't allocate k0 for masked vector operations.
38779 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
38780 switch (VT.SimpleTy) {
38781 default: break;
38782 case MVT::i32:
38783 return std::make_pair(0U, &X86::VK32WMRegClass);
38784 case MVT::i16:
38785 return std::make_pair(0U, &X86::VK16WMRegClass);
38786 case MVT::i8:
38787 return std::make_pair(0U, &X86::VK8WMRegClass);
38788 case MVT::i1:
38789 return std::make_pair(0U, &X86::VK1WMRegClass);
38790 case MVT::i64:
38791 return std::make_pair(0U, &X86::VK64WMRegClass);
38792 }
38793 }
38794 break;
38795 }
38796 }
38798 // Use the default implementation in TargetLowering to convert the register
38799 // constraint into a member of a register class.
38800 std::pair<unsigned, const TargetRegisterClass*> Res;
38801 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
38803 // Not found as a standard register?
38804 if (!Res.second) {
38805 // Map st(0) -> st(7) -> ST0
38806 if (Constraint.size() == 7 && Constraint[0] == '{' &&
38807 tolower(Constraint[1]) == 's' &&
38808 tolower(Constraint[2]) == 't' &&
38809 Constraint[3] == '(' &&
38810 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
38811 Constraint[5] == ')' &&
38812 Constraint[6] == '}') {
38814 Res.first = X86::FP0+Constraint[4]-'0';
38815 Res.second = &X86::RFP80RegClass;
38816 return Res;
38817 }
38819 // GCC allows "st(0)" to be called just plain "st".
38820 if (StringRef("{st}").equals_lower(Constraint)) {
38821 Res.first = X86::FP0;
38822 Res.second = &X86::RFP80RegClass;
38823 return Res;
38824 }
38826 // flags -> EFLAGS
38827 if (StringRef("{flags}").equals_lower(Constraint)) {
38828 Res.first = X86::EFLAGS;
38829 Res.second = &X86::CCRRegClass;
38830 return Res;
38831 }
38833 // 'A' means [ER]AX + [ER]DX.
38834 if (Constraint == "A") {
38835 if (Subtarget.is64Bit()) {
38836 Res.first = X86::RAX;
38837 Res.second = &X86::GR64_ADRegClass;
38838 } else {
38839 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
38840 "Expecting 64, 32 or 16 bit subtarget");
38841 Res.first = X86::EAX;
38842 Res.second = &X86::GR32_ADRegClass;
38843 }
38844 return Res;
38845 }
38846 return Res;
38847 }
38849 // Otherwise, check to see if this is a register class of the wrong value
38850 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
38851 // turn into {ax},{dx}.
38852 // MVT::Other is used to specify clobber names.
38853 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
38854 return Res; // Correct type already, nothing to do.
38856 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
38857 // return "eax". This should even work for things like getting 64bit integer
38858 // registers when given an f64 type.
38859 const TargetRegisterClass *Class = Res.second;
38860 // The generic code will match the first register class that contains the
38861 // given register. Thus, based on the ordering of the tablegened file,
38862 // the "plain" GR classes might not come first.
38863 // Therefore, use a helper method.
38864 if (isGRClass(*Class)) {
38865 unsigned Size = VT.getSizeInBits();
38866 if (Size == 1) Size = 8;
38867 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
38868 if (DestReg > 0) {
38869 bool is64Bit = Subtarget.is64Bit();
38870 const TargetRegisterClass *RC =
38871 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
38872 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
38873 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
38874 : &X86::GR64RegClass;
38875 if (RC->contains(DestReg))
38876 Res = std::make_pair(DestReg, RC);
38877 } else {
38878 // No register found/type mismatch.
38879 Res.first = 0;
38880 Res.second = nullptr;
38881 }
38882 } else if (isFRClass(*Class)) {
38883 // Handle references to XMM physical registers that got mapped into the
38884 // wrong class. This can happen with constraints like {xmm0} where the
38885 // target independent register mapper will just pick the first match it can
38886 // find, ignoring the required type.
38888 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
38889 if (VT == MVT::f32 || VT == MVT::i32)
38890 Res.second = &X86::FR32RegClass;
38891 else if (VT == MVT::f64 || VT == MVT::i64)
38892 Res.second = &X86::FR64RegClass;
38893 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
38894 Res.second = &X86::VR128RegClass;
38895 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
38896 Res.second = &X86::VR256RegClass;
38897 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
38898 Res.second = &X86::VR512RegClass;
38899 else {
38900 // Type mismatch and not a clobber: Return an error;
38901 Res.first = 0;
38902 Res.second = nullptr;
38903 }
38904 }
38906 return Res;
38907 }
38909 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
38910 const AddrMode &AM, Type *Ty,
38911 unsigned AS) const {
38912 // Scaling factors are not free at all.
38913 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
38914 // will take 2 allocations in the out of order engine instead of 1
38915 // for plain addressing mode, i.e. inst (reg1).
38916 // E.g.,
38917 // vaddps (%rsi,%rdx), %ymm0, %ymm1
38918 // Requires two allocations (one for the load, one for the computation)
38919 // whereas:
38920 // vaddps (%rsi), %ymm0, %ymm1
38921 // Requires just 1 allocation, i.e., freeing allocations for other operations
38922 // and having less micro operations to execute.
38924 // For some X86 architectures, this is even worse because for instance for
38925 // stores, the complex addressing mode forces the instruction to use the
38926 // "load" ports instead of the dedicated "store" port.
38927 // E.g., on Haswell:
38928 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
38929 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
38930 if (isLegalAddressingMode(DL, AM, Ty, AS))
38931 // Scale represents reg2 * scale, thus account for 1
38932 // as soon as we use a second register.
38933 return AM.Scale != 0;
38934 return -1;
38935 }
38937 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
38938 // Integer division on x86 is expensive. However, when aggressively optimizing
38939 // for code size, we prefer to use a div instruction, as it is usually smaller
38940 // than the alternative sequence.
38941 // The exception to this is vector division. Since x86 doesn't have vector
38942 // integer division, leaving the division as-is is a loss even in terms of
38943 // size, because it will have to be scalarized, while the alternative code
38944 // sequence can be performed in vector form.
38945 bool OptSize =
38946 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
38947 return OptSize && !VT.isVector();
38948 }
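// Split-CSR lowering (used for the CXX_FAST_TLS calling convention) copies
// callee-saved registers into virtual registers at function entry and
// restores them before each return, instead of the usual prologue/epilogue
// spills; X86 only enables it for 64-bit targets.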
38950 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
38951 if (!Subtarget.is64Bit())
38952 return;
38954 // Update IsSplitCSR in X86MachineFunctionInfo.
38955 X86MachineFunctionInfo *AFI =
38956 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
38957 AFI->setIsSplitCSR(true);
38958 }
38960 void X86TargetLowering::insertCopiesSplitCSR(
38961 MachineBasicBlock *Entry,
38962 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
38963 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
38964 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
38965 if (!IStart)
38966 return;
38968 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
38969 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
38970 MachineBasicBlock::iterator MBBI = Entry->begin();
38971 for (const MCPhysReg *I = IStart; *I; ++I) {
38972 const TargetRegisterClass *RC = nullptr;
38973 if (X86::GR64RegClass.contains(*I))
38974 RC = &X86::GR64RegClass;
38975 else
38976 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
38978 unsigned NewVR = MRI->createVirtualRegister(RC);
38979 // Create copy from CSR to a virtual register.
38980 // FIXME: this currently does not emit CFI pseudo-instructions, it works
38981 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
38982 // nounwind. If we want to generalize this later, we may need to emit
38983 // CFI pseudo-instructions.
38984 assert(Entry->getParent()->getFunction().hasFnAttribute(
38985 Attribute::NoUnwind) &&
38986 "Function should be nounwind in insertCopiesSplitCSR!");
38987 Entry->addLiveIn(*I);
38988 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
38989 .addReg(*I);
38991 // Insert the copy-back instructions right before the terminator.
38992 for (auto *Exit : Exits)
38993 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
38994 TII->get(TargetOpcode::COPY), *I)
38995 .addReg(NewVR);
38996 }
38997 }
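/// Swift error values ("swifterror" arguments and results) are only
/// supported when targeting 64-bit x86.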
38999 bool X86TargetLowering::supportSwiftError() const {
39000 return Subtarget.is64Bit();
39001 }
39003 /// Returns the name of the symbol used to emit stack probes or the empty
39004 /// string if not applicable.
39005 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
39006 // If the function specifically requests stack probes, emit them.
39007 if (MF.getFunction().hasFnAttribute("probe-stack"))
39008 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
39010 // Generally, if we aren't on Windows, the platform ABI does not include
39011 // support for stack probes, so don't emit them.
39012 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
39013 return "";
39015 // We need a stack probe to conform to the Windows ABI. Choose the right
39016 // symbol.
39017 if (Subtarget.is64Bit())
39018 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
39019 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";