1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
13 //===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/TargetLowering.h"
39 #include "llvm/CodeGen/WinEHFuncInfo.h"
40 #include "llvm/IR/CallSite.h"
41 #include "llvm/IR/CallingConv.h"
42 #include "llvm/IR/Constants.h"
43 #include "llvm/IR/DerivedTypes.h"
44 #include "llvm/IR/DiagnosticInfo.h"
45 #include "llvm/IR/Function.h"
46 #include "llvm/IR/GlobalAlias.h"
47 #include "llvm/IR/GlobalVariable.h"
48 #include "llvm/IR/Instructions.h"
49 #include "llvm/IR/Intrinsics.h"
50 #include "llvm/MC/MCAsmInfo.h"
51 #include "llvm/MC/MCContext.h"
52 #include "llvm/MC/MCExpr.h"
53 #include "llvm/MC/MCSymbol.h"
54 #include "llvm/Support/CommandLine.h"
55 #include "llvm/Support/Debug.h"
56 #include "llvm/Support/ErrorHandling.h"
57 #include "llvm/Support/KnownBits.h"
58 #include "llvm/Support/MathExtras.h"
59 #include "llvm/Target/TargetOptions.h"
64 using namespace llvm;
66 #define DEBUG_TYPE "x86-isel"
68 STATISTIC(NumTailCalls, "Number of tail calls");
70 static cl::opt<bool> ExperimentalVectorWideningLegalization(
71 "x86-experimental-vector-widening-legalization", cl::init(false),
72 cl::desc("Enable an experimental vector type legalization through widening "
73 "rather than promotion."),
74 cl::Hidden);
76 static cl::opt<int> ExperimentalPrefLoopAlignment(
77 "x86-experimental-pref-loop-alignment", cl::init(4),
78 cl::desc("Sets the preferable loop alignment for experiments "
79 "(the last x86-experimental-pref-loop-alignment bits"
80 " of the loop header PC will be 0)."),
81 cl::Hidden);
83 static cl::opt<bool> MulConstantOptimization(
84 "mul-constant-optimization", cl::init(true),
85 cl::desc("Replace 'mul x, Const' with more effective instructions like "
86 "SHIFT, LEA, etc."),
87 cl::Hidden);
89 /// Call this when the user attempts to do something unsupported, like
90 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
91 /// report_fatal_error, so calling code should attempt to recover without
92 /// crashing.
93 static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
94 const char *Msg) {
95 MachineFunction &MF = DAG.getMachineFunction();
96 DAG.getContext()->diagnose(
97 DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
98 }
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101 const X86Subtarget &STI)
102 : TargetLowering(TM), Subtarget(STI) {
103 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104 X86ScalarSSEf64 = Subtarget.hasSSE2();
105 X86ScalarSSEf32 = Subtarget.hasSSE1();
106 MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
108 // Set up the TargetLowering object.
110 // X86 is weird. It always uses i8 for shift amounts and setcc results.
111 setBooleanContents(ZeroOrOneBooleanContent);
112 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
115 // For 64-bit, since we have so many registers, use the ILP scheduler.
116 // For 32-bit, use the register pressure specific scheduling.
117 // For Atom, always use ILP scheduling.
118 if (Subtarget.isAtom())
119 setSchedulingPreference(Sched::ILP);
120 else if (Subtarget.is64Bit())
121 setSchedulingPreference(Sched::ILP);
122 else
123 setSchedulingPreference(Sched::RegPressure);
124 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
127 // Bypass expensive divides and use cheaper ones.
128 if (TM.getOptLevel() >= CodeGenOpt::Default) {
129 if (Subtarget.hasSlowDivide32())
130 addBypassSlowDiv(32, 8);
131 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
132 addBypassSlowDiv(64, 32);
133 }
135 if (Subtarget.isTargetKnownWindowsMSVC() ||
136 Subtarget.isTargetWindowsItanium()) {
137 // Setup Windows compiler runtime calls.
138 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140 setLibcallName(RTLIB::SREM_I64, "_allrem");
141 setLibcallName(RTLIB::UREM_I64, "_aullrem");
142 setLibcallName(RTLIB::MUL_I64, "_allmul");
143 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
147 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
148 }
150 if (Subtarget.isTargetDarwin()) {
151 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152 setUseUnderscoreSetJmp(false);
153 setUseUnderscoreLongJmp(false);
154 } else if (Subtarget.isTargetWindowsGNU()) {
155 // MS runtime is weird: it exports _setjmp, but longjmp!
156 setUseUnderscoreSetJmp(true);
157 setUseUnderscoreLongJmp(false);
158 } else {
159 setUseUnderscoreSetJmp(true);
160 setUseUnderscoreLongJmp(true);
161 }
163 // Set up the register classes.
164 addRegisterClass(MVT::i8, &X86::GR8RegClass);
165 addRegisterClass(MVT::i16, &X86::GR16RegClass);
166 addRegisterClass(MVT::i32, &X86::GR32RegClass);
167 if (Subtarget.is64Bit())
168 addRegisterClass(MVT::i64, &X86::GR64RegClass);
170 for (MVT VT : MVT::integer_valuetypes())
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
173 // We don't accept any truncstore of integer registers.
174 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
181 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
183 // SETOEQ and SETUNE require checking two conditions.
184 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
191 // Integer absolute.
192 if (Subtarget.hasCMov()) {
193 setOperationAction(ISD::ABS , MVT::i16 , Custom);
194 setOperationAction(ISD::ABS , MVT::i32 , Custom);
195 if (Subtarget.is64Bit())
196 setOperationAction(ISD::ABS , MVT::i64 , Custom);
197 }
199 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
200 // operation.
201 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
202 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
203 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
205 if (Subtarget.is64Bit()) {
206 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
207 // f32/f64 are legal, f80 is custom.
208 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
209 else
210 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
211 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
212 } else if (!Subtarget.useSoftFloat()) {
213 // We have an algorithm for SSE2->double, and we turn this into a
214 // 64-bit FILD followed by conditional FADD for other targets.
215 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
216 // We have an algorithm for SSE2, and we turn this into a 64-bit
217 // FILD or VCVTUSI2SS/SD for other targets.
218 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
219 } else
220 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
223 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
224 // this operation.
225 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
226 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
228 if (!Subtarget.useSoftFloat()) {
229 // SSE has no i16 to fp conversion, only i32.
230 if (X86ScalarSSEf32) {
231 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
232 // f32 and f64 cases are Legal, f80 case is not
233 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
234 } else {
235 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
236 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
237 }
238 } else {
239 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
240 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
241 }
243 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
244 // this operation.
245 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
246 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
248 if (!Subtarget.useSoftFloat()) {
249 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
250 // are Legal, f80 is custom lowered.
251 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
252 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
254 if (X86ScalarSSEf32) {
255 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
256 // f32 and f64 cases are Legal, f80 case is not
257 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
258 } else {
259 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
260 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
261 }
262 } else {
263 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
264 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
265 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
266 }
268 // Handle FP_TO_UINT by promoting the destination to a larger signed
269 // conversion.
270 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
271 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
272 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
274 if (Subtarget.is64Bit()) {
275 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
276 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
277 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
278 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
279 } else {
280 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
281 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
282 }
283 } else if (!Subtarget.useSoftFloat()) {
284 // Since AVX is a superset of SSE3, only check for SSE here.
285 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
286 // Expand FP_TO_UINT into a select.
287 // FIXME: We would like to use a Custom expander here eventually to do
288 // the optimal thing for SSE vs. the default expansion in the legalizer.
289 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
290 else
291 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
292 // With SSE3 we can use fisttpll to convert to a signed i64; without
293 // SSE, we're stuck with a fistpll.
294 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
296 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
297 }
299 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
300 if (!X86ScalarSSEf64) {
301 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
302 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
303 if (Subtarget.is64Bit()) {
304 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
305 // Without SSE, i64->f64 goes through memory.
306 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
307 }
308 } else if (!Subtarget.is64Bit())
309 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
311 // Scalar integer divide and remainder are lowered to use operations that
312 // produce two results, to match the available instructions. This exposes
313 // the two-result form to trivial CSE, which is able to combine x/y and x%y
314 // into a single instruction.
316 // Scalar integer multiply-high is also lowered to use two-result
317 // operations, to match the available instructions. However, plain multiply
318 // (low) operations are left as Legal, as there are single-result
319 // instructions for this in x86. Using the two-result multiply instructions
320 // when both high and low results are needed must be arranged by dagcombine.
321 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
322 setOperationAction(ISD::MULHS, VT, Expand);
323 setOperationAction(ISD::MULHU, VT, Expand);
324 setOperationAction(ISD::SDIV, VT, Expand);
325 setOperationAction(ISD::UDIV, VT, Expand);
326 setOperationAction(ISD::SREM, VT, Expand);
327 setOperationAction(ISD::UREM, VT, Expand);
328 }
330 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
331 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
332 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
333 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
334 setOperationAction(ISD::BR_CC, VT, Expand);
335 setOperationAction(ISD::SELECT_CC, VT, Expand);
336 }
337 if (Subtarget.is64Bit())
338 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
339 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
340 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
341 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
342 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
344 setOperationAction(ISD::FREM , MVT::f32 , Expand);
345 setOperationAction(ISD::FREM , MVT::f64 , Expand);
346 setOperationAction(ISD::FREM , MVT::f80 , Expand);
347 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
349 // Promote the i8 variants and force them on up to i32 which has a shorter
350 // encoding.
351 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
352 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
353 if (!Subtarget.hasBMI()) {
354 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
355 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
356 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
357 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
358 if (Subtarget.is64Bit()) {
359 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
360 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
361 }
362 }
364 if (Subtarget.hasLZCNT()) {
365 // When promoting the i8 variants, force them to i32 for a shorter
366 // encoding.
367 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
368 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
369 } else {
370 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
371 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
372 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
373 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
374 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
375 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
376 if (Subtarget.is64Bit()) {
377 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
378 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
379 }
380 }
382 // Special handling for half-precision floating point conversions.
383 // If we don't have F16C support, then lower half float conversions
384 // into library calls.
385 if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
386 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
387 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
388 }
390 // There's never any support for operations beyond MVT::f32.
391 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
392 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
393 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
394 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
396 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
397 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
398 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
399 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
400 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
401 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
403 if (Subtarget.hasPOPCNT()) {
404 setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
405 } else {
406 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
407 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
408 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
409 if (Subtarget.is64Bit())
410 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
411 }
413 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
415 if (!Subtarget.hasMOVBE())
416 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
418 // These should be promoted to a larger select which is supported.
419 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
420 // X86 wants to expand cmov itself.
421 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
422 setOperationAction(ISD::SELECT, VT, Custom);
423 setOperationAction(ISD::SETCC, VT, Custom);
424 }
425 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
426 if (VT == MVT::i64 && !Subtarget.is64Bit())
427 continue;
428 setOperationAction(ISD::SELECT, VT, Custom);
429 setOperationAction(ISD::SETCC, VT, Custom);
430 }
432 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
433 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
434 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
436 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
437 // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
438 // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
439 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
440 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
441 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
442 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
443 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
446 for (auto VT : { MVT::i32, MVT::i64 }) {
447 if (VT == MVT::i64 && !Subtarget.is64Bit())
448 continue;
449 setOperationAction(ISD::ConstantPool , VT, Custom);
450 setOperationAction(ISD::JumpTable , VT, Custom);
451 setOperationAction(ISD::GlobalAddress , VT, Custom);
452 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
453 setOperationAction(ISD::ExternalSymbol , VT, Custom);
454 setOperationAction(ISD::BlockAddress , VT, Custom);
455 }
457 // 64-bit shl, sra, srl (iff 32-bit x86)
458 for (auto VT : { MVT::i32, MVT::i64 }) {
459 if (VT == MVT::i64 && !Subtarget.is64Bit())
460 continue;
461 setOperationAction(ISD::SHL_PARTS, VT, Custom);
462 setOperationAction(ISD::SRA_PARTS, VT, Custom);
463 setOperationAction(ISD::SRL_PARTS, VT, Custom);
464 }
466 if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
467 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
469 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
471 // Expand certain atomics
472 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
473 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
474 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
475 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
476 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
477 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
478 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
479 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
480 }
482 if (Subtarget.hasCmpxchg16b()) {
483 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
484 }
486 // FIXME - use subtarget debug flags
487 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
488 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
489 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
490 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
491 }
493 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
494 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
496 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
497 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
499 setOperationAction(ISD::TRAP, MVT::Other, Legal);
500 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
502 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
503 setOperationAction(ISD::VASTART , MVT::Other, Custom);
504 setOperationAction(ISD::VAEND , MVT::Other, Expand);
505 bool Is64Bit = Subtarget.is64Bit();
506 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
507 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
509 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
510 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
512 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
514 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
515 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
516 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
518 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
519 // f32 and f64 use SSE.
520 // Set up the FP register classes.
521 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
522 : &X86::FR32RegClass);
523 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
524 : &X86::FR64RegClass);
526 for (auto VT : { MVT::f32, MVT::f64 }) {
527 // Use ANDPD to simulate FABS.
528 setOperationAction(ISD::FABS, VT, Custom);
530 // Use XORP to simulate FNEG.
531 setOperationAction(ISD::FNEG, VT, Custom);
533 // Use ANDPD and ORPD to simulate FCOPYSIGN.
534 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
536 // We don't support sin/cos/fmod
537 setOperationAction(ISD::FSIN , VT, Expand);
538 setOperationAction(ISD::FCOS , VT, Expand);
539 setOperationAction(ISD::FSINCOS, VT, Expand);
540 }
542 // Lower this to MOVMSK plus an AND.
543 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
544 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
546 // Expand FP immediates into loads from the stack, except for the special
547 // cases we handle.
548 addLegalFPImmediate(APFloat(+0.0)); // xorpd
549 addLegalFPImmediate(APFloat(+0.0f)); // xorps
550 } else if (UseX87 && X86ScalarSSEf32) {
551 // Use SSE for f32, x87 for f64.
552 // Set up the FP register classes.
553 addRegisterClass(MVT::f32, &X86::FR32RegClass);
554 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
556 // Use ANDPS to simulate FABS.
557 setOperationAction(ISD::FABS , MVT::f32, Custom);
559 // Use XORP to simulate FNEG.
560 setOperationAction(ISD::FNEG , MVT::f32, Custom);
562 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
564 // Use ANDPS and ORPS to simulate FCOPYSIGN.
565 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
566 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
568 // We don't support sin/cos/fmod
569 setOperationAction(ISD::FSIN , MVT::f32, Expand);
570 setOperationAction(ISD::FCOS , MVT::f32, Expand);
571 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
573 // Special cases we handle for FP constants.
574 addLegalFPImmediate(APFloat(+0.0f)); // xorps
575 addLegalFPImmediate(APFloat(+0.0)); // FLD0
576 addLegalFPImmediate(APFloat(+1.0)); // FLD1
577 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
578 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
580 // Always expand sin/cos functions even though x87 has an instruction.
581 setOperationAction(ISD::FSIN , MVT::f64, Expand);
582 setOperationAction(ISD::FCOS , MVT::f64, Expand);
583 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
584 } else if (UseX87) {
585 // f32 and f64 in x87.
586 // Set up the FP register classes.
587 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
588 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
590 for (auto VT : { MVT::f32, MVT::f64 }) {
591 setOperationAction(ISD::UNDEF, VT, Expand);
592 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
594 // Always expand sin/cos functions even though x87 has an instruction.
595 setOperationAction(ISD::FSIN , VT, Expand);
596 setOperationAction(ISD::FCOS , VT, Expand);
597 setOperationAction(ISD::FSINCOS, VT, Expand);
598 }
599 addLegalFPImmediate(APFloat(+0.0)); // FLD0
600 addLegalFPImmediate(APFloat(+1.0)); // FLD1
601 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
602 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
603 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
604 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
605 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
606 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
607 }
609 // We don't support FMA.
610 setOperationAction(ISD::FMA, MVT::f64, Expand);
611 setOperationAction(ISD::FMA, MVT::f32, Expand);
613 // Long double always uses X87, except f128 in MMX.
614 if (UseX87) {
615 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
616 addRegisterClass(MVT::f128, &X86::FR128RegClass);
617 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
618 setOperationAction(ISD::FABS , MVT::f128, Custom);
619 setOperationAction(ISD::FNEG , MVT::f128, Custom);
620 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
621 }
623 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
624 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
625 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
626 {
627 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
628 addLegalFPImmediate(TmpFlt); // FLD0
629 TmpFlt.changeSign();
630 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
632 bool ignored;
633 APFloat TmpFlt2(+1.0);
634 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
635 &ignored);
636 addLegalFPImmediate(TmpFlt2); // FLD1
637 TmpFlt2.changeSign();
638 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
639 }
641 // Always expand sin/cos functions even though x87 has an instruction.
642 setOperationAction(ISD::FSIN , MVT::f80, Expand);
643 setOperationAction(ISD::FCOS , MVT::f80, Expand);
644 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
646 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
647 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
648 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
649 setOperationAction(ISD::FRINT, MVT::f80, Expand);
650 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
651 setOperationAction(ISD::FMA, MVT::f80, Expand);
652 }
654 // Always use a library call for pow.
655 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
656 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
657 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
659 setOperationAction(ISD::FLOG, MVT::f80, Expand);
660 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
661 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
662 setOperationAction(ISD::FEXP, MVT::f80, Expand);
663 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
664 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
665 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
667 // Some FP actions are always expanded for vector types.
668 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
669 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
670 setOperationAction(ISD::FSIN, VT, Expand);
671 setOperationAction(ISD::FSINCOS, VT, Expand);
672 setOperationAction(ISD::FCOS, VT, Expand);
673 setOperationAction(ISD::FREM, VT, Expand);
674 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
675 setOperationAction(ISD::FPOW, VT, Expand);
676 setOperationAction(ISD::FLOG, VT, Expand);
677 setOperationAction(ISD::FLOG2, VT, Expand);
678 setOperationAction(ISD::FLOG10, VT, Expand);
679 setOperationAction(ISD::FEXP, VT, Expand);
680 setOperationAction(ISD::FEXP2, VT, Expand);
681 }
683 // First set operation action for all vector types to either promote
684 // (for widening) or expand (for scalarization). Then we will selectively
685 // turn on ones that can be effectively codegen'd.
686 for (MVT VT : MVT::vector_valuetypes()) {
687 setOperationAction(ISD::SDIV, VT, Expand);
688 setOperationAction(ISD::UDIV, VT, Expand);
689 setOperationAction(ISD::SREM, VT, Expand);
690 setOperationAction(ISD::UREM, VT, Expand);
691 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
692 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
693 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
694 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
695 setOperationAction(ISD::FMA, VT, Expand);
696 setOperationAction(ISD::FFLOOR, VT, Expand);
697 setOperationAction(ISD::FCEIL, VT, Expand);
698 setOperationAction(ISD::FTRUNC, VT, Expand);
699 setOperationAction(ISD::FRINT, VT, Expand);
700 setOperationAction(ISD::FNEARBYINT, VT, Expand);
701 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
702 setOperationAction(ISD::MULHS, VT, Expand);
703 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
704 setOperationAction(ISD::MULHU, VT, Expand);
705 setOperationAction(ISD::SDIVREM, VT, Expand);
706 setOperationAction(ISD::UDIVREM, VT, Expand);
707 setOperationAction(ISD::CTPOP, VT, Expand);
708 setOperationAction(ISD::CTTZ, VT, Expand);
709 setOperationAction(ISD::CTLZ, VT, Expand);
710 setOperationAction(ISD::ROTL, VT, Expand);
711 setOperationAction(ISD::ROTR, VT, Expand);
712 setOperationAction(ISD::BSWAP, VT, Expand);
713 setOperationAction(ISD::SETCC, VT, Expand);
714 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
715 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
716 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
717 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
718 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
719 setOperationAction(ISD::TRUNCATE, VT, Expand);
720 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
721 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
722 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
723 setOperationAction(ISD::SELECT_CC, VT, Expand);
724 for (MVT InnerVT : MVT::vector_valuetypes()) {
725 setTruncStoreAction(InnerVT, VT, Expand);
727 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
728 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
730 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
731 // types, we have to deal with them whether we ask for Expansion or not.
732 // Setting Expand causes its own optimisation problems though, so leave
733 // them legal.
734 if (VT.getVectorElementType() == MVT::i1)
735 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
737 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
738 // split/scalarized right now.
739 if (VT.getVectorElementType() == MVT::f16)
740 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
741 }
742 }
744 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
745 // with -msoft-float, disable use of MMX as well.
746 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
747 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
748 // No operations on x86mmx supported, everything uses intrinsics.
749 }
751 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
752 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
753 : &X86::VR128RegClass);
755 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
756 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
757 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
758 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
759 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
760 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
761 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
762 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
763 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
764 }
766 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
767 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
768 : &X86::VR128RegClass);
770 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
771 // registers cannot be used even for integer operations.
772 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
773 : &X86::VR128RegClass);
774 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
775 : &X86::VR128RegClass);
776 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
777 : &X86::VR128RegClass);
778 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
779 : &X86::VR128RegClass);
781 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
782 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
783 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
784 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
785 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
786 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
787 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
788 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
789 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
790 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
791 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
792 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
793 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
795 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
796 setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
797 setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
798 setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
799 setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
800 }
802 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
803 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
804 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
806 // Provide custom widening for v2f32 setcc. This is really for VLX when
807 // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
808 // type legalization changing the result type to v4i1 during widening.
809 // It works fine for SSE2 and is probably faster so no need to qualify with
810 // SSE2.
811 setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
813 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
814 setOperationAction(ISD::SETCC, VT, Custom);
815 setOperationAction(ISD::CTPOP, VT, Custom);
816 setOperationAction(ISD::CTTZ, VT, Custom);
818 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
819 // setcc all the way to isel and prefer SETGT in some isel patterns.
820 setCondCodeAction(ISD::SETLT, VT, Custom);
821 setCondCodeAction(ISD::SETLE, VT, Custom);
822 }
824 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
825 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
826 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
827 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
828 setOperationAction(ISD::VSELECT, VT, Custom);
829 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
830 }
832 // We support custom legalizing of sext and anyext loads for specific
833 // memory vector types which we can load as a scalar (or sequence of
834 // scalars) and extend in-register to a legal 128-bit vector type. For sext
835 // loads these must work with a single scalar load.
836 for (MVT VT : MVT::integer_vector_valuetypes()) {
837 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
838 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
839 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
840 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
841 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
842 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
843 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
844 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
845 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
846 }
848 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
849 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
850 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
851 setOperationAction(ISD::VSELECT, VT, Custom);
853 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
854 continue;
856 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
857 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
858 }
860 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
861 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
862 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
863 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
864 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
865 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
866 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
867 }
869 // Custom lower v2i64 and v2f64 selects.
870 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
871 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
873 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
874 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
876 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
877 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
879 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
881 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
882 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
884 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
885 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
887 for (MVT VT : MVT::fp_vector_valuetypes())
888 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
890 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
891 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
892 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
893 if (!Subtarget.hasAVX512())
894 setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
896 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
897 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
898 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
900 // In the customized shift lowering, the legal v4i32/v2i64 cases
901 // in AVX2 will be recognized.
902 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
903 setOperationAction(ISD::SRL, VT, Custom);
904 setOperationAction(ISD::SHL, VT, Custom);
905 setOperationAction(ISD::SRA, VT, Custom);
906 }
908 setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
909 setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
910 }
912 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
913 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
914 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
915 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
916 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
917 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
918 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
919 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
920 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
921 }
923 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
924 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
925 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
926 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
927 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
928 setOperationAction(ISD::FRINT, RoundedTy, Legal);
929 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
930 }
932 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
933 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
934 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
935 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
936 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
937 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
938 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
939 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
941 // FIXME: Do we need to handle scalar-to-vector here?
942 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
944 // We directly match byte blends in the backend as they match the VSELECT
945 // condition form.
946 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
948 // SSE41 brings specific instructions for doing vector sign extend even in
949 // cases where we don't have SRA.
950 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
951 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
952 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
953 }
955 for (MVT VT : MVT::integer_vector_valuetypes()) {
956 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
957 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
958 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
959 }
961 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
962 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
963 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
964 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
965 setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
966 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
967 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
968 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
969 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
970 }
972 // i8 vectors are custom because the source register and source
973 // memory operand types are not the same width.
974 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
975 }
977 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
978 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
979 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
980 setOperationAction(ISD::ROTL, VT, Custom);
982 // XOP can efficiently perform BITREVERSE with VPPERM.
983 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
984 setOperationAction(ISD::BITREVERSE, VT, Custom);
986 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
987 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
988 setOperationAction(ISD::BITREVERSE, VT, Custom);
989 }
991 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
992 bool HasInt256 = Subtarget.hasInt256();
994 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
995 : &X86::VR256RegClass);
996 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
997 : &X86::VR256RegClass);
998 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
999 : &X86::VR256RegClass);
1000 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
1001 : &X86::VR256RegClass);
1002 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1003 : &X86::VR256RegClass);
1004 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
1005 : &X86::VR256RegClass);
1007 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
1008 setOperationAction(ISD::FFLOOR, VT, Legal);
1009 setOperationAction(ISD::FCEIL, VT, Legal);
1010 setOperationAction(ISD::FTRUNC, VT, Legal);
1011 setOperationAction(ISD::FRINT, VT, Legal);
1012 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1013 setOperationAction(ISD::FNEG, VT, Custom);
1014 setOperationAction(ISD::FABS, VT, Custom);
1015 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1016 }
1018 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1019 // even though v8i16 is a legal type.
1020 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
1021 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
1022 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1024 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1025 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1027 if (!Subtarget.hasAVX512())
1028 setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
1030 for (MVT VT : MVT::fp_vector_valuetypes())
1031 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1033 // In the customized shift lowering, the legal v8i32/v4i64 cases
1034 // in AVX2 will be recognized.
1035 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1036 setOperationAction(ISD::SRL, VT, Custom);
1037 setOperationAction(ISD::SHL, VT, Custom);
1038 setOperationAction(ISD::SRA, VT, Custom);
1039 }
1041 setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
1042 setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
1044 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1045 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1046 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1048 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1049 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1050 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1051 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1052 }
1054 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1055 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1056 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1057 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1059 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1060 setOperationAction(ISD::SETCC, VT, Custom);
1061 setOperationAction(ISD::CTPOP, VT, Custom);
1062 setOperationAction(ISD::CTTZ, VT, Custom);
1063 setOperationAction(ISD::CTLZ, VT, Custom);
1065 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1066 // setcc all the way to isel and prefer SETGT in some isel patterns.
1067 setCondCodeAction(ISD::SETLT, VT, Custom);
1068 setCondCodeAction(ISD::SETLE, VT, Custom);
1069 }
1071 if (Subtarget.hasAnyFMA()) {
1072 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1073 MVT::v2f64, MVT::v4f64 })
1074 setOperationAction(ISD::FMA, VT, Legal);
1075 }
1077 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1078 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1079 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1080 }
1082 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1083 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1084 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1085 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1087 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1088 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1090 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1091 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1092 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1093 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1095 setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
1096 setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
1097 setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
1098 setOperationAction(ISD::UMIN, MVT::v4i64, Custom);
1100 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1101 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1102 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1103 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1104 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1105 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1106 }
1109 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1110 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1111 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1113 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1114 // when we have a 256bit-wide blend with immediate.
1115 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1117 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1118 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1119 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1120 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1121 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1122 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1123 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
1124 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
1125 }
1128 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1129 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1130 setOperationAction(ISD::MLOAD, VT, Legal);
1131 setOperationAction(ISD::MSTORE, VT, Legal);
1132 }
1134 // Extract subvector is special because the value type
1135 // (result) is 128-bit but the source is 256-bit wide.
1136 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1137 MVT::v4f32, MVT::v2f64 }) {
1138 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1139 }
1141 // Custom lower several nodes for 256-bit types.
1142 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1143 MVT::v8f32, MVT::v4f64 }) {
1144 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1145 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1146 setOperationAction(ISD::VSELECT, VT, Custom);
1147 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1148 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1149 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1150 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1151 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1152 }
1154 if (HasInt256)
1155 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1157 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1158 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1159 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1160 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1161 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1162 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1163 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1164 }
1166 if (HasInt256) {
1167 // Custom legalize 2x32 to get a little better code.
1168 setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
1169 setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
1171 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1172 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1173 setOperationAction(ISD::MGATHER, VT, Custom);
1174 }
1175 }
1177 // This block controls legalization of the mask vector sizes that are
1178 // available with AVX512. 512-bit vectors are in a separate block controlled
1179 // by useAVX512Regs.
1180 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1181 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1182 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1183 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1184 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1185 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1187 setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
1188 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1189 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1191 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
1192 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
1193 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
1194 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
1195 setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
1196 setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
1198 // There is no byte sized k-register load or store without AVX512DQ.
1199 if (!Subtarget.hasDQI()) {
1200 setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
1201 setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
1202 setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
1203 setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
1205 setOperationAction(ISD::STORE, MVT::v1i1, Custom);
1206 setOperationAction(ISD::STORE, MVT::v2i1, Custom);
1207 setOperationAction(ISD::STORE, MVT::v4i1, Custom);
1208 setOperationAction(ISD::STORE, MVT::v8i1, Custom);
1209 }
1211 // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
1212 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
1213 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1214 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
1215 setOperationAction(ISD::ANY_EXTEND, VT, Custom);
1216 }
1218 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
1219 setOperationAction(ISD::ADD, VT, Custom);
1220 setOperationAction(ISD::SUB, VT, Custom);
1221 setOperationAction(ISD::MUL, VT, Custom);
1222 setOperationAction(ISD::SETCC, VT, Custom);
1223 setOperationAction(ISD::SELECT, VT, Custom);
1224 setOperationAction(ISD::TRUNCATE, VT, Custom);
1226 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1227 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1228 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1229 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1230 setOperationAction(ISD::VSELECT, VT, Expand);
1231 }
1233 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1234 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1235 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1236 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
1237 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1238 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1239 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1240 for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
1241 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1242 }
1244 // This block controls legalization for 512-bit operations with 32/64 bit
1245 // elements. 512-bits can be disabled based on prefer-vector-width and
1246 // required-vector-width function attributes.
1247 if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
1248 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1249 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1250 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1251 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1253 for (MVT VT : MVT::fp_vector_valuetypes())
1254 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1256 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1257 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1258 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1259 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1260 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1261 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1262 }
1264 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1265 setOperationAction(ISD::FNEG, VT, Custom);
1266 setOperationAction(ISD::FABS, VT, Custom);
1267 setOperationAction(ISD::FMA, VT, Legal);
1268 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1269 }
1271 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1272 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
1273 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
1274 setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
1275 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1276 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
1277 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
1278 setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
1279 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1280 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1282 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1283 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1284 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1285 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1286 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1288 if (!Subtarget.hasVLX()) {
1289 // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
1290 // to 512-bit rather than use the AVX2 instructions so that we can use
1291 // k-masks.
1292 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1293 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1294 setOperationAction(ISD::MLOAD, VT, Custom);
1295 setOperationAction(ISD::MSTORE, VT, Custom);
1296 }
1297 }
1299 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1300 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1301 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1302 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1303 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1304 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1305 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1306 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1308 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1309 setOperationAction(ISD::FFLOOR, VT, Legal);
1310 setOperationAction(ISD::FCEIL, VT, Legal);
1311 setOperationAction(ISD::FTRUNC, VT, Legal);
1312 setOperationAction(ISD::FRINT, VT, Legal);
1313 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1314 }
1316 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1317 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1319 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1320 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1321 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1323 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1324 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1325 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1326 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1328 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1329 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1331 setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
1332 setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
1334 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1335 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1336 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1338 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1339 setOperationAction(ISD::SMAX, VT, Legal);
1340 setOperationAction(ISD::UMAX, VT, Legal);
1341 setOperationAction(ISD::SMIN, VT, Legal);
1342 setOperationAction(ISD::UMIN, VT, Legal);
1343 setOperationAction(ISD::ABS, VT, Legal);
1344 setOperationAction(ISD::SRL, VT, Custom);
1345 setOperationAction(ISD::SHL, VT, Custom);
1346 setOperationAction(ISD::SRA, VT, Custom);
1347 setOperationAction(ISD::CTPOP, VT, Custom);
1348 setOperationAction(ISD::CTTZ, VT, Custom);
1349 setOperationAction(ISD::ROTL, VT, Custom);
1350 setOperationAction(ISD::ROTR, VT, Custom);
1351 setOperationAction(ISD::SETCC, VT, Custom);
1353 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1354 // setcc all the way to isel and prefer SETGT in some isel patterns.
1355 setCondCodeAction(ISD::SETLT, VT, Custom);
1356 setCondCodeAction(ISD::SETLE, VT, Custom);
1359 // Need to promote to 64-bit even though we have 32-bit masked instructions
1360 // because the IR optimizers rearrange bitcasts around logic ops leaving
1361 // too many variations to handle if we don't promote them.
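// For example, an AND of two v16i32 values is promoted to v8i64 and selected
// as a single 512-bit VPANDQ; the surrounding bitcasts fold away.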
1362 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1363 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1364 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1366 if (Subtarget.hasDQI()) {
1367 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1368 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1369 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1370 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1372 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1375 if (Subtarget.hasCDI()) {
1376 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
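// For example, CTLZ of a v4i32 is performed by widening to v16i32, using
// VPLZCNTD, and extracting the low 128 bits of the result.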
1377 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1378 setOperationAction(ISD::CTLZ, VT, Legal);
1379 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1381 } // Subtarget.hasCDI()
1383 if (Subtarget.hasVPOPCNTDQ()) {
1384 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1385 setOperationAction(ISD::CTPOP, VT, Legal);
1388 // Extract subvector is special because the value type
1389 // (result) is 256-bit but the source is 512-bit wide.
1390 // 128-bit was made Legal under AVX1.
1391 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1392 MVT::v8f32, MVT::v4f64 })
1393 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1395 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1396 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1397 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1398 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1399 setOperationAction(ISD::VSELECT, VT, Custom);
1400 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1401 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1402 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1403 setOperationAction(ISD::MLOAD, VT, Legal);
1404 setOperationAction(ISD::MSTORE, VT, Legal);
1405 setOperationAction(ISD::MGATHER, VT, Custom);
1406 setOperationAction(ISD::MSCATTER, VT, Custom);
1408 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1409 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1410 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1413 // Need to custom split v32i16/v64i8 bitcasts.
1414 if (!Subtarget.hasBWI()) {
1415 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1416 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1420 // This block controls legalization for operations that don't have
1421 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for 128/256-bit vectors.
1423 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1424 // These operations are handled on non-VLX by artificially widening them in the isel patterns.
1426 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1428 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1429 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1430 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1431 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1432 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1434 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1435 setOperationAction(ISD::SMAX, VT, Legal);
1436 setOperationAction(ISD::UMAX, VT, Legal);
1437 setOperationAction(ISD::SMIN, VT, Legal);
1438 setOperationAction(ISD::UMIN, VT, Legal);
1439 setOperationAction(ISD::ABS, VT, Legal);
1442 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1443 setOperationAction(ISD::ROTL, VT, Custom);
1444 setOperationAction(ISD::ROTR, VT, Custom);
1447 // Custom legalize 2x32 to get a little better code.
1448 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1449 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1451 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1452 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1453 setOperationAction(ISD::MSCATTER, VT, Custom);
1455 if (Subtarget.hasDQI()) {
1456 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1457 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1458 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1459 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1460 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1462 setOperationAction(ISD::MUL, VT, Legal);
1466 if (Subtarget.hasCDI()) {
1467 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1468 setOperationAction(ISD::CTLZ, VT, Legal);
1469 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1471 } // Subtarget.hasCDI()
1473 if (Subtarget.hasVPOPCNTDQ()) {
1474 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1475 setOperationAction(ISD::CTPOP, VT, Legal);
1479 // This block controls legalization of v32i1/v64i1 which are available with
1480 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with useBWIRegs() below.
1482 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1483 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1484 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1486 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1487 setOperationAction(ISD::ADD, VT, Custom);
1488 setOperationAction(ISD::SUB, VT, Custom);
1489 setOperationAction(ISD::MUL, VT, Custom);
1490 setOperationAction(ISD::VSELECT, VT, Expand);
1492 setOperationAction(ISD::TRUNCATE, VT, Custom);
1493 setOperationAction(ISD::SETCC, VT, Custom);
1494 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1495 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1496 setOperationAction(ISD::SELECT, VT, Custom);
1497 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1501 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1502 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1503 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1504 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1505 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1506 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1508 // Extends from v32i1 masks to 256-bit vectors.
1509 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1510 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1511 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1514 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1515 // disabled based on the prefer-vector-width and required-vector-width function attributes.
1517 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1518 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1519 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1521 // Extends from v64i1 masks to 512-bit vectors.
1522 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1523 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1524 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1527 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1528 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1529 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1530 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1531 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1532 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1533 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1534 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1535 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1536 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1537 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1538 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1539 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1540 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1541 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1542 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1543 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1544 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1545 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1546 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1547 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1548 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1550 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1552 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1554 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1555 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1556 setOperationAction(ISD::VSELECT, VT, Custom);
1557 setOperationAction(ISD::ABS, VT, Legal);
1558 setOperationAction(ISD::SRL, VT, Custom);
1559 setOperationAction(ISD::SHL, VT, Custom);
1560 setOperationAction(ISD::SRA, VT, Custom);
1561 setOperationAction(ISD::MLOAD, VT, Legal);
1562 setOperationAction(ISD::MSTORE, VT, Legal);
1563 setOperationAction(ISD::CTPOP, VT, Custom);
1564 setOperationAction(ISD::CTTZ, VT, Custom);
1565 setOperationAction(ISD::CTLZ, VT, Custom);
1566 setOperationAction(ISD::SMAX, VT, Legal);
1567 setOperationAction(ISD::UMAX, VT, Legal);
1568 setOperationAction(ISD::SMIN, VT, Legal);
1569 setOperationAction(ISD::UMIN, VT, Legal);
1570 setOperationAction(ISD::SETCC, VT, Custom);
1572 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1573 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1574 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1577 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1578 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1581 if (Subtarget.hasBITALG()) {
1582 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1583 setOperationAction(ISD::CTPOP, VT, Legal);
1587 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1588 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1589 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1590 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1593 // These operations are handled on non-VLX by artificially widening them in the isel patterns.
1595 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1597 if (Subtarget.hasBITALG()) {
1598 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1599 setOperationAction(ISD::CTPOP, VT, Legal);
1603 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1604 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1605 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1606 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1607 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1608 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1610 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1611 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1612 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1613 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1614 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1616 if (Subtarget.hasDQI()) {
1617 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1618 // v2f32 UINT_TO_FP is already custom under SSE2.
1619 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1620 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1621 "Unexpected operation action!");
1622 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1623 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1624 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1627 if (Subtarget.hasBWI()) {
1628 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1629 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1633 // We want to custom lower some of our intrinsics.
1634 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1635 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1636 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1637 if (!Subtarget.is64Bit()) {
1638 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1639 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1642 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1643 // handle type legalization for these operations here.
1645 // FIXME: We really should do custom legalization for addition and
1646 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1647 // than generic legalization for 64-bit multiplication-with-overflow, though.
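// For example, an i64 SADDO on a 32-bit target is skipped here and left to
// generic legalization instead.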
1648 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1649 if (VT == MVT::i64 && !Subtarget.is64Bit())
1650 continue;
1651 // Add/Sub/Mul with overflow operations are custom lowered.
1652 setOperationAction(ISD::SADDO, VT, Custom);
1653 setOperationAction(ISD::UADDO, VT, Custom);
1654 setOperationAction(ISD::SSUBO, VT, Custom);
1655 setOperationAction(ISD::USUBO, VT, Custom);
1656 setOperationAction(ISD::SMULO, VT, Custom);
1657 setOperationAction(ISD::UMULO, VT, Custom);
1659 // Support carry in as value rather than glue.
1660 setOperationAction(ISD::ADDCARRY, VT, Custom);
1661 setOperationAction(ISD::SUBCARRY, VT, Custom);
1662 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1665 if (!Subtarget.is64Bit()) {
1666 // These libcalls are not available in 32-bit.
1667 setLibcallName(RTLIB::SHL_I128, nullptr);
1668 setLibcallName(RTLIB::SRL_I128, nullptr);
1669 setLibcallName(RTLIB::SRA_I128, nullptr);
1670 setLibcallName(RTLIB::MUL_I128, nullptr);
1673 // Combine sin / cos into _sincos_stret if it is available.
1674 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1675 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1676 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1677 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1680 if (Subtarget.isTargetWin64()) {
1681 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1682 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1683 setOperationAction(ISD::SREM, MVT::i128, Custom);
1684 setOperationAction(ISD::UREM, MVT::i128, Custom);
1685 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1686 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1689 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1690 // is. We should promote the value to 64-bits to solve this.
1691 // This is what the CRT headers do - `fmodf` is an inline header
1692 // function casting to f64 and calling `fmod`.
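// In effect, an f32 frem such as fmodf(x, y) becomes
// (float)fmod((double)x, (double)y).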
1693 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1694 Subtarget.isTargetWindowsItanium()))
1695 for (ISD::NodeType Op :
1696 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1697 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1698 if (isOperationExpand(Op, MVT::f32))
1699 setOperationAction(Op, MVT::f32, Promote);
1701 // We have target-specific dag combine patterns for the following nodes:
1702 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1703 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1704 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1705 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1706 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1707 setTargetDAGCombine(ISD::BITCAST);
1708 setTargetDAGCombine(ISD::VSELECT);
1709 setTargetDAGCombine(ISD::SELECT);
1710 setTargetDAGCombine(ISD::SHL);
1711 setTargetDAGCombine(ISD::SRA);
1712 setTargetDAGCombine(ISD::SRL);
1713 setTargetDAGCombine(ISD::OR);
1714 setTargetDAGCombine(ISD::AND);
1715 setTargetDAGCombine(ISD::ADD);
1716 setTargetDAGCombine(ISD::FADD);
1717 setTargetDAGCombine(ISD::FSUB);
1718 setTargetDAGCombine(ISD::FNEG);
1719 setTargetDAGCombine(ISD::FMA);
1720 setTargetDAGCombine(ISD::FMINNUM);
1721 setTargetDAGCombine(ISD::FMAXNUM);
1722 setTargetDAGCombine(ISD::SUB);
1723 setTargetDAGCombine(ISD::LOAD);
1724 setTargetDAGCombine(ISD::MLOAD);
1725 setTargetDAGCombine(ISD::STORE);
1726 setTargetDAGCombine(ISD::MSTORE);
1727 setTargetDAGCombine(ISD::TRUNCATE);
1728 setTargetDAGCombine(ISD::ZERO_EXTEND);
1729 setTargetDAGCombine(ISD::ANY_EXTEND);
1730 setTargetDAGCombine(ISD::SIGN_EXTEND);
1731 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1732 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1733 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1734 setTargetDAGCombine(ISD::SINT_TO_FP);
1735 setTargetDAGCombine(ISD::UINT_TO_FP);
1736 setTargetDAGCombine(ISD::SETCC);
1737 setTargetDAGCombine(ISD::MUL);
1738 setTargetDAGCombine(ISD::XOR);
1739 setTargetDAGCombine(ISD::MSCATTER);
1740 setTargetDAGCombine(ISD::MGATHER);
1742 computeRegisterProperties(Subtarget.getRegisterInfo());
1744 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1745 MaxStoresPerMemsetOptSize = 8;
1746 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1747 MaxStoresPerMemcpyOptSize = 4;
1748 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1749 MaxStoresPerMemmoveOptSize = 4;
1751 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1752 // that needs to be benchmarked and balanced with the potential use of vector
1753 // load/store types (PR33329, PR33914).
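// For example, with a limit of two loads a 16-byte memcmp can be expanded into
// two 8-byte loads from each buffer instead of a library call.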
1754 MaxLoadsPerMemcmp = 2;
1755 MaxLoadsPerMemcmpOptSize = 2;
1757 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
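// For example, the default value of 4 aligns loop headers to 16-byte boundaries.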
1758 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1760 // An out-of-order CPU can speculatively execute past a predictable branch,
1761 // but a conditional move could be stalled by an expensive earlier operation.
1762 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1763 EnableExtLdPromotion = true;
1764 setPrefFunctionAlignment(4); // 2^4 bytes.
1766 verifyIntrinsicTables();
1769 // This has so far only been implemented for 64-bit MachO.
1770 bool X86TargetLowering::useLoadStackGuardNode() const {
1771 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1774 bool X86TargetLowering::useStackGuardXorFP() const {
1775 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1776 return Subtarget.getTargetTriple().isOSMSVCRT();
1779 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1780 const SDLoc &DL) const {
1781 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1782 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1783 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1784 return SDValue(Node, 0);
1787 TargetLoweringBase::LegalizeTypeAction
1788 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
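// Without BWI there are no 32-bit mask operations, so prefer splitting v32i1
// into two v16i1 halves.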
1789 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1790 return TypeSplitVector;
1792 if (ExperimentalVectorWideningLegalization &&
1793 VT.getVectorNumElements() != 1 &&
1794 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1795 return TypeWidenVector;
1797 return TargetLoweringBase::getPreferredVectorAction(VT);
1800 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1802 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1804 return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
1807 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1809 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1811 return TargetLowering::getNumRegistersForCallingConv(Context, VT);
1814 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1815 LLVMContext& Context,
1820 if (Subtarget.hasAVX512()) {
1821 const unsigned NumElts = VT.getVectorNumElements();
1823 // Figure out what this type will be legalized to.
1824 EVT LegalVT = VT;
1825 while (getTypeAction(Context, LegalVT) != TypeLegal)
1826 LegalVT = getTypeToTransformTo(Context, LegalVT);
1828 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1829 if (LegalVT.getSimpleVT().is512BitVector())
1830 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1832 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1833 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1834 // compare for vXi32/vXi64 for sure. If we have BWI we will also support vXi8/vXi16.
1836 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1837 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1838 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1842 return VT.changeVectorElementTypeToInteger();
1845 /// Helper for getByValTypeAlignment to determine
1846 /// the desired ByVal argument alignment.
1847 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1850 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1851 if (VTy->getBitWidth() == 128)
1853 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1854 unsigned EltAlign = 0;
1855 getMaxByValAlign(ATy->getElementType(), EltAlign);
1856 if (EltAlign > MaxAlign)
1857 MaxAlign = EltAlign;
1858 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1859 for (auto *EltTy : STy->elements()) {
1860 unsigned EltAlign = 0;
1861 getMaxByValAlign(EltTy, EltAlign);
1862 if (EltAlign > MaxAlign)
1863 MaxAlign = EltAlign;
1870 /// Return the desired alignment for ByVal aggregate
1871 /// function arguments in the caller parameter area. For X86, aggregates
1872 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1873 /// are at 4-byte boundaries.
1874 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1875 const DataLayout &DL) const {
1876 if (Subtarget.is64Bit()) {
1877 // Max of 8 and alignment of type.
1878 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1885 if (Subtarget.hasSSE1())
1886 getMaxByValAlign(Ty, Align);
1890 /// Returns the target specific optimal type for load
1891 /// and store operations as a result of memset, memcpy, and memmove
1892 /// lowering. If DstAlign is zero, it is safe to assume that the destination
1893 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero there
1894 /// is no need to check it against an alignment requirement,
1895 /// probably because the source does not need to be loaded. If 'IsMemset' is
1896 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1897 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1898 /// source is constant so it does not need to be loaded.
1899 /// It returns EVT::Other if the type should be determined using generic
1900 /// target-independent logic.
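/// For example, a 32-byte or larger memset on an AVX subtarget with fast
/// unaligned accesses will typically pick a 256-bit vector type here.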
1902 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1903 unsigned DstAlign, unsigned SrcAlign,
1904 bool IsMemset, bool ZeroMemset,
1905 bool MemcpyStrSrc,
1906 MachineFunction &MF) const {
1907 const Function &F = MF.getFunction();
1908 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1910 (!Subtarget.isUnalignedMem16Slow() ||
1911 ((DstAlign == 0 || DstAlign >= 16) &&
1912 (SrcAlign == 0 || SrcAlign >= 16)))) {
1913 // FIXME: Check if unaligned 32-byte accesses are slow.
1914 if (Size >= 32 && Subtarget.hasAVX()) {
1915 // Although this isn't a well-supported type for AVX1, we'll let
1916 // legalization and shuffle lowering produce the optimal codegen. If we
1917 // choose an optimal type with a vector element larger than a byte,
1918 // getMemsetStores() may create an intermediate splat (using an integer
1919 // multiply) before we splat as a vector.
1922 if (Subtarget.hasSSE2())
1924 // TODO: Can SSE1 handle a byte vector?
1925 if (Subtarget.hasSSE1())
1927 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1928 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1929 // Do not use f64 to lower memcpy if source is string constant. It's
1930 // better to use i32 to avoid the loads.
1931 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1932 // The gymnastics of splatting a byte value into an XMM register and then
1933 // only using 8-byte stores (because this is a CPU with slow unaligned
1934 // 16-byte accesses) makes that a loser.
1938 // This is a compromise. If we reach here, unaligned accesses may be slow on
1939 // this target. However, creating smaller, aligned accesses could be even
1940 // slower and would certainly be a lot more code.
1941 if (Subtarget.is64Bit() && Size >= 8)
1946 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1947 if (VT == MVT::f32)
1948 return X86ScalarSSEf32;
1949 else if (VT == MVT::f64)
1950 return X86ScalarSSEf64;
1955 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1960 switch (VT.getSizeInBits()) {
1962 // 8-byte and under are always assumed to be fast.
1966 *Fast = !Subtarget.isUnalignedMem16Slow();
1969 *Fast = !Subtarget.isUnalignedMem32Slow();
1971 // TODO: What about AVX-512 (512-bit) accesses?
1974 // Misaligned accesses of any size are always allowed.
1978 /// Return the entry encoding for a jump table in the
1979 /// current function. The returned value is a member of the
1980 /// MachineJumpTableInfo::JTEntryKind enum.
1981 unsigned X86TargetLowering::getJumpTableEncoding() const {
1982 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1984 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1985 return MachineJumpTableInfo::EK_Custom32;
1987 // Otherwise, use the normal jump table encoding heuristics.
1988 return TargetLowering::getJumpTableEncoding();
1991 bool X86TargetLowering::useSoftFloat() const {
1992 return Subtarget.useSoftFloat();
1995 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1996 ArgListTy &Args) const {
1998 // Only relabel X86-32 for C / Stdcall CCs.
1999 if (Subtarget.is64Bit())
2001 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2003 unsigned ParamRegs = 0;
2004 if (auto *M = MF->getFunction().getParent())
2005 ParamRegs = M->getNumberRegisterParameters();
2007 // Mark the first N integer arguments as being passed in registers.
2008 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2009 Type *T = Args[Idx].Ty;
2010 if (T->isPointerTy() || T->isIntegerTy())
2011 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2012 unsigned numRegs = 1;
2013 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2015 if (ParamRegs < numRegs)
2017 ParamRegs -= numRegs;
2018 Args[Idx].IsInReg = true;
2024 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2025 const MachineBasicBlock *MBB,
2026 unsigned uid,MCContext &Ctx) const{
2027 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2028 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2030 return MCSymbolRefExpr::create(MBB->getSymbol(),
2031 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2034 /// Returns relocation base for the given PIC jumptable.
2035 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2036 SelectionDAG &DAG) const {
2037 if (!Subtarget.is64Bit())
2038 // This doesn't have SDLoc associated with it, but is not really the
2039 // same as a Register.
2040 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2041 getPointerTy(DAG.getDataLayout()));
2045 /// This returns the relocation base for the given PIC jumptable,
2046 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2047 const MCExpr *X86TargetLowering::
2048 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2049 MCContext &Ctx) const {
2050 // X86-64 uses RIP relative addressing based on the jump table label.
2051 if (Subtarget.isPICStyleRIPRel())
2052 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2054 // Otherwise, the reference is relative to the PIC base.
2055 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2058 std::pair<const TargetRegisterClass *, uint8_t>
2059 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2061 const TargetRegisterClass *RRC = nullptr;
2063 switch (VT.SimpleTy) {
2065 return TargetLowering::findRepresentativeClass(TRI, VT);
2066 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2067 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2070 RRC = &X86::VR64RegClass;
2072 case MVT::f32: case MVT::f64:
2073 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2074 case MVT::v4f32: case MVT::v2f64:
2075 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2076 case MVT::v8f32: case MVT::v4f64:
2077 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2078 case MVT::v16f32: case MVT::v8f64:
2079 RRC = &X86::VR128XRegClass;
2082 return std::make_pair(RRC, Cost);
2085 unsigned X86TargetLowering::getAddressSpace() const {
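// Address space 256 selects the %gs segment and 257 selects %fs; the stack
// protector slot lives in %gs under the kernel code model and in %fs otherwise.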
2086 if (Subtarget.is64Bit())
2087 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2091 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2092 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2093 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2096 static Constant* SegmentOffset(IRBuilder<> &IRB,
2097 unsigned Offset, unsigned AddressSpace) {
2098 return ConstantExpr::getIntToPtr(
2099 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2100 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2103 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2104 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2105 // tcbhead_t; use it instead of the usual global variable (see
2106 // sysdeps/{i386,x86_64}/nptl/tls.h)
2107 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2108 if (Subtarget.isTargetFuchsia()) {
2109 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2110 return SegmentOffset(IRB, 0x10, getAddressSpace());
2112 // %fs:0x28, unless we're using a Kernel code model, in which case
2113 // it's %gs:0x28. gs:0x14 on i386.
2114 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2115 return SegmentOffset(IRB, Offset, getAddressSpace());
2119 return TargetLowering::getIRStackGuard(IRB);
2122 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2123 // MSVC CRT provides functionalities for stack protection.
2124 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2125 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2126 // MSVC CRT has a global variable holding security cookie.
2127 M.getOrInsertGlobal("__security_cookie",
2128 Type::getInt8PtrTy(M.getContext()));
2130 // MSVC CRT has a function to validate security cookie.
2131 auto *SecurityCheckCookie = cast<Function>(
2132 M.getOrInsertFunction("__security_check_cookie",
2133 Type::getVoidTy(M.getContext()),
2134 Type::getInt8PtrTy(M.getContext())));
2135 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2136 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2139 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2140 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2142 TargetLowering::insertSSPDeclarations(M);
2145 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2146 // MSVC CRT has a global variable holding security cookie.
2147 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2148 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2149 return M.getGlobalVariable("__security_cookie");
2151 return TargetLowering::getSDagStackGuard(M);
2154 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2155 // MSVC CRT has a function to validate security cookie.
2156 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2157 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2158 return M.getFunction("__security_check_cookie");
2160 return TargetLowering::getSSPStackGuardCheck(M);
2163 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2164 if (Subtarget.getTargetTriple().isOSContiki())
2165 return getDefaultSafeStackPointerLocation(IRB, false);
2167 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2168 // definition of TLS_SLOT_SAFESTACK in
2169 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2170 if (Subtarget.isTargetAndroid()) {
2171 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2173 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2174 return SegmentOffset(IRB, Offset, getAddressSpace());
2177 // Fuchsia is similar.
2178 if (Subtarget.isTargetFuchsia()) {
2179 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2180 return SegmentOffset(IRB, 0x18, getAddressSpace());
2183 return TargetLowering::getSafeStackPointerLocation(IRB);
2186 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2187 unsigned DestAS) const {
2188 assert(SrcAS != DestAS && "Expected different address spaces!");
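// Only casts between the 'flat' address spaces (below 256) are no-ops; the
// x86 segment address spaces (gs/fs/ss) are not interchangeable with them.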
2190 return SrcAS < 256 && DestAS < 256;
2193 //===----------------------------------------------------------------------===//
2194 // Return Value Calling Convention Implementation
2195 //===----------------------------------------------------------------------===//
2197 #include "X86GenCallingConv.inc"
2199 bool X86TargetLowering::CanLowerReturn(
2200 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2201 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2202 SmallVector<CCValAssign, 16> RVLocs;
2203 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2204 return CCInfo.CheckReturn(Outs, RetCC_X86);
2207 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2208 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2212 /// Lowers mask values (v*i1) to the local register values.
2213 /// \returns DAG node after lowering to register type
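/// For example, a v16i1 mask headed for an i32 location is first bitcast to
/// i16 and then any-extended to i32.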
2214 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2215 const SDLoc &Dl, SelectionDAG &DAG) {
2216 EVT ValVT = ValArg.getValueType();
2218 if (ValVT == MVT::v1i1)
2219 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2220 DAG.getIntPtrConstant(0, Dl));
2222 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2223 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2224 // Two stage lowering might be required
2225 // bitcast: v8i1 -> i8 / v16i1 -> i16
2226 // anyextend: i8 -> i32 / i16 -> i32
2227 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2228 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2229 if (ValLoc == MVT::i32)
2230 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2234 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2235 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2236 // One stage lowering is required
2237 // bitcast: v32i1 -> i32 / v64i1 -> i64
2238 return DAG.getBitcast(ValLoc, ValArg);
2241 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2244 /// Breaks v64i1 value into two registers and adds the new node to the DAG
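/// The value is bitcast to i64, split into two i32 halves, and each half is
/// paired with its own register (Lo first, then Hi).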
2245 static void Passv64i1ArgInRegs(
2246 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2247 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2248 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2249 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2250 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2251 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2252 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2253 "The value should reside in two registers");
2255 // Before splitting the value we cast it to i64
2256 Arg = DAG.getBitcast(MVT::i64, Arg);
2258 // Splitting the value into two i32 types
2260 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2261 DAG.getConstant(0, Dl, MVT::i32));
2262 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2263 DAG.getConstant(1, Dl, MVT::i32));
2265 // Attach the two i32 values to their corresponding registers.
2266 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2267 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2271 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2273 const SmallVectorImpl<ISD::OutputArg> &Outs,
2274 const SmallVectorImpl<SDValue> &OutVals,
2275 const SDLoc &dl, SelectionDAG &DAG) const {
2276 MachineFunction &MF = DAG.getMachineFunction();
2277 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2279 // In some cases we need to disable registers from the default CSR list.
2280 // For example, when they are used for argument passing.
2281 bool ShouldDisableCalleeSavedRegister =
2282 CallConv == CallingConv::X86_RegCall ||
2283 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2285 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2286 report_fatal_error("X86 interrupts may not return any value");
2288 SmallVector<CCValAssign, 16> RVLocs;
2289 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2290 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2293 SmallVector<SDValue, 6> RetOps;
2294 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2295 // Operand #1 = Bytes To Pop
2296 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2299 // Copy the result values into the output registers.
2300 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2302 CCValAssign &VA = RVLocs[I];
2303 assert(VA.isRegLoc() && "Can only return in registers!");
2305 // Add the register to the CalleeSaveDisableRegs list.
2306 if (ShouldDisableCalleeSavedRegister)
2307 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2309 SDValue ValToCopy = OutVals[OutsIndex];
2310 EVT ValVT = ValToCopy.getValueType();
2312 // Promote values to the appropriate types.
2313 if (VA.getLocInfo() == CCValAssign::SExt)
2314 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2315 else if (VA.getLocInfo() == CCValAssign::ZExt)
2316 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2317 else if (VA.getLocInfo() == CCValAssign::AExt) {
2318 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2319 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2321 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2323 else if (VA.getLocInfo() == CCValAssign::BCvt)
2324 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2326 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2327 "Unexpected FP-extend for return value.");
2329 // If this is x86-64, and we disabled SSE, we can't return FP values,
2330 // or SSE or MMX vectors.
2331 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2332 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2333 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2334 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2335 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2336 } else if (ValVT == MVT::f64 &&
2337 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2338 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2339 // llvm-gcc has never done it right and no one has noticed, so this
2340 // should be OK for now.
2341 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2342 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2345 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2346 // the RET instruction and handled by the FP Stackifier.
2347 if (VA.getLocReg() == X86::FP0 ||
2348 VA.getLocReg() == X86::FP1) {
2349 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2350 // change the value to the FP stack register class.
2351 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2352 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2353 RetOps.push_back(ValToCopy);
2354 // Don't emit a copytoreg.
2358 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2359 // which is returned in RAX / RDX.
2360 if (Subtarget.is64Bit()) {
2361 if (ValVT == MVT::x86mmx) {
2362 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2363 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2364 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2366 // If we don't have SSE2 available, convert to v4f32 so the generated
2367 // register is legal.
2368 if (!Subtarget.hasSSE2())
2369 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2374 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2376 if (VA.needsCustom()) {
2377 assert(VA.getValVT() == MVT::v64i1 &&
2378 "Currently the only custom case is when we split v64i1 to 2 regs");
2380 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2383 assert(2 == RegsToPass.size() &&
2384 "Expecting two registers after Pass64BitArgInRegs");
2386 // Add the second register to the CalleeSaveDisableRegs list.
2387 if (ShouldDisableCalleeSavedRegister)
2388 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2390 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2393 // Add nodes to the DAG and add the values into the RetOps list
2394 for (auto &Reg : RegsToPass) {
2395 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2396 Flag = Chain.getValue(1);
2397 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2401 // The Swift calling convention does not require us to copy the sret argument
2402 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2404 // All x86 ABIs require that for returning structs by value we copy
2405 // the sret argument into %rax/%eax (depending on ABI) for the return.
2406 // We saved the argument into a virtual register in the entry block,
2407 // so now we copy the value out and into %rax/%eax.
2409 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2410 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2411 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2412 // either case FuncInfo->setSRetReturnReg() will have been called.
2413 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2414 // When we have both sret and another return value, we should use the
2415 // original Chain stored in RetOps[0], instead of the current Chain updated
2416 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2418 // For the case of sret and another return value, we have
2419 // Chain_0 at the function entry
2420 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2421 // If we use Chain_1 in getCopyFromReg, we will have
2422 // Val = getCopyFromReg(Chain_1)
2423 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2425 // getCopyToReg(Chain_0) will be glued together with
2426 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2427 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2428 // Data dependency from Unit B to Unit A due to usage of Val in
2429 // getCopyToReg(Chain_1, Val)
2430 // Chain dependency from Unit A to Unit B
2432 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2433 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2434 getPointerTy(MF.getDataLayout()));
2437 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2438 X86::RAX : X86::EAX;
2439 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2440 Flag = Chain.getValue(1);
2442 // RAX/EAX now acts like a return value.
2444 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2446 // Add the returned register to the CalleeSaveDisableRegs list.
2447 if (ShouldDisableCalleeSavedRegister)
2448 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2451 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2452 const MCPhysReg *I =
2453 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2456 if (X86::GR64RegClass.contains(*I))
2457 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2459 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2463 RetOps[0] = Chain; // Update chain.
2465 // Add the flag if we have it.
2467 RetOps.push_back(Flag);
2469 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2470 if (CallConv == CallingConv::X86_INTR)
2471 opcode = X86ISD::IRET;
2472 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2475 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2476 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2479 SDValue TCChain = Chain;
2480 SDNode *Copy = *N->use_begin();
2481 if (Copy->getOpcode() == ISD::CopyToReg) {
2482 // If the copy has a glue operand, we conservatively assume it isn't safe to
2483 // perform a tail call.
2484 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2486 TCChain = Copy->getOperand(0);
2487 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2490 bool HasRet = false;
2491 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2493 if (UI->getOpcode() != X86ISD::RET_FLAG)
2495 // If we are returning more than one value, we can definitely
2496 // not make a tail call; see PR19530.
2497 if (UI->getNumOperands() > 4)
2499 if (UI->getNumOperands() == 4 &&
2500 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2512 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2513 ISD::NodeType ExtendKind) const {
2514 MVT ReturnMVT = MVT::i32;
2516 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2517 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2518 // The ABI does not require i1, i8 or i16 to be extended.
2520 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2521 // always extending i8/i16 return values, so keep doing that for now.
2523 ReturnMVT = MVT::i8;
2526 EVT MinVT = getRegisterType(Context, ReturnMVT);
2527 return VT.bitsLT(MinVT) ? MinVT : VT;
2530 /// Reads two 32 bit registers and creates a 64 bit mask value.
2531 /// \param VA The current 32 bit value that needs to be assigned.
2532 /// \param NextVA The next 32 bit value that needs to be assigned.
2533 /// \param Root The parent DAG node.
2534 /// \param [in,out] InFlag Represents SDValue in the parent DAG node for
2535 /// glue purposes. In case the DAG is already using a
2536 /// physical register instead of a virtual one, we should glue
2537 /// our new SDValue to the InFlag SDValue.
2538 /// \returns a new SDValue of size 64 bits.
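/// For example, a v64i1 value split across two 32-bit registers is read back as
/// two i32 values, bitcast to two v32i1 halves, and concatenated into a v64i1.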
2539 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2540 SDValue &Root, SelectionDAG &DAG,
2541 const SDLoc &Dl, const X86Subtarget &Subtarget,
2542 SDValue *InFlag = nullptr) {
2543 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2544 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2545 assert(VA.getValVT() == MVT::v64i1 &&
2546 "Expecting first location of 64 bit width type");
2547 assert(NextVA.getValVT() == VA.getValVT() &&
2548 "The locations should have the same type");
2549 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2550 "The values should reside in two registers");
2554 SDValue ArgValueLo, ArgValueHi;
2556 MachineFunction &MF = DAG.getMachineFunction();
2557 const TargetRegisterClass *RC = &X86::GR32RegClass;
2559 // Read a 32 bit value from the registers.
2560 if (nullptr == InFlag) {
2561 // When no physical register is present,
2562 // create an intermediate virtual register.
2563 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2564 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2565 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2566 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2568 // When a physical register is available, read the value from it and glue
2569 // the reads together.
2571 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2572 *InFlag = ArgValueLo.getValue(2);
2574 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2575 *InFlag = ArgValueHi.getValue(2);
2578 // Convert the i32 type into v32i1 type.
2579 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2581 // Convert the i32 type into v32i1 type.
2582 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2584 // Concatenate the two values together.
2585 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2588 /// The function will lower a register of various sizes (8/16/32/64)
2589 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2590 /// \returns a DAG node containing the operand after lowering to the mask type.
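/// For example, a v8i1 value that was returned in an i32 register is truncated
/// to i8 and then bitcast back to v8i1.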
2591 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2592 const EVT &ValLoc, const SDLoc &Dl,
2593 SelectionDAG &DAG) {
2594 SDValue ValReturned = ValArg;
2596 if (ValVT == MVT::v1i1)
2597 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2599 if (ValVT == MVT::v64i1) {
2600 // On a 32-bit machine, this case is handled by getv64i1Argument.
2601 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2602 // On a 64-bit machine, there is no need to truncate the value; only bitcast it.
2605 switch (ValVT.getSimpleVT().SimpleTy) {
2616 llvm_unreachable("Expecting a vector of i1 types");
2619 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2621 return DAG.getBitcast(ValVT, ValReturned);
2624 /// Lower the result values of a call into the
2625 /// appropriate copies out of appropriate physical registers.
2627 SDValue X86TargetLowering::LowerCallResult(
2628 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2629 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2630 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2631 uint32_t *RegMask) const {
2633 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2634 // Assign locations to each value returned by this call.
2635 SmallVector<CCValAssign, 16> RVLocs;
2636 bool Is64Bit = Subtarget.is64Bit();
2637 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2639 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2641 // Copy all of the result registers out of their specified physreg.
2642 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2644 CCValAssign &VA = RVLocs[I];
2645 EVT CopyVT = VA.getLocVT();
2647 // In some calling conventions we need to remove the used registers
2648 // from the register mask.
2650 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2651 SubRegs.isValid(); ++SubRegs)
2652 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2655 // If this is x86-64, and we disabled SSE, we can't return FP values
2656 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2657 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2658 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2659 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2662 // If we prefer to use the value in xmm registers, copy it out as f80 and
2663 // use a truncate to move it from fp stack reg to xmm reg.
2664 bool RoundAfterCopy = false;
2665 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2666 isScalarFPTypeInSSEReg(VA.getValVT())) {
2667 if (!Subtarget.hasX87())
2668 report_fatal_error("X87 register return with X87 disabled");
2670 RoundAfterCopy = (CopyVT != VA.getLocVT());
2674 if (VA.needsCustom()) {
2675 assert(VA.getValVT() == MVT::v64i1 &&
2676 "Currently the only custom case is when we split v64i1 to 2 regs");
2678 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2680 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2682 Val = Chain.getValue(0);
2683 InFlag = Chain.getValue(2);
2687 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2688 // This truncation won't change the value.
2689 DAG.getIntPtrConstant(1, dl));
2691 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2692 if (VA.getValVT().isVector() &&
2693 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2694 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2695 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2696 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2698 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2701 InVals.push_back(Val);
2707 //===----------------------------------------------------------------------===//
2708 // C & StdCall & Fast Calling Convention implementation
2709 //===----------------------------------------------------------------------===//
2710 // The StdCall calling convention is the standard for many Windows API
2711 // routines. It differs from the C calling convention only slightly: the
2712 // callee cleans up the stack rather than the caller, and symbols are
2713 // decorated in some fancy way :) It doesn't support any vector arguments.
2714 // For info on fast calling convention see Fast Calling Convention (tail call)
2715 // implementation LowerX86_32FastCCCallTo.
2717 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2719 enum StructReturnType {
2724 static StructReturnType
2725 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2727 return NotStructReturn;
2729 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2730 if (!Flags.isSRet())
2731 return NotStructReturn;
2732 if (Flags.isInReg() || IsMCU)
2733 return RegStructReturn;
2734 return StackStructReturn;
2737 /// Determines whether a function uses struct return semantics.
2738 static StructReturnType
2739 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2741 return NotStructReturn;
2743 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2744 if (!Flags.isSRet())
2745 return NotStructReturn;
2746 if (Flags.isInReg() || IsMCU)
2747 return RegStructReturn;
2748 return StackStructReturn;
2751 /// Make a copy of an aggregate at address specified by "Src" to address
2752 /// "Dst" with size and alignment information specified by the specific
2753 /// parameter attribute. The copy will be passed as a byval function parameter.
2754 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2755 SDValue Chain, ISD::ArgFlagsTy Flags,
2756 SelectionDAG &DAG, const SDLoc &dl) {
2757 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2759 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2760 /*isVolatile*/false, /*AlwaysInline=*/true,
2761 /*isTailCall*/false,
2762 MachinePointerInfo(), MachinePointerInfo());
2765 /// Return true if the calling convention is one that we can guarantee TCO for.
2766 static bool canGuaranteeTCO(CallingConv::ID CC) {
2767 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2768 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2769 CC == CallingConv::HHVM);
2772 /// Return true if we might ever do TCO for calls with this calling convention.
2773 static bool mayTailCallThisCC(CallingConv::ID CC) {
2775 // C calling conventions:
2776 case CallingConv::C:
2777 case CallingConv::Win64:
2778 case CallingConv::X86_64_SysV:
2779 // Callee pop conventions:
2780 case CallingConv::X86_ThisCall:
2781 case CallingConv::X86_StdCall:
2782 case CallingConv::X86_VectorCall:
2783 case CallingConv::X86_FastCall:
2786 return canGuaranteeTCO(CC);
2790 /// Return true if the function is being made into a tailcall target by
2791 /// changing its ABI.
2792 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2793 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2796 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2798 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2799 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2802 ImmutableCallSite CS(CI);
2803 CallingConv::ID CalleeCC = CS.getCallingConv();
2804 if (!mayTailCallThisCC(CalleeCC))
2811 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2812 const SmallVectorImpl<ISD::InputArg> &Ins,
2813 const SDLoc &dl, SelectionDAG &DAG,
2814 const CCValAssign &VA,
2815 MachineFrameInfo &MFI, unsigned i) const {
2816 // Create the nodes corresponding to a load from this parameter slot.
2817 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2818 bool AlwaysUseMutable = shouldGuaranteeTCO(
2819 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2820 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2822 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2824 // If value is passed by pointer we have address passed instead of the value
2825 // itself. No need to extend if the mask value and location share the same
2826 // size.
2827 bool ExtendedInMem =
2828 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2829 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2831 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2832 ValVT = VA.getLocVT();
2834 ValVT = VA.getValVT();
2836 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2837 // taken by a return address.
2839 if (CallConv == CallingConv::X86_INTR) {
2840 // X86 interrupts may take one or two arguments.
2841 // On the stack there will be no return address as in a regular call.
2842 // The offset of the last argument needs to be set to -4/-8 bytes.
2843 // The offset of the first argument (out of two) should be set to 0 bytes.
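// For example, a 64-bit handler taking a single argument (just the interrupt
// frame) gets that argument placed at offset -8 (-4 on a 32-bit target),
// i.e. in the slot a return address would normally occupy.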
2844 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2845 if (Subtarget.is64Bit() && Ins.size() == 2) {
2846 // The stack pointer needs to be realigned for 64 bit handlers with error
2847 // code, so the argument offset changes by 8 bytes.
2852 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2853 // changed with more analysis.
2854 // In case of tail call optimization, mark all arguments mutable, since they
2855 // could be overwritten by lowering of arguments in case of a tail call.
2856 if (Flags.isByVal()) {
2857 unsigned Bytes = Flags.getByValSize();
2858 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2860 // FIXME: For now, all byval parameter objects are marked as aliasing. This
2861 // can be improved with deeper analysis.
2862 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
2863 /*isAliased=*/true);
2864 // Adjust SP offset of interrupt parameter.
2865 if (CallConv == CallingConv::X86_INTR) {
2866 MFI.setObjectOffset(FI, Offset);
2868 return DAG.getFrameIndex(FI, PtrVT);
2871 // This is an argument in memory. We might be able to perform copy elision.
2872 if (Flags.isCopyElisionCandidate()) {
2873 EVT ArgVT = Ins[i].ArgVT;
2874 SDValue PartAddr;
2875 if (Ins[i].PartOffset == 0) {
2876 // If this is a one-part value or the first part of a multi-part value,
2877 // create a stack object for the entire argument value type and return a
2878 // load from our portion of it. This assumes that if the first part of an
2879 // argument is in memory, the rest will also be in memory.
2880 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2881 /*Immutable=*/false);
2882 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2883 return DAG.getLoad(
2884 ValVT, dl, Chain, PartAddr,
2885 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2887 // This is not the first piece of an argument in memory. See if there is
2888 // already a fixed stack object including this offset. If so, assume it
2889 // was created by the PartOffset == 0 branch above and create a load from
2890 // the appropriate offset into it.
2891 int64_t PartBegin = VA.getLocMemOffset();
2892 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2893 int FI = MFI.getObjectIndexBegin();
2894 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2895 int64_t ObjBegin = MFI.getObjectOffset(FI);
2896 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2897 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2900 if (MFI.isFixedObjectIndex(FI)) {
2901 SDValue Addr =
2902 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2903 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2904 return DAG.getLoad(
2905 ValVT, dl, Chain, Addr,
2906 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2907 Ins[i].PartOffset));
2912 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2913 VA.getLocMemOffset(), isImmutable);
2915 // Set SExt or ZExt flag.
2916 if (VA.getLocInfo() == CCValAssign::ZExt) {
2917 MFI.setObjectZExt(FI, true);
2918 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2919 MFI.setObjectSExt(FI, true);
2922 // Adjust SP offset of interrupt parameter.
2923 if (CallConv == CallingConv::X86_INTR) {
2924 MFI.setObjectOffset(FI, Offset);
2927 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2928 SDValue Val = DAG.getLoad(
2929 ValVT, dl, Chain, FIN,
2930 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2931 return ExtendedInMem
2932 ? (VA.getValVT().isVector()
2933 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2934 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2935 : Val;
2936 }
2938 // FIXME: Get this from tablegen.
2939 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2940 const X86Subtarget &Subtarget) {
2941 assert(Subtarget.is64Bit());
2943 if (Subtarget.isCallingConvWin64(CallConv)) {
2944 static const MCPhysReg GPR64ArgRegsWin64[] = {
2945 X86::RCX, X86::RDX, X86::R8, X86::R9
2947 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2950 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2951 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2953 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2956 // FIXME: Get this from tablegen.
2957 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2958 CallingConv::ID CallConv,
2959 const X86Subtarget &Subtarget) {
2960 assert(Subtarget.is64Bit());
2961 if (Subtarget.isCallingConvWin64(CallConv)) {
2962 // The XMM registers which might contain var arg parameters are shadowed
2963 // in their paired GPR. So we only need to save the GPR to their home
2964 // slots.
2965 // TODO: __vectorcall will change this.
2966 return None;
2967 }
2969 const Function &F = MF.getFunction();
2970 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2971 bool isSoftFloat = Subtarget.useSoftFloat();
2972 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2973 "SSE register cannot be used when SSE is disabled!");
2974 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2975 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2976 // registers.
2977 return None;
2979 static const MCPhysReg XMMArgRegs64Bit[] = {
2980 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2981 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2983 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2987 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2988 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2989 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2990 return A.getValNo() < B.getValNo();
2995 SDValue X86TargetLowering::LowerFormalArguments(
2996 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2997 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2998 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2999 MachineFunction &MF = DAG.getMachineFunction();
3000 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3001 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3003 const Function &F = MF.getFunction();
3004 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3005 F.getName() == "main")
3006 FuncInfo->setForceFramePointer(true);
3008 MachineFrameInfo &MFI = MF.getFrameInfo();
3009 bool Is64Bit = Subtarget.is64Bit();
3010 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3013 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3014 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3016 if (CallConv == CallingConv::X86_INTR) {
3017 bool isLegal = Ins.size() == 1 ||
3018 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3019 (!Is64Bit && Ins[1].VT == MVT::i32)));
3021 report_fatal_error("X86 interrupts may take one or two arguments");
3024 // Assign locations to all of the incoming arguments.
3025 SmallVector<CCValAssign, 16> ArgLocs;
3026 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3028 // Allocate shadow area for Win64.
3030 CCInfo.AllocateStack(32, 8);
3032 CCInfo.AnalyzeArguments(Ins, CC_X86);
3034 // In vectorcall calling convention a second pass is required for the HVA
3035 // registers.
3036 if (CallingConv::X86_VectorCall == CallConv) {
3037 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3040 // The next loop assumes that the locations are in the same order of the
3041 // Ins array.
3042 assert(isSortedByValueNo(ArgLocs) &&
3043 "Argument Location list must be sorted before lowering");
3046 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3048 assert(InsIndex < Ins.size() && "Invalid Ins index");
3049 CCValAssign &VA = ArgLocs[I];
3051 if (VA.isRegLoc()) {
3052 EVT RegVT = VA.getLocVT();
3053 if (VA.needsCustom()) {
3055 VA.getValVT() == MVT::v64i1 &&
3056 "Currently the only custom case is when we split v64i1 to 2 regs");
3058 // In the regcall calling convention, v64i1 values that are
3059 // compiled for a 32-bit arch are split up into two registers.
3061 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3063 const TargetRegisterClass *RC;
3064 if (RegVT == MVT::i8)
3065 RC = &X86::GR8RegClass;
3066 else if (RegVT == MVT::i16)
3067 RC = &X86::GR16RegClass;
3068 else if (RegVT == MVT::i32)
3069 RC = &X86::GR32RegClass;
3070 else if (Is64Bit && RegVT == MVT::i64)
3071 RC = &X86::GR64RegClass;
3072 else if (RegVT == MVT::f32)
3073 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3074 else if (RegVT == MVT::f64)
3075 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3076 else if (RegVT == MVT::f80)
3077 RC = &X86::RFP80RegClass;
3078 else if (RegVT == MVT::f128)
3079 RC = &X86::FR128RegClass;
3080 else if (RegVT.is512BitVector())
3081 RC = &X86::VR512RegClass;
3082 else if (RegVT.is256BitVector())
3083 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3084 else if (RegVT.is128BitVector())
3085 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3086 else if (RegVT == MVT::x86mmx)
3087 RC = &X86::VR64RegClass;
3088 else if (RegVT == MVT::v1i1)
3089 RC = &X86::VK1RegClass;
3090 else if (RegVT == MVT::v8i1)
3091 RC = &X86::VK8RegClass;
3092 else if (RegVT == MVT::v16i1)
3093 RC = &X86::VK16RegClass;
3094 else if (RegVT == MVT::v32i1)
3095 RC = &X86::VK32RegClass;
3096 else if (RegVT == MVT::v64i1)
3097 RC = &X86::VK64RegClass;
3099 llvm_unreachable("Unknown argument type!");
3101 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3102 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3105 // If this is an 8 or 16-bit value, it is really passed promoted to 32
3106 // bits. Insert an assert[sz]ext to capture this, then truncate to the
3107 // right size.
3108 if (VA.getLocInfo() == CCValAssign::SExt)
3109 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3110 DAG.getValueType(VA.getValVT()));
3111 else if (VA.getLocInfo() == CCValAssign::ZExt)
3112 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3113 DAG.getValueType(VA.getValVT()));
3114 else if (VA.getLocInfo() == CCValAssign::BCvt)
3115 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3117 if (VA.isExtInLoc()) {
3118 // Handle MMX values passed in XMM regs.
3119 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3120 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3121 else if (VA.getValVT().isVector() &&
3122 VA.getValVT().getScalarType() == MVT::i1 &&
3123 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3124 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3125 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3126 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3128 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3131 assert(VA.isMemLoc());
3133 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3136 // If value is passed via pointer - do a load.
3137 if (VA.getLocInfo() == CCValAssign::Indirect)
3139 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3141 InVals.push_back(ArgValue);
3144 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3145 // Swift calling convention does not require we copy the sret argument
3146 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3147 if (CallConv == CallingConv::Swift)
3150 // All x86 ABIs require that for returning structs by value we copy the
3151 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3152 // the argument into a virtual register so that we can access it from the
3153 // return points.
3154 if (Ins[I].Flags.isSRet()) {
3155 unsigned Reg = FuncInfo->getSRetReturnReg();
3157 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3158 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3159 FuncInfo->setSRetReturnReg(Reg);
3161 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3162 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3167 unsigned StackSize = CCInfo.getNextStackOffset();
3168 // Align stack specially for tail calls.
3169 if (shouldGuaranteeTCO(CallConv,
3170 MF.getTarget().Options.GuaranteedTailCallOpt))
3171 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3173 // If the function takes variable number of arguments, make a frame index for
3174 // the start of the first vararg value... for expansion of llvm.va_start. We
3175 // can skip this if there are no va_start calls.
3176 if (MFI.hasVAStart() &&
3177 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3178 CallConv != CallingConv::X86_ThisCall))) {
3179 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3182 // Figure out if XMM registers are in use.
3183 assert(!(Subtarget.useSoftFloat() &&
3184 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3185 "SSE register cannot be used when SSE is disabled!");
3187 // 64-bit calling conventions support varargs and register parameters, so we
3188 // have to do extra work to spill them in the prologue.
3189 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3190 // Find the first unallocated argument registers.
3191 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3192 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3193 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3194 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3195 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3196 "SSE register cannot be used when SSE is disabled!");
3198 // Gather all the live in physical registers.
3199 SmallVector<SDValue, 6> LiveGPRs;
3200 SmallVector<SDValue, 8> LiveXMMRegs;
3202 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3203 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3205 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3207 if (!ArgXMMs.empty()) {
3208 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3209 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3210 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3211 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3212 LiveXMMRegs.push_back(
3213 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3218 // Get to the caller-allocated home save location. Add 8 to account
3219 // for the return address.
3220 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3221 FuncInfo->setRegSaveFrameIndex(
3222 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3223 // Fixup to set vararg frame on shadow area (4 x i64).
3224 if (NumIntRegs < 4)
3225 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3227 // For X86-64, if there are vararg parameters that are passed via
3228 // registers, then we must store them to their spots on the stack so
3229 // they may be loaded by dereferencing the result of va_next.
3230 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3231 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3232 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3233 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
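// Illustrative numbers for the SysV AMD64 ABI: with 6 argument GPRs and 8
// argument XMMs the register save area is 6*8 + 8*16 = 176 bytes. If the
// named parameters already consumed 2 GPRs and 1 XMM, the va_list offsets
// recorded below are gp_offset = 16 and fp_offset = 48 + 16 = 64.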
3236 // Store the integer parameter registers.
3237 SmallVector<SDValue, 8> MemOps;
3238 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3239 getPointerTy(DAG.getDataLayout()));
3240 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3241 for (SDValue Val : LiveGPRs) {
3242 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3243 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3245 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3246 MachinePointerInfo::getFixedStack(
3247 DAG.getMachineFunction(),
3248 FuncInfo->getRegSaveFrameIndex(), Offset));
3249 MemOps.push_back(Store);
3253 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3254 // Now store the XMM (fp + vector) parameter registers.
3255 SmallVector<SDValue, 12> SaveXMMOps;
3256 SaveXMMOps.push_back(Chain);
3257 SaveXMMOps.push_back(ALVal);
3258 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3259 FuncInfo->getRegSaveFrameIndex(), dl));
3260 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3261 FuncInfo->getVarArgsFPOffset(), dl));
3262 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3264 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3265 MVT::Other, SaveXMMOps));
3268 if (!MemOps.empty())
3269 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3272 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3273 // Find the largest legal vector type.
3274 MVT VecVT = MVT::Other;
3275 // FIXME: Only some x86_32 calling conventions support AVX512.
3276 if (Subtarget.hasAVX512() &&
3277 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3278 CallConv == CallingConv::Intel_OCL_BI)))
3279 VecVT = MVT::v16f32;
3280 else if (Subtarget.hasAVX())
3281 VecVT = MVT::v8f32;
3282 else if (Subtarget.hasSSE2())
3283 VecVT = MVT::v4f32;
3285 // We forward some GPRs and some vector types.
3286 SmallVector<MVT, 2> RegParmTypes;
3287 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3288 RegParmTypes.push_back(IntVT);
3289 if (VecVT != MVT::Other)
3290 RegParmTypes.push_back(VecVT);
3292 // Compute the set of forwarded registers. The rest are scratch.
3293 SmallVectorImpl<ForwardedRegister> &Forwards =
3294 FuncInfo->getForwardedMustTailRegParms();
3295 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3297 // Conservatively forward AL on x86_64, since it might be used for varargs.
3298 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3299 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3300 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3303 // Copy all forwards from physical to virtual registers.
3304 for (ForwardedRegister &F : Forwards) {
3305 // FIXME: Can we use a less constrained schedule?
3306 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3307 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3308 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3312 // Some CCs need callee pop.
3313 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3314 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3315 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3316 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3317 // X86 interrupts must pop the error code (and the alignment padding) if
3318 // present.
3319 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3321 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3322 // If this is an sret function, the return should pop the hidden pointer.
3323 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3324 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3325 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3326 FuncInfo->setBytesToPopOnReturn(4);
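// For instance, on 32-bit Linux a function returning a struct by value
// (hidden sret pointer) ends with 'ret $4' so the 4-byte pointer slot is
// popped by the callee; MSVC targets leave that to the caller instead.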
3330 // RegSaveFrameIndex is X86-64 only.
3331 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3332 if (CallConv == CallingConv::X86_FastCall ||
3333 CallConv == CallingConv::X86_ThisCall)
3334 // fastcc functions can't have varargs.
3335 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3338 FuncInfo->setArgumentStackSize(StackSize);
3340 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3341 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3342 if (Personality == EHPersonality::CoreCLR) {
3344 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3345 // that we'd prefer this slot be allocated towards the bottom of the frame
3346 // (i.e. near the stack pointer after allocating the frame). Every
3347 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3348 // offset from the bottom of this and each funclet's frame must be the
3349 // same, so the size of funclets' (mostly empty) frames is dictated by
3350 // how far this slot is from the bottom (since they allocate just enough
3351 // space to accommodate holding this slot at the correct offset).
3352 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3353 EHInfo->PSPSymFrameIdx = PSPSymFI;
3357 if (CallConv == CallingConv::X86_RegCall ||
3358 F.hasFnAttribute("no_caller_saved_registers")) {
3359 MachineRegisterInfo &MRI = MF.getRegInfo();
3360 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3361 MRI.disableCalleeSavedRegister(Pair.first);
3367 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3368 SDValue Arg, const SDLoc &dl,
3370 const CCValAssign &VA,
3371 ISD::ArgFlagsTy Flags) const {
3372 unsigned LocMemOffset = VA.getLocMemOffset();
3373 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3374 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3376 if (Flags.isByVal())
3377 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3379 return DAG.getStore(
3380 Chain, dl, Arg, PtrOff,
3381 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3384 /// Emit a load of return address if tail call
3385 /// optimization is performed and it is required.
3386 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3387 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3388 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3389 // Adjust the Return address stack slot.
3390 EVT VT = getPointerTy(DAG.getDataLayout());
3391 OutRetAddr = getReturnAddressFrameIndex(DAG);
3393 // Load the "old" Return address.
3394 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3395 return SDValue(OutRetAddr.getNode(), 1);
3398 /// Emit a store of the return address if tail call
3399 /// optimization is performed and it is required (FPDiff!=0).
3400 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3401 SDValue Chain, SDValue RetAddrFrIdx,
3402 EVT PtrVT, unsigned SlotSize,
3403 int FPDiff, const SDLoc &dl) {
3404 // Store the return address to the appropriate stack slot.
3405 if (!FPDiff) return Chain;
3406 // Calculate the new stack slot for the return address.
3407 int NewReturnAddrFI =
3408 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3410 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3411 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3412 MachinePointerInfo::getFixedStack(
3413 DAG.getMachineFunction(), NewReturnAddrFI));
3417 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3418 /// operation of specified width.
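/// For example, with VT = v4f32 the mask built below is <4, 1, 2, 3>:
/// element 0 is taken from V2 and elements 1-3 from V1, which is exactly
/// the MOVSS blend pattern.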
3419 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3420 SDValue V2) {
3421 unsigned NumElems = VT.getVectorNumElements();
3422 SmallVector<int, 8> Mask;
3423 Mask.push_back(NumElems);
3424 for (unsigned i = 1; i != NumElems; ++i)
3425 Mask.push_back(i);
3426 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3430 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3431 SmallVectorImpl<SDValue> &InVals) const {
3432 SelectionDAG &DAG = CLI.DAG;
3434 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3435 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3436 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3437 SDValue Chain = CLI.Chain;
3438 SDValue Callee = CLI.Callee;
3439 CallingConv::ID CallConv = CLI.CallConv;
3440 bool &isTailCall = CLI.IsTailCall;
3441 bool isVarArg = CLI.IsVarArg;
3443 MachineFunction &MF = DAG.getMachineFunction();
3444 bool Is64Bit = Subtarget.is64Bit();
3445 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3446 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3447 bool IsSibcall = false;
3448 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3449 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3450 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3451 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3452 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3453 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3454 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3455 bool HasNoCfCheck =
3456 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3457 const Module *M = MF.getMMI().getModule();
3458 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3460 if (CallConv == CallingConv::X86_INTR)
3461 report_fatal_error("X86 interrupts may not be called directly");
3463 if (Attr.getValueAsString() == "true")
3466 if (Subtarget.isPICStyleGOT() &&
3467 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3468 // If we are using a GOT, disable tail calls to external symbols with
3469 // default visibility. Tail calling such a symbol requires using a GOT
3470 // relocation, which forces early binding of the symbol. This breaks code
3471 // that requires lazy function symbol resolution. Using musttail or
3472 // GuaranteedTailCallOpt will override this.
3473 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3474 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3475 G->getGlobal()->hasDefaultVisibility()))
3479 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3481 // Force this to be a tail call. The verifier rules are enough to ensure
3482 // that we can lower this successfully without moving the return address
3483 // around.
3484 isTailCall = true;
3485 } else if (isTailCall) {
3486 // Check if it's really possible to do a tail call.
3487 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3488 isVarArg, SR != NotStructReturn,
3489 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3490 Outs, OutVals, Ins, DAG);
3492 // Sibcalls are automatically detected tailcalls which do not require
3493 // ABI changes.
3494 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3495 IsSibcall = true;
3501 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3502 "Var args not supported with calling convention fastcc, ghc or hipe");
3504 // Analyze operands of the call, assigning locations to each operand.
3505 SmallVector<CCValAssign, 16> ArgLocs;
3506 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3508 // Allocate shadow area for Win64.
3510 CCInfo.AllocateStack(32, 8);
3512 CCInfo.AnalyzeArguments(Outs, CC_X86);
3514 // In vectorcall calling convention a second pass is required for the HVA
3515 // registers.
3516 if (CallingConv::X86_VectorCall == CallConv) {
3517 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3520 // Get a count of how many bytes are to be pushed on the stack.
3521 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3522 if (IsSibcall)
3523 // This is a sibcall. The memory operands are available in the caller's
3524 // own stack frame.
3525 NumBytes = 0;
3526 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3527 canGuaranteeTCO(CallConv))
3528 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3531 if (isTailCall && !IsSibcall && !IsMustTail) {
3532 // Lower arguments at fp - stackoffset + fpdiff.
3533 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3535 FPDiff = NumBytesCallerPushed - NumBytes;
3537 // Set the delta of movement of the returnaddr stackslot.
3538 // But only set if delta is greater than previous delta.
3539 if (FPDiff < X86Info->getTCReturnAddrDelta())
3540 X86Info->setTCReturnAddrDelta(FPDiff);
3543 unsigned NumBytesToPush = NumBytes;
3544 unsigned NumBytesToPop = NumBytes;
3546 // If we have an inalloca argument, all stack space has already been allocated
3547 // for us and is right at the top of the stack. We don't support multiple
3548 // arguments passed in memory when using inalloca.
3549 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3551 if (!ArgLocs.back().isMemLoc())
3552 report_fatal_error("cannot use inalloca attribute on a register "
3554 if (ArgLocs.back().getLocMemOffset() != 0)
3555 report_fatal_error("any parameter with the inalloca attribute must be "
3556 "the only memory argument");
3560 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3561 NumBytes - NumBytesToPush, dl);
3563 SDValue RetAddrFrIdx;
3564 // Load return address for tail calls.
3565 if (isTailCall && FPDiff)
3566 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3567 Is64Bit, FPDiff, dl);
3569 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3570 SmallVector<SDValue, 8> MemOpChains;
3573 // The next loop assumes that the locations are in the same order of the
3574 // Outs array.
3575 assert(isSortedByValueNo(ArgLocs) &&
3576 "Argument Location list must be sorted before lowering");
3578 // Walk the register/memloc assignments, inserting copies/loads. In the case
3579 // of tail call optimization arguments are handle later.
3580 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3581 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3583 assert(OutIndex < Outs.size() && "Invalid Out index");
3584 // Skip inalloca arguments, they have already been written.
3585 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3586 if (Flags.isInAlloca())
3589 CCValAssign &VA = ArgLocs[I];
3590 EVT RegVT = VA.getLocVT();
3591 SDValue Arg = OutVals[OutIndex];
3592 bool isByVal = Flags.isByVal();
3594 // Promote the value if needed.
3595 switch (VA.getLocInfo()) {
3596 default: llvm_unreachable("Unknown loc info!");
3597 case CCValAssign::Full: break;
3598 case CCValAssign::SExt:
3599 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3601 case CCValAssign::ZExt:
3602 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3604 case CCValAssign::AExt:
3605 if (Arg.getValueType().isVector() &&
3606 Arg.getValueType().getVectorElementType() == MVT::i1)
3607 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3608 else if (RegVT.is128BitVector()) {
3609 // Special case: passing MMX values in XMM registers.
3610 Arg = DAG.getBitcast(MVT::i64, Arg);
3611 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3612 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3614 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3616 case CCValAssign::BCvt:
3617 Arg = DAG.getBitcast(RegVT, Arg);
3619 case CCValAssign::Indirect: {
3620 // Store the argument.
3621 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3622 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3623 Chain = DAG.getStore(
3624 Chain, dl, Arg, SpillSlot,
3625 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3631 if (VA.needsCustom()) {
3632 assert(VA.getValVT() == MVT::v64i1 &&
3633 "Currently the only custom case is when we split v64i1 to 2 regs");
3634 // Split v64i1 value into two registers
3635 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3637 } else if (VA.isRegLoc()) {
3638 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3639 if (isVarArg && IsWin64) {
3640 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3641 // shadow reg if callee is a varargs function.
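// For example, a double passed in XMM1 to a varargs callee is also copied
// into RDX (its paired shadow GPR) so a callee that reads the argument from
// the GPR home area still sees the value.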
3642 unsigned ShadowReg = 0;
3643 switch (VA.getLocReg()) {
3644 case X86::XMM0: ShadowReg = X86::RCX; break;
3645 case X86::XMM1: ShadowReg = X86::RDX; break;
3646 case X86::XMM2: ShadowReg = X86::R8; break;
3647 case X86::XMM3: ShadowReg = X86::R9; break;
3650 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3652 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3653 assert(VA.isMemLoc());
3654 if (!StackPtr.getNode())
3655 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3656 getPointerTy(DAG.getDataLayout()));
3657 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3658 dl, DAG, VA, Flags));
3662 if (!MemOpChains.empty())
3663 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3665 if (Subtarget.isPICStyleGOT()) {
3666 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3669 RegsToPass.push_back(std::make_pair(
3670 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3671 getPointerTy(DAG.getDataLayout()))));
3673 // If we are tail calling and generating PIC/GOT style code load the
3674 // address of the callee into ECX. The value in ecx is used as target of
3675 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3676 // for tail calls on PIC/GOT architectures. Normally we would just put the
3677 // address of GOT into ebx and then call target@PLT. But for tail calls
3678 // ebx would be restored (since ebx is callee saved) before jumping to the
3679 // target.
3681 // Note: The actual moving to ECX is done further down.
3682 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3683 if (G && !G->getGlobal()->hasLocalLinkage() &&
3684 G->getGlobal()->hasDefaultVisibility())
3685 Callee = LowerGlobalAddress(Callee, DAG);
3686 else if (isa<ExternalSymbolSDNode>(Callee))
3687 Callee = LowerExternalSymbol(Callee, DAG);
3691 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3692 // From AMD64 ABI document:
3693 // For calls that may call functions that use varargs or stdargs
3694 // (prototype-less calls or calls to functions containing ellipsis (...) in
3695 // the declaration) %al is used as hidden argument to specify the number
3696 // of SSE registers used. The contents of %al do not need to match exactly
3697 // the number of registers, but must be an upper bound on the number of SSE
3698 // registers used and is in the range 0 - 8 inclusive.
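// For instance, a call to printf("%f\n", x) with one XMM argument ends up
// with AL set to 1 before the call (any upper bound up to 8 would also be
// valid).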
3700 // Count the number of XMM registers allocated.
3701 static const MCPhysReg XMMArgRegs[] = {
3702 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3703 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3705 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3706 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3707 && "SSE registers cannot be used when SSE is disabled");
3709 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3710 DAG.getConstant(NumXMMRegs, dl,
3714 if (isVarArg && IsMustTail) {
3715 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3716 for (const auto &F : Forwards) {
3717 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3718 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3722 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3723 // don't need this because the eligibility check rejects calls that require
3724 // shuffling arguments passed in memory.
3725 if (!IsSibcall && isTailCall) {
3726 // Force all the incoming stack arguments to be loaded from the stack
3727 // before any new outgoing arguments are stored to the stack, because the
3728 // outgoing stack slots may alias the incoming argument stack slots, and
3729 // the alias isn't otherwise explicit. This is slightly more conservative
3730 // than necessary, because it means that each store effectively depends
3731 // on every argument instead of just those arguments it would clobber.
3732 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3734 SmallVector<SDValue, 8> MemOpChains2;
3737 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3739 CCValAssign &VA = ArgLocs[I];
3741 if (VA.isRegLoc()) {
3742 if (VA.needsCustom()) {
3743 assert((CallConv == CallingConv::X86_RegCall) &&
3744 "Expecting custom case only in regcall calling convention");
3745 // This means that we are in special case where one argument was
3746 // passed through two register locations - Skip the next location
3753 assert(VA.isMemLoc());
3754 SDValue Arg = OutVals[OutsIndex];
3755 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3756 // Skip inalloca arguments. They don't require any work.
3757 if (Flags.isInAlloca())
3759 // Create frame index.
3760 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3761 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3762 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3763 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3765 if (Flags.isByVal()) {
3766 // Copy relative to framepointer.
3767 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3768 if (!StackPtr.getNode())
3769 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3770 getPointerTy(DAG.getDataLayout()));
3771 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3774 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3778 // Store relative to framepointer.
3779 MemOpChains2.push_back(DAG.getStore(
3780 ArgChain, dl, Arg, FIN,
3781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3785 if (!MemOpChains2.empty())
3786 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3788 // Store the return address to the appropriate stack slot.
3789 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3790 getPointerTy(DAG.getDataLayout()),
3791 RegInfo->getSlotSize(), FPDiff, dl);
3794 // Build a sequence of copy-to-reg nodes chained together with token chain
3795 // and flag operands which copy the outgoing args into registers.
3797 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3798 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3799 RegsToPass[i].second, InFlag);
3800 InFlag = Chain.getValue(1);
3803 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3804 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3805 // In the 64-bit large code model, we have to make all calls
3806 // through a register, since the call instruction's 32-bit
3807 // pc-relative offset may not be large enough to hold the whole
3809 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3810 // If the callee is a GlobalAddress node (quite common, every direct call
3811 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3812 // it.
3813 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3815 // We should use extra load for direct calls to dllimported functions in
3816 // non-JIT mode.
3817 const GlobalValue *GV = G->getGlobal();
3818 if (!GV->hasDLLImportStorageClass()) {
3819 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3821 Callee = DAG.getTargetGlobalAddress(
3822 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3824 if (OpFlags == X86II::MO_GOTPCREL) {
3826 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3827 getPointerTy(DAG.getDataLayout()), Callee);
3828 // Add extra indirection
3829 Callee = DAG.getLoad(
3830 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3831 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3834 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3835 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3836 unsigned char OpFlags =
3837 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3839 Callee = DAG.getTargetExternalSymbol(
3840 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3842 if (OpFlags == X86II::MO_GOTPCREL) {
3843 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3844 getPointerTy(DAG.getDataLayout()), Callee);
3845 Callee = DAG.getLoad(
3846 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3847 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3849 } else if (Subtarget.isTarget64BitILP32() &&
3850 Callee->getValueType(0) == MVT::i32) {
3851 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI
3852 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3855 // Returns a chain & a flag for retval copy to use.
3856 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3857 SmallVector<SDValue, 8> Ops;
3859 if (!IsSibcall && isTailCall) {
3860 Chain = DAG.getCALLSEQ_END(Chain,
3861 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3862 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3863 InFlag = Chain.getValue(1);
3866 Ops.push_back(Chain);
3867 Ops.push_back(Callee);
3870 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3872 // Add argument registers to the end of the list so that they are known live
3874 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3875 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3876 RegsToPass[i].second.getValueType()));
3878 // Add a register mask operand representing the call-preserved registers.
3879 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3880 // set X86_INTR calling convention because it has the same CSR mask
3881 // (same preserved registers).
3882 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3883 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3884 assert(Mask && "Missing call preserved mask for calling convention");
3886 // If this is an invoke in a 32-bit function using a funclet-based
3887 // personality, assume the function clobbers all registers. If an exception
3888 // is thrown, the runtime will not restore CSRs.
3889 // FIXME: Model this more precisely so that we can register allocate across
3890 // the normal edge and spill and fill across the exceptional edge.
3891 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3892 const Function &CallerFn = MF.getFunction();
3893 EHPersonality Pers =
3894 CallerFn.hasPersonalityFn()
3895 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3896 : EHPersonality::Unknown;
3897 if (isFuncletEHPersonality(Pers))
3898 Mask = RegInfo->getNoPreservedMask();
3901 // Define a new register mask from the existing mask.
3902 uint32_t *RegMask = nullptr;
3904 // In some calling conventions we need to remove the used physical registers
3905 // from the reg mask.
3906 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3907 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3909 // Allocate a new Reg Mask and copy Mask.
3910 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3911 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3912 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3914 // Make sure all sub registers of the argument registers are reset
3915 // in the RegMask.
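// For example, if ECX carries an argument under regcall, the bits for ECX
// and its sub-registers CX, CH and CL are all cleared so the register
// allocator treats them as clobbered by the call.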
3916 for (auto const &RegPair : RegsToPass)
3917 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3918 SubRegs.isValid(); ++SubRegs)
3919 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3921 // Create the RegMask Operand according to our updated mask.
3922 Ops.push_back(DAG.getRegisterMask(RegMask));
3924 // Create the RegMask Operand according to the static mask.
3925 Ops.push_back(DAG.getRegisterMask(Mask));
3928 if (InFlag.getNode())
3929 Ops.push_back(InFlag);
3933 //// If this is the first return lowered for this function, add the regs
3934 //// to the liveout set for the function.
3935 // This isn't right, although it's probably harmless on x86; liveouts
3936 // should be computed from returns not tail calls. Consider a void
3937 // function making a tail call to a function returning int.
3938 MF.getFrameInfo().setHasTailCall();
3939 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3942 if (HasNoCfCheck && IsCFProtectionSupported) {
3943 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
3945 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3947 InFlag = Chain.getValue(1);
3949 // Create the CALLSEQ_END node.
3950 unsigned NumBytesForCalleeToPop;
3951 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3952 DAG.getTarget().Options.GuaranteedTailCallOpt))
3953 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3954 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3955 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3956 SR == StackStructReturn)
3957 // If this is a call to a struct-return function, the callee
3958 // pops the hidden struct pointer, so we have to push it back.
3959 // This is common for Darwin/X86, Linux & Mingw32 targets.
3960 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3961 NumBytesForCalleeToPop = 4;
3963 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3965 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3966 // No need to reset the stack after the call if the call doesn't return. To
3967 // make the MI verify, we'll pretend the callee does it for us.
3968 NumBytesForCalleeToPop = NumBytes;
3971 // Returns a flag for retval copy to use.
3973 Chain = DAG.getCALLSEQ_END(Chain,
3974 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3975 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3978 InFlag = Chain.getValue(1);
3981 // Handle result values, copying them out of physregs into vregs that we
3983 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3987 //===----------------------------------------------------------------------===//
3988 // Fast Calling Convention (tail call) implementation
3989 //===----------------------------------------------------------------------===//
3991 // Like std call (the callee cleans up the arguments), except that ECX is
3992 // reserved for storing the tail called function address. Only 2 registers are
3993 // free for argument passing (inreg). Tail call optimization is performed
3994 // provided:
3995 // * tailcallopt is enabled
3996 // * caller/callee are fastcc
3997 // On X86_64 architecture with GOT-style position independent code only local
3998 // (within module) calls are supported at the moment.
3999 // To keep the stack aligned according to platform abi the function
4000 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
4001 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
4002 // If a tail called function callee has more arguments than the caller the
4003 // caller needs to make sure that there is room to move the RETADDR to. This is
4004 // achieved by reserving an area the size of the argument delta right after the
4005 // original RETADDR, but before the saved framepointer or the spilled registers
4006 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
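// Illustrative numbers (not from the original text): on x86-32 with 4-byte
// slots the callee above needs two more argument slots than the caller, so
// an 8-byte area is reserved right after the original RETADDR (before the
// saved frame pointer) and the return address is moved into it before the
// jump; LowerCall computes this shift as a negative FPDiff.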
4018 /// Align the stack size, e.g. to 16n + 12 bytes, to satisfy a 16-byte
4019 /// alignment requirement.
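/// For example, on a 32-bit target (4-byte slot, 16-byte stack alignment) a
/// StackSize of 20 becomes 28 and a StackSize of 32 becomes 44, both of the
/// form 16n + 12, so the stack stays 16-byte aligned once the 4-byte return
/// address slot is added.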
4021 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4022 SelectionDAG& DAG) const {
4023 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4024 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4025 unsigned StackAlignment = TFI.getStackAlignment();
4026 uint64_t AlignMask = StackAlignment - 1;
4027 int64_t Offset = StackSize;
4028 unsigned SlotSize = RegInfo->getSlotSize();
4029 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
4030 // Number smaller than 12 so just add the difference.
4031 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
4033 // Mask out lower bits, add stackalignment once plus the 12 bytes.
4034 Offset = ((~AlignMask) & Offset) + StackAlignment +
4035 (StackAlignment-SlotSize);
4040 /// Return true if the given stack call argument is already available in the
4041 /// same position (relatively) of the caller's incoming argument stack.
4043 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4044 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4045 const X86InstrInfo *TII, const CCValAssign &VA) {
4046 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4049 // Look through nodes that don't alter the bits of the incoming value.
4050 unsigned Op = Arg.getOpcode();
4051 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4052 Arg = Arg.getOperand(0);
4055 if (Op == ISD::TRUNCATE) {
4056 const SDValue &TruncInput = Arg.getOperand(0);
4057 if (TruncInput.getOpcode() == ISD::AssertZext &&
4058 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4059 Arg.getValueType()) {
4060 Arg = TruncInput.getOperand(0);
4068 if (Arg.getOpcode() == ISD::CopyFromReg) {
4069 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4070 if (!TargetRegisterInfo::isVirtualRegister(VR))
4072 MachineInstr *Def = MRI->getVRegDef(VR);
4075 if (!Flags.isByVal()) {
4076 if (!TII->isLoadFromStackSlot(*Def, FI))
4079 unsigned Opcode = Def->getOpcode();
4080 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4081 Opcode == X86::LEA64_32r) &&
4082 Def->getOperand(1).isFI()) {
4083 FI = Def->getOperand(1).getIndex();
4084 Bytes = Flags.getByValSize();
4088 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4089 if (Flags.isByVal())
4090 // ByVal argument is passed in as a pointer but it's now being
4091 // dereferenced. e.g.
4092 // define @foo(%struct.X* %A) {
4093 // tail call @bar(%struct.X* byval %A)
4096 SDValue Ptr = Ld->getBasePtr();
4097 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4100 FI = FINode->getIndex();
4101 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4102 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4103 FI = FINode->getIndex();
4104 Bytes = Flags.getByValSize();
4108 assert(FI != INT_MAX);
4109 if (!MFI.isFixedObjectIndex(FI))
4112 if (Offset != MFI.getObjectOffset(FI))
4115 // If this is not byval, check that the argument stack object is immutable.
4116 // inalloca and argument copy elision can create mutable argument stack
4117 // objects. Byval objects can be mutated, but a byval call intends to pass the
4118 // mutated memory.
4119 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4122 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4123 // If the argument location is wider than the argument type, check that any
4124 // extension flags match.
4125 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4126 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4131 return Bytes == MFI.getObjectSize(FI);
4132 }
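// For example, if an outgoing i32 argument is simply a load from the
// caller's own immutable fixed stack slot at the same offset and of the same
// size, it already lives where the callee expects it and no copy is needed.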
4134 /// Check whether the call is eligible for tail call optimization. Targets
4135 /// that want to do tail call optimization should implement this function.
4136 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4137 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4138 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4139 const SmallVectorImpl<ISD::OutputArg> &Outs,
4140 const SmallVectorImpl<SDValue> &OutVals,
4141 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4142 if (!mayTailCallThisCC(CalleeCC))
4145 // If -tailcallopt is specified, make fastcc functions tail-callable.
4146 MachineFunction &MF = DAG.getMachineFunction();
4147 const Function &CallerF = MF.getFunction();
4149 // If the function return type is x86_fp80 and the callee return type is not,
4150 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4151 // perform a tailcall optimization here.
4152 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4155 CallingConv::ID CallerCC = CallerF.getCallingConv();
4156 bool CCMatch = CallerCC == CalleeCC;
4157 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4158 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4160 // Win64 functions have extra shadow space for argument homing. Don't do the
4161 // sibcall if the caller and callee have mismatched expectations for this
4162 // space.
4163 if (IsCalleeWin64 != IsCallerWin64)
4166 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4167 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4172 // Look for obvious safe cases to perform tail call optimization that do not
4173 // require ABI changes. This is what gcc calls sibcall.
4175 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4176 // emit a special epilogue.
4177 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4178 if (RegInfo->needsStackRealignment(MF))
4181 // Also avoid sibcall optimization if either caller or callee uses struct
4182 // return semantics.
4183 if (isCalleeStructRet || isCallerStructRet)
4186 // Do not sibcall optimize vararg calls unless all arguments are passed via
4187 // registers.
4188 LLVMContext &C = *DAG.getContext();
4189 if (isVarArg && !Outs.empty()) {
4190 // Optimizing for varargs on Win64 is unlikely to be safe without
4191 // additional testing.
4192 if (IsCalleeWin64 || IsCallerWin64)
4195 SmallVector<CCValAssign, 16> ArgLocs;
4196 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4198 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4199 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4200 if (!ArgLocs[i].isRegLoc())
4204 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4205 // stack. Therefore, if it's not used by the call it is not safe to optimize
4206 // this into a sibcall.
4207 bool Unused = false;
4208 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4215 SmallVector<CCValAssign, 16> RVLocs;
4216 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4217 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4218 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4219 CCValAssign &VA = RVLocs[i];
4220 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4225 // Check that the call results are passed in the same way.
4226 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4227 RetCC_X86, RetCC_X86))
4229 // The callee has to preserve all registers the caller needs to preserve.
4230 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4231 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4233 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4234 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4238 unsigned StackArgsSize = 0;
4240 // If the callee takes no arguments then go on to check the results of the
4241 // call.
4242 if (!Outs.empty()) {
4243 // Check if stack adjustment is needed. For now, do not do this if any
4244 // argument is passed on the stack.
4245 SmallVector<CCValAssign, 16> ArgLocs;
4246 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4248 // Allocate shadow area for Win64
4250 CCInfo.AllocateStack(32, 8);
4252 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4253 StackArgsSize = CCInfo.getNextStackOffset();
4255 if (CCInfo.getNextStackOffset()) {
4256 // Check if the arguments are already laid out in the right way as
4257 // the caller's fixed stack objects.
4258 MachineFrameInfo &MFI = MF.getFrameInfo();
4259 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4260 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4261 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4262 CCValAssign &VA = ArgLocs[i];
4263 SDValue Arg = OutVals[i];
4264 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4265 if (VA.getLocInfo() == CCValAssign::Indirect)
4267 if (!VA.isRegLoc()) {
4268 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4275 bool PositionIndependent = isPositionIndependent();
4276 // If the tailcall address may be in a register, then make sure it's
4277 // possible to register allocate for it. In 32-bit, the call address can
4278 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4279 // callee-saved registers are restored. These happen to be the same
4280 // registers used to pass 'inreg' arguments so watch out for those.
4281 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4282 !isa<ExternalSymbolSDNode>(Callee)) ||
4283 PositionIndependent)) {
4284 unsigned NumInRegs = 0;
4285 // In PIC we need an extra register to formulate the address computation
4286 // for the callee.
4287 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4289 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4290 CCValAssign &VA = ArgLocs[i];
4293 unsigned Reg = VA.getLocReg();
4296 case X86::EAX: case X86::EDX: case X86::ECX:
4297 if (++NumInRegs == MaxInRegs)
4304 const MachineRegisterInfo &MRI = MF.getRegInfo();
4305 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4309 bool CalleeWillPop =
4310 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4311 MF.getTarget().Options.GuaranteedTailCallOpt);
4313 if (unsigned BytesToPop =
4314 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4315 // If we have bytes to pop, the callee must pop them.
4316 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4317 if (!CalleePopMatches)
4319 } else if (CalleeWillPop && StackArgsSize > 0) {
4320 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4328 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4329 const TargetLibraryInfo *libInfo) const {
4330 return X86::createFastISel(funcInfo, libInfo);
4333 //===----------------------------------------------------------------------===//
4334 // Other Lowering Hooks
4335 //===----------------------------------------------------------------------===//
4337 static bool MayFoldLoad(SDValue Op) {
4338 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4341 static bool MayFoldIntoStore(SDValue Op) {
4342 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4345 static bool MayFoldIntoZeroExtend(SDValue Op) {
4346 if (Op.hasOneUse()) {
4347 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4348 return (ISD::ZERO_EXTEND == Opcode);
4353 static bool isTargetShuffle(unsigned Opcode) {
4355 default: return false;
4356 case X86ISD::BLENDI:
4357 case X86ISD::PSHUFB:
4358 case X86ISD::PSHUFD:
4359 case X86ISD::PSHUFHW:
4360 case X86ISD::PSHUFLW:
4362 case X86ISD::INSERTPS:
4363 case X86ISD::EXTRQI:
4364 case X86ISD::INSERTQI:
4365 case X86ISD::PALIGNR:
4366 case X86ISD::VSHLDQ:
4367 case X86ISD::VSRLDQ:
4368 case X86ISD::MOVLHPS:
4369 case X86ISD::MOVHLPS:
4370 case X86ISD::MOVLPS:
4371 case X86ISD::MOVLPD:
4372 case X86ISD::MOVSHDUP:
4373 case X86ISD::MOVSLDUP:
4374 case X86ISD::MOVDDUP:
4377 case X86ISD::UNPCKL:
4378 case X86ISD::UNPCKH:
4379 case X86ISD::VBROADCAST:
4380 case X86ISD::VPERMILPI:
4381 case X86ISD::VPERMILPV:
4382 case X86ISD::VPERM2X128:
4383 case X86ISD::VPERMIL2:
4384 case X86ISD::VPERMI:
4385 case X86ISD::VPPERM:
4386 case X86ISD::VPERMV:
4387 case X86ISD::VPERMV3:
4388 case X86ISD::VZEXT_MOVL:
4393 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4395 default: return false;
4397 case X86ISD::PSHUFB:
4398 case X86ISD::VPERMILPV:
4399 case X86ISD::VPERMIL2:
4400 case X86ISD::VPPERM:
4401 case X86ISD::VPERMV:
4402 case X86ISD::VPERMV3:
4404 // 'Faux' Target Shuffles.
4411 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4412 MachineFunction &MF = DAG.getMachineFunction();
4413 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4414 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4415 int ReturnAddrIndex = FuncInfo->getRAIndex();
4417 if (ReturnAddrIndex == 0) {
4418 // Set up a frame object for the return address.
4419 unsigned SlotSize = RegInfo->getSlotSize();
4420 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4423 FuncInfo->setRAIndex(ReturnAddrIndex);
4426 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4429 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4430 bool hasSymbolicDisplacement) {
4431 // Offset should fit into 32 bit immediate field.
4432 if (!isInt<32>(Offset))
4435 // If we don't have a symbolic displacement, we don't have any extra restrictions.
4437 if (!hasSymbolicDisplacement)
4440 // FIXME: Some tweaks might be needed for medium code model.
4441 if (M != CodeModel::Small && M != CodeModel::Kernel)
4444 // For the small code model we assume the last object lies at least 16MB below
4445 // the 31-bit boundary, so offsets under 16MB are safe. We can also accept fairly
4446 // large negative offsets, since all objects live in the positive half of the address space.
4447 if (M == CodeModel::Small && Offset < 16*1024*1024)
4450 // For the kernel code model we know that all objects reside in the negative
4451 // half of the 32-bit address space. We must not accept negative offsets, since
4452 // they could step just outside that range, but fairly large positive ones are fine.
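// For example, with a symbolic displacement in the small code model an offset
// of 0xFFFFFF is accepted but 0x1000000 (16MB) is not, and in the kernel code
// model only non-negative offsets are accepted.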
4453 if (M == CodeModel::Kernel && Offset >= 0)
4459 /// Determines whether the callee is required to pop its own arguments.
4460 /// Callee pop is necessary to support tail calls.
4461 bool X86::isCalleePop(CallingConv::ID CallingConv,
4462 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4463 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4464 // can guarantee TCO.
4465 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4468 switch (CallingConv) {
4471 case CallingConv::X86_StdCall:
4472 case CallingConv::X86_FastCall:
4473 case CallingConv::X86_ThisCall:
4474 case CallingConv::X86_VectorCall:
4479 /// Return true if the condition is an unsigned comparison operation.
4480 static bool isX86CCUnsigned(unsigned X86CC) {
4483 llvm_unreachable("Invalid integer condition!");
4499 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4500 switch (SetCCOpcode) {
4501 default: llvm_unreachable("Invalid integer condition!");
4502 case ISD::SETEQ: return X86::COND_E;
4503 case ISD::SETGT: return X86::COND_G;
4504 case ISD::SETGE: return X86::COND_GE;
4505 case ISD::SETLT: return X86::COND_L;
4506 case ISD::SETLE: return X86::COND_LE;
4507 case ISD::SETNE: return X86::COND_NE;
4508 case ISD::SETULT: return X86::COND_B;
4509 case ISD::SETUGT: return X86::COND_A;
4510 case ISD::SETULE: return X86::COND_BE;
4511 case ISD::SETUGE: return X86::COND_AE;
4515 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4516 /// condition code, returning the condition code and the LHS/RHS of the
4517 /// comparison to make.
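// For example, an unsigned (setcc X, Y, setult) maps to X86::COND_B, and the
// constant special cases below turn signed compares against -1, 0 and 1 into
// plain sign/zero tests against 0.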
4518 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4519 bool isFP, SDValue &LHS, SDValue &RHS,
4520 SelectionDAG &DAG) {
4522 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4523 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4524 // X > -1   -> compare X against 0 and jump if the sign flag is clear.
4525 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4526 return X86::COND_NS;
4528 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4529 // X < 0   -> compare X against 0 and jump on the sign flag.
4532 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4534 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4535 return X86::COND_LE;
4539 return TranslateIntegerX86CC(SetCCOpcode);
4542 // First determine if it is required or is profitable to flip the operands.
4544 // If LHS is a foldable load, but RHS is not, flip the condition.
4545 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4546 !ISD::isNON_EXTLoad(RHS.getNode())) {
4547 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4548 std::swap(LHS, RHS);
4551 switch (SetCCOpcode) {
4557 std::swap(LHS, RHS);
4561 // On a floating point condition, the flags are set as follows:
//  ZF | PF | CF |  op
4563 //  0 | 0 | 0 | X > Y
4564 //  0 | 0 | 1 | X < Y
4565 //  1 | 0 | 0 | X == Y
4566 //  1 | 1 | 1 | unordered
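// For example SETOLT(X, Y) had its operands swapped above, so it reuses the
// SETGT mapping and returns COND_A (CF == 0 and ZF == 0), which can never be
// satisfied by an unordered result.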
4567 switch (SetCCOpcode) {
4568 default: llvm_unreachable("Condcode should be pre-legalized away");
4570 case ISD::SETEQ: return X86::COND_E;
4571 case ISD::SETOLT: // flipped
4573 case ISD::SETGT: return X86::COND_A;
4574 case ISD::SETOLE: // flipped
4576 case ISD::SETGE: return X86::COND_AE;
4577 case ISD::SETUGT: // flipped
4579 case ISD::SETLT: return X86::COND_B;
4580 case ISD::SETUGE: // flipped
4582 case ISD::SETLE: return X86::COND_BE;
4584 case ISD::SETNE: return X86::COND_NE;
4585 case ISD::SETUO: return X86::COND_P;
4586 case ISD::SETO: return X86::COND_NP;
4588 case ISD::SETUNE: return X86::COND_INVALID;
4592 /// Is there a floating point cmov for the specific X86 condition code?
4593 /// The current x86 ISA includes the following FP cmov instructions:
4594 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4595 static bool hasFPCMov(unsigned X86CC) {
4612 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4614 MachineFunction &MF,
4615 unsigned Intrinsic) const {
4617 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4621 Info.opc = ISD::INTRINSIC_W_CHAIN;
4622 Info.flags = MachineMemOperand::MONone;
4625 switch (IntrData->Type) {
4626 case TRUNCATE_TO_MEM_VI8:
4627 case TRUNCATE_TO_MEM_VI16:
4628 case TRUNCATE_TO_MEM_VI32: {
4629 Info.ptrVal = I.getArgOperand(0);
4630 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4631 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4632 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4634 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4635 ScalarVT = MVT::i16;
4636 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4637 ScalarVT = MVT::i32;
4639 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4641 Info.flags |= MachineMemOperand::MOStore;
4651 /// Returns true if the target can instruction select the
4652 /// specified FP immediate natively. If false, the legalizer will
4653 /// materialize the FP immediate as a load from a constant pool.
4654 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4655 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4656 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4662 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4663 ISD::LoadExtType ExtTy,
4665 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4666 // relocations must target a movq or addq instruction: don't let the load shrink.
4667 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4668 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4669 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4670 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4674 /// Returns true if it is beneficial to convert a load of a constant
4675 /// to just the constant itself.
4676 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4678 assert(Ty->isIntegerTy());
4680 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4681 if (BitSize == 0 || BitSize > 64)
4686 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4687 // TODO: It might be a win to ease or lift this restriction, but the generic
4688 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4689 if (VT.isVector() && Subtarget.hasAVX512())
4695 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4696 unsigned Index) const {
4697 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4700 // Mask vectors support all subregister combinations and operations that
4701 // extract half of vector.
4702 if (ResVT.getVectorElementType() == MVT::i1)
4703 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4704 (Index == ResVT.getVectorNumElements()));
4706 return (Index % ResVT.getVectorNumElements()) == 0;
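// For example, extracting a v4i32 subvector from a v8i32 source at index 4
// (the upper half) is considered cheap, while an extract starting at index 2
// is not, since only subvector-aligned extracts map to a plain subregister
// operation.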
4709 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4710 // Speculate cttz only if we can directly use TZCNT.
4711 return Subtarget.hasBMI();
4714 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4715 // Speculate ctlz only if we can directly use LZCNT.
4716 return Subtarget.hasLZCNT();
4719 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4720 EVT BitcastVT) const {
4721 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4724 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4727 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4728 const SelectionDAG &DAG) const {
4729 // Do not merge to float value size (128 bytes) if no implicit
4730 // float attribute is set.
4731 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4732 Attribute::NoImplicitFloat);
4735 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4736 return (MemVT.getSizeInBits() <= MaxIntSize);
4741 bool X86TargetLowering::isCtlzFast() const {
4742 return Subtarget.hasFastLZCNT();
4745 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4746 const Instruction &AndI) const {
4750 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4751 EVT VT = Y.getValueType();
4756 if (!Subtarget.hasBMI())
4759 // There are only 32-bit and 64-bit forms for 'andn'.
4760 if (VT != MVT::i32 && VT != MVT::i64)
4763 // A mask and compare against constant is ok for an 'andn' too
4764 // even though the BMI instruction doesn't have an immediate form.
4769 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4770 EVT VT = Y.getValueType();
4772 if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
4773 return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
4777 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4780 if (VT == MVT::v4i32)
4783 return Subtarget.hasSSE2();
4786 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4787 MVT VT = MVT::getIntegerVT(NumBits);
4788 if (isTypeLegal(VT))
4791 // PMOVMSKB can handle this.
4792 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4795 // VPMOVMSKB can handle this.
4796 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4799 // TODO: Allow 64-bit type for 32-bit target.
4800 // TODO: 512-bit types should be allowed, but make sure that those
4801 // cases are handled in combineVectorSizedSetCCEquality().
4803 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4806 /// Val is the undef sentinel value or equal to the specified value.
4807 static bool isUndefOrEqual(int Val, int CmpVal) {
4808 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4811 /// Val is either the undef or zero sentinel value.
4812 static bool isUndefOrZero(int Val) {
4813 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4816 /// Return true if every element in Mask, beginning
4817 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4818 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4820 if (Mask[i] != SM_SentinelUndef)
4825 /// Return true if Val falls within the specified range [Low, Hi).
4826 static bool isInRange(int Val, int Low, int Hi) {
4827 return (Val >= Low && Val < Hi);
4830 /// Return true if the value of any element in Mask falls within the specified range [Low, Hi).
4832 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
4834 if (isInRange(M, Low, Hi))
4839 /// Return true if Val is undef or if its value falls within the
4840 /// specified range [Low, Hi).
4841 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4842 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
4845 /// Return true if every element in Mask is undef or if its value
4846 /// falls within the specified range [Low, Hi).
4847 static bool isUndefOrInRange(ArrayRef<int> Mask,
4850 if (!isUndefOrInRange(M, Low, Hi))
4855 /// Return true if Val is undef, zero or if its value falls within the
4856 /// specified range [Low, Hi).
4857 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4858 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
4861 /// Return true if every element in Mask is undef, zero or if its value
4862 /// falls within the specified range [Low, Hi).
4863 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4865 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4870 /// Return true if every element in Mask, beginning
4871 /// from position Pos and ending in Pos + Size, falls within the specified
4872 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
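/// For example Mask = {4, -1, 6, 7} with Pos = 0, Size = 4 and Low = 4 matches
/// the sequence 4,5,6,7, since the undef element may stand in for 5.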
4873 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4874 unsigned Size, int Low, int Step = 1) {
4875 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4876 if (!isUndefOrEqual(Mask[i], Low))
4881 /// Return true if every element in Mask, beginning
4882 /// from position Pos and ending in Pos+Size, falls within the specified
4883 /// sequential range [Low, Low+Size), or is undef or is zero.
4884 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4885 unsigned Size, int Low) {
4886 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4887 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4892 /// Return true if every element in Mask, beginning
4893 /// from position Pos and ending in Pos+Size is undef or is zero.
4894 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4896 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4897 if (!isUndefOrZero(Mask[i]))
4902 /// Helper function to test whether a shuffle mask could be
4903 /// simplified by widening the elements being shuffled.
4905 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4906 /// leaves it in an unspecified state.
4908 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4909 /// shuffle masks. The latter have the special property of a '-2' representing
4910 /// a zero-ed lane of a vector.
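/// For example the 4-element mask {0, 1, 6, 7} widens to the 2-element mask
/// {0, 3}, while {0, 2, 4, 6} cannot be widened because the paired elements
/// are not adjacent.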
4911 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4912 SmallVectorImpl<int> &WidenedMask) {
4913 WidenedMask.assign(Mask.size() / 2, 0);
4914 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4916 int M1 = Mask[i + 1];
4918 // If both elements are undef, it's trivial.
4919 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4920 WidenedMask[i / 2] = SM_SentinelUndef;
4924 // Check for an undef mask and a mask value properly aligned to fit with
4925 // a pair of values. If we find such a case, use the non-undef mask's value.
4926 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4927 WidenedMask[i / 2] = M1 / 2;
4930 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4931 WidenedMask[i / 2] = M0 / 2;
4935 // When zeroing, we need to spread the zeroing across both lanes to widen.
4936 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4937 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4938 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4939 WidenedMask[i / 2] = SM_SentinelZero;
4945 // Finally check if the two mask values are adjacent and aligned with their pairing.
4947 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4948 WidenedMask[i / 2] = M0 / 2;
4952 // Otherwise we can't safely widen the elements used in this shuffle.
4955 assert(WidenedMask.size() == Mask.size() / 2 &&
4956 "Incorrect size of mask after widening the elements!");
4961 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4962 bool X86::isZeroNode(SDValue Elt) {
4963 return isNullConstant(Elt) || isNullFPConstant(Elt);
4966 // Build a vector of constants.
4967 // Use an UNDEF node if MaskElt == -1.
4968 // Split 64-bit constants in the 32-bit mode.
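// For example, when i64 is not legal a v2i64 mask constant is emitted as a
// v4i32 build vector and bitcast back to v2i64.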
4969 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4970 const SDLoc &dl, bool IsMask = false) {
4972 SmallVector<SDValue, 32> Ops;
4975 MVT ConstVecVT = VT;
4976 unsigned NumElts = VT.getVectorNumElements();
4977 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4978 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4979 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4983 MVT EltVT = ConstVecVT.getVectorElementType();
4984 for (unsigned i = 0; i < NumElts; ++i) {
4985 bool IsUndef = Values[i] < 0 && IsMask;
4986 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4987 DAG.getConstant(Values[i], dl, EltVT);
4988 Ops.push_back(OpNode);
4990 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4991 DAG.getConstant(0, dl, EltVT));
4993 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4995 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4999 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5000 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5001 assert(Bits.size() == Undefs.getBitWidth() &&
5002 "Unequal constant and undef arrays");
5003 SmallVector<SDValue, 32> Ops;
5006 MVT ConstVecVT = VT;
5007 unsigned NumElts = VT.getVectorNumElements();
5008 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5009 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5010 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5014 MVT EltVT = ConstVecVT.getVectorElementType();
5015 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5017 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5020 const APInt &V = Bits[i];
5021 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5023 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5024 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5025 } else if (EltVT == MVT::f32) {
5026 APFloat FV(APFloat::IEEEsingle(), V);
5027 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5028 } else if (EltVT == MVT::f64) {
5029 APFloat FV(APFloat::IEEEdouble(), V);
5030 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5032 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5036 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5037 return DAG.getBitcast(VT, ConstsNode);
5040 /// Returns a vector of specified type with all zero elements.
5041 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5042 SelectionDAG &DAG, const SDLoc &dl) {
5043 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5044 VT.getVectorElementType() == MVT::i1) &&
5045 "Unexpected vector type");
5047 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5048 // type. This ensures they get CSE'd. But if the integer type is not
5049 // available, use a floating-point +0.0 instead.
5051 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5052 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5053 } else if (VT.getVectorElementType() == MVT::i1) {
5054 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5055 "Unexpected vector type");
5056 Vec = DAG.getConstant(0, dl, VT);
5058 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5059 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5061 return DAG.getBitcast(VT, Vec);
5064 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5065 const SDLoc &dl, unsigned vectorWidth) {
5066 EVT VT = Vec.getValueType();
5067 EVT ElVT = VT.getVectorElementType();
5068 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5069 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5070 VT.getVectorNumElements()/Factor);
5072 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5073 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5074 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5076 // This is the index of the first element of the vectorWidth-bit chunk
5077 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5078 IdxVal &= ~(ElemsPerChunk - 1);
5080 // If the input is a buildvector just emit a smaller one.
5081 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5082 return DAG.getBuildVector(ResultVT, dl,
5083 Vec->ops().slice(IdxVal, ElemsPerChunk));
5085 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5086 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5089 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5090 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5091 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5092 /// instructions or a simple subregister reference. Idx is an index in the
5093 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5094 /// lowering EXTRACT_VECTOR_ELT operations easier.
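/// For example, extract128BitVector(v8i32 Vec, 5, ...) rounds the index down
/// to 4 and returns a v4i32 containing elements 4-7 of Vec.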
5095 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5096 SelectionDAG &DAG, const SDLoc &dl) {
5097 assert((Vec.getValueType().is256BitVector() ||
5098 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5099 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5102 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5103 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5104 SelectionDAG &DAG, const SDLoc &dl) {
5105 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5106 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5109 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5110 SelectionDAG &DAG, const SDLoc &dl,
5111 unsigned vectorWidth) {
5112 assert((vectorWidth == 128 || vectorWidth == 256) &&
5113 "Unsupported vector width");
5114 // Inserting an UNDEF subvector leaves Result unchanged, so just return it.
5117 EVT VT = Vec.getValueType();
5118 EVT ElVT = VT.getVectorElementType();
5119 EVT ResultVT = Result.getValueType();
5121 // Insert the relevant vectorWidth bits.
5122 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5123 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5125 // This is the index of the first element of the vectorWidth-bit chunk
5126 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5127 IdxVal &= ~(ElemsPerChunk - 1);
5129 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5130 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5133 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5134 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5135 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5136 /// simple superregister reference. Idx is an index in the 128 bits
5137 /// we want. It need not be aligned to a 128-bit boundary. That makes
5138 /// lowering INSERT_VECTOR_ELT operations easier.
5139 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5140 SelectionDAG &DAG, const SDLoc &dl) {
5141 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5142 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5145 /// Widen a vector to a larger size with the same scalar type, with the new
5146 /// elements either zero or undef.
5147 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5148 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5150 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5151 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5152 "Unsupported vector widening type");
5153 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5155 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5156 DAG.getIntPtrConstant(0, dl));
5159 // Helper for splitting the operands of an operation into legal-sized pieces
5160 // and applying a function to each part.
5161 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5162 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5163 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5164 // The argument Builder is a function that will be applied on each split part:
5165 //   SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
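// Illustrative (hypothetical) use, splitting a wide integer ADD into pieces
// the target can handle:
//   auto AddBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                        ArrayRef<SDValue> Ops) {
//     return DAG.getNode(ISD::ADD, DL, Ops[0].getValueType(), Ops);
//   };
//   SDValue R = SplitOpsAndApply(DAG, Subtarget, DL, VT, {A, B}, AddBuilder);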
5166 template <typename F>
5167 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5168 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5169 F Builder, bool CheckBWI = true) {
5170 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5171 unsigned NumSubs = 1;
5172 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5173 (!CheckBWI && Subtarget.useAVX512Regs())) {
5174 if (VT.getSizeInBits() > 512) {
5175 NumSubs = VT.getSizeInBits() / 512;
5176 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5178 } else if (Subtarget.hasAVX2()) {
5179 if (VT.getSizeInBits() > 256) {
5180 NumSubs = VT.getSizeInBits() / 256;
5181 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5184 if (VT.getSizeInBits() > 128) {
5185 NumSubs = VT.getSizeInBits() / 128;
5186 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5191 return Builder(DAG, DL, Ops);
5193 SmallVector<SDValue, 4> Subs;
5194 for (unsigned i = 0; i != NumSubs; ++i) {
5195 SmallVector<SDValue, 2> SubOps;
5196 for (SDValue Op : Ops) {
5197 EVT OpVT = Op.getValueType();
5198 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5199 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5200 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5202 Subs.push_back(Builder(DAG, DL, SubOps));
5204 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5207 // Return true if the instruction zeroes the unused upper part of the
5208 // destination and accepts a mask.
5209 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5214 case X86ISD::CMPM_RND:
5220 /// Insert an i1 subvector into an i1 vector.
5221 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5222 const X86Subtarget &Subtarget) {
5225 SDValue Vec = Op.getOperand(0);
5226 SDValue SubVec = Op.getOperand(1);
5227 SDValue Idx = Op.getOperand(2);
5229 if (!isa<ConstantSDNode>(Idx))
5232 // Inserting undef is a nop. We can just return the original vector.
5233 if (SubVec.isUndef())
5236 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5237 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5240 MVT OpVT = Op.getSimpleValueType();
5241 unsigned NumElems = OpVT.getVectorNumElements();
5243 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5245 // Extend to natively supported kshift.
5246 MVT WideOpVT = OpVT;
5247 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5248 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5250 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts if necessary.
5252 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5253 // May need to promote to a legal type.
5254 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5255 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5257 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5260 MVT SubVecVT = SubVec.getSimpleValueType();
5261 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5263 assert(IdxVal + SubVecNumElems <= NumElems &&
5264 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5265 "Unexpected index value in INSERT_SUBVECTOR");
5267 SDValue Undef = DAG.getUNDEF(WideOpVT);
5270 // Zero lower bits of the Vec
5271 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5272 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5274 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5275 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5276 // Merge them together, SubVec should be zero extended.
5277 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5278 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5280 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5281 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5284 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5285 Undef, SubVec, ZeroIdx);
5287 if (Vec.isUndef()) {
5288 assert(IdxVal != 0 && "Unexpected index");
5289 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5290 DAG.getConstant(IdxVal, dl, MVT::i8));
5291 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5294 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5295 assert(IdxVal != 0 && "Unexpected index");
5296 NumElems = WideOpVT.getVectorNumElements();
5297 unsigned ShiftLeft = NumElems - SubVecNumElems;
5298 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5299 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5300 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5301 if (ShiftRight != 0)
5302 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5303 DAG.getConstant(ShiftRight, dl, MVT::i8));
5304 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5307 // Simple case when we put subvector in the upper part
5308 if (IdxVal + SubVecNumElems == NumElems) {
5309 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5310 DAG.getConstant(IdxVal, dl, MVT::i8));
5311 if (SubVecNumElems * 2 == NumElems) {
5312 // Special case, use legal zero extending insert_subvector. This allows
5313 // isel to optimize when bits are known zero.
5314 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5315 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5316 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5319 // Otherwise use explicit shifts to zero the bits.
5320 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5321 Undef, Vec, ZeroIdx);
5322 NumElems = WideOpVT.getVectorNumElements();
5323 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5324 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5325 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5327 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5328 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5331 // Inserting into the middle is more complicated.
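// The sequence below exploits the identity Vec ^ (Vec ^ SubVec) == SubVec:
// the bits being replaced are shifted down to the lsbs, xor'ed with the new
// bits, masked back into position by the kshift pair, and xor'ed with the
// original vector so every bit outside the inserted range is left unchanged.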
5333 NumElems = WideOpVT.getVectorNumElements();
5335 // Widen the vector if needed.
5336 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5337 // Move the current value of the bits being replaced down to the lsbs.
5338 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5339 DAG.getConstant(IdxVal, dl, MVT::i8));
5340 // Xor with the new bit.
5341 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5342 // Shift to MSB, filling bottom bits with 0.
5343 unsigned ShiftLeft = NumElems - SubVecNumElems;
5344 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5345 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5346 // Shift to the final position, filling upper bits with 0.
5347 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5348 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5349 DAG.getConstant(ShiftRight, dl, MVT::i8));
5350 // Xor with original vector leaving the new value.
5351 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5352 // Reduce to original width if needed.
5353 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5356 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5357 unsigned NumElems, SelectionDAG &DAG,
5358 const SDLoc &dl, unsigned VectorWidth) {
5359 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5360 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5363 /// Returns a vector of specified type with all bits set.
5364 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5365 /// Then bitcast to their original type, ensuring they get CSE'd.
5366 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5367 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5368 "Expected a 128/256/512-bit vector type");
5370 APInt Ones = APInt::getAllOnesValue(32);
5371 unsigned NumElts = VT.getSizeInBits() / 32;
5372 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5373 return DAG.getBitcast(VT, Vec);
5376 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5377 SelectionDAG &DAG) {
5378 EVT InVT = In.getValueType();
5379 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5381 if (VT.is128BitVector() && InVT.is128BitVector())
5382 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5383 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5385 // For 256-bit vectors, we only need the lower (128-bit) input half.
5386 // For 512-bit vectors, we only need the lower input half or quarter.
5387 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5388 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5389 In = extractSubVector(In, 0, DAG, DL,
5390 std::max(128, (int)VT.getSizeInBits() / Scale));
5393 return DAG.getNode(Opc, DL, VT, In);
5396 /// Returns a vector_shuffle node for an unpackl operation.
5397 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5398 SDValue V1, SDValue V2) {
5399 SmallVector<int, 8> Mask;
5400 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5401 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5404 /// Returns a vector_shuffle node for an unpackh operation.
5405 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5406 SDValue V1, SDValue V2) {
5407 SmallVector<int, 8> Mask;
5408 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5409 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5412 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5413 /// This produces a shuffle where the low element of V2 is swizzled into the
5414 /// zero/undef vector, landing at element Idx.
5415 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5416 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5418 const X86Subtarget &Subtarget,
5419 SelectionDAG &DAG) {
5420 MVT VT = V2.getSimpleValueType();
5422 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5423 int NumElems = VT.getVectorNumElements();
5424 SmallVector<int, 16> MaskVec(NumElems);
5425 for (int i = 0; i != NumElems; ++i)
5426 // If this is the insertion idx, put the low elt of V2 here.
5427 MaskVec[i] = (i == Idx) ? NumElems : i;
5428 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5431 static SDValue peekThroughBitcasts(SDValue V) {
5432 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5433 V = V.getOperand(0);
5437 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5438 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5439 V.getOperand(0).hasOneUse())
5440 V = V.getOperand(0);
5444 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5445 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5446 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5447 V = V.getOperand(0);
5451 static const Constant *getTargetConstantFromNode(SDValue Op) {
5452 Op = peekThroughBitcasts(Op);
5454 auto *Load = dyn_cast<LoadSDNode>(Op);
5458 SDValue Ptr = Load->getBasePtr();
5459 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5460 Ptr->getOpcode() == X86ISD::WrapperRIP)
5461 Ptr = Ptr->getOperand(0);
5463 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5464 if (!CNode || CNode->isMachineConstantPoolEntry())
5467 return dyn_cast<Constant>(CNode->getConstVal());
5470 // Extract raw constant bits from constant pools.
5471 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5473 SmallVectorImpl<APInt> &EltBits,
5474 bool AllowWholeUndefs = true,
5475 bool AllowPartialUndefs = true) {
5476 assert(EltBits.empty() && "Expected an empty EltBits vector");
5478 Op = peekThroughBitcasts(Op);
5480 EVT VT = Op.getValueType();
5481 unsigned SizeInBits = VT.getSizeInBits();
5482 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5483 unsigned NumElts = SizeInBits / EltSizeInBits;
5485 // Bitcast a source array of element bits to the target size.
5486 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5487 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5488 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5489 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5490 "Constant bit sizes don't match");
5492 // Don't split if we don't allow undef bits.
5493 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5494 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5497 // If we're already the right size, don't bother bitcasting.
5498 if (NumSrcElts == NumElts) {
5499 UndefElts = UndefSrcElts;
5500 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5504 // Extract all the undef/constant element data and pack into single bitsets.
5505 APInt UndefBits(SizeInBits, 0);
5506 APInt MaskBits(SizeInBits, 0);
5508 for (unsigned i = 0; i != NumSrcElts; ++i) {
5509 unsigned BitOffset = i * SrcEltSizeInBits;
5510 if (UndefSrcElts[i])
5511 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5512 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5515 // Split the undef/constant single bitset data into the target elements.
5516 UndefElts = APInt(NumElts, 0);
5517 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5519 for (unsigned i = 0; i != NumElts; ++i) {
5520 unsigned BitOffset = i * EltSizeInBits;
5521 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5523 // Only treat an element as UNDEF if all bits are UNDEF.
5524 if (UndefEltBits.isAllOnesValue()) {
5525 if (!AllowWholeUndefs)
5527 UndefElts.setBit(i);
5531 // If only some bits are UNDEF then treat them as zero (or bail if not supported).
5533 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5536 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5537 EltBits[i] = Bits.getZExtValue();
5542 // Collect constant bits and insert into mask/undef bit masks.
5543 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5544 unsigned UndefBitIndex) {
5547 if (isa<UndefValue>(Cst)) {
5548 Undefs.setBit(UndefBitIndex);
5551 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5552 Mask = CInt->getValue();
5555 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5556 Mask = CFP->getValueAPF().bitcastToAPInt();
5564 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5565 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5566 return CastBitData(UndefSrcElts, SrcEltBits);
5569 // Extract scalar constant bits.
5570 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5571 APInt UndefSrcElts = APInt::getNullValue(1);
5572 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5573 return CastBitData(UndefSrcElts, SrcEltBits);
5575 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5576 APInt UndefSrcElts = APInt::getNullValue(1);
5577 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5578 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5579 return CastBitData(UndefSrcElts, SrcEltBits);
5582 // Extract constant bits from build vector.
5583 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5584 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5585 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5587 APInt UndefSrcElts(NumSrcElts, 0);
5588 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5589 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5590 const SDValue &Src = Op.getOperand(i);
5591 if (Src.isUndef()) {
5592 UndefSrcElts.setBit(i);
5595 auto *Cst = cast<ConstantSDNode>(Src);
5596 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5598 return CastBitData(UndefSrcElts, SrcEltBits);
5601 // Extract constant bits from constant pool vector.
5602 if (auto *Cst = getTargetConstantFromNode(Op)) {
5603 Type *CstTy = Cst->getType();
5604 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5607 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5608 unsigned NumSrcElts = CstTy->getVectorNumElements();
5610 APInt UndefSrcElts(NumSrcElts, 0);
5611 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5612 for (unsigned i = 0; i != NumSrcElts; ++i)
5613 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5617 return CastBitData(UndefSrcElts, SrcEltBits);
5620 // Extract constant bits from a broadcasted constant pool scalar.
5621 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5622 EltSizeInBits <= VT.getScalarSizeInBits()) {
5623 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5624 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5625 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5627 APInt UndefSrcElts(NumSrcElts, 0);
5628 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5629 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5630 if (UndefSrcElts[0])
5631 UndefSrcElts.setBits(0, NumSrcElts);
5632 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5633 return CastBitData(UndefSrcElts, SrcEltBits);
5638 // Extract a rematerialized scalar constant insertion.
5639 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5640 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5641 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5642 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5643 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5645 APInt UndefSrcElts(NumSrcElts, 0);
5646 SmallVector<APInt, 64> SrcEltBits;
5647 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5648 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5649 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5650 return CastBitData(UndefSrcElts, SrcEltBits);
5656 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5657 unsigned MaskEltSizeInBits,
5658 SmallVectorImpl<uint64_t> &RawMask) {
5660 SmallVector<APInt, 64> EltBits;
5662 // Extract the raw target constant bits.
5663 // FIXME: We currently don't support UNDEF bits or mask entries.
5664 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5665 EltBits, /* AllowWholeUndefs */ false,
5666 /* AllowPartialUndefs */ false))
5669 // Insert the extracted elements into the mask.
5670 for (APInt Elt : EltBits)
5671 RawMask.push_back(Elt.getZExtValue());
5676 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5677 /// Note: This ignores saturation, so inputs must be checked first.
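/// For example a binary pack of two v8i16 inputs into a v16i8 result is
/// modelled by the byte mask {0,2,4,...,14, 16,18,...,30}: the low byte of
/// every i16 element of the first input followed by those of the second,
/// lane by lane.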
5678 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5680 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5681 unsigned NumElts = VT.getVectorNumElements();
5682 unsigned NumLanes = VT.getSizeInBits() / 128;
5683 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5684 unsigned Offset = Unary ? 0 : NumElts;
5686 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5687 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5688 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5689 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5690 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5694 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5695 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5696 /// operands in \p Ops, and returns true.
5697 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5698 /// IsUnary for shuffles which use a single input multiple times, and in those
5699 /// cases it will adjust the mask to only have indices within that single input.
5700 /// It is an error to call this with non-empty Mask/Ops vectors.
5701 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5702 SmallVectorImpl<SDValue> &Ops,
5703 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5704 unsigned NumElems = VT.getVectorNumElements();
5707 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5708 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5711 bool IsFakeUnary = false;
5712 switch(N->getOpcode()) {
5713 case X86ISD::BLENDI:
5714 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5715 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5716 ImmN = N->getOperand(N->getNumOperands()-1);
5717 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5718 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5721 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5722 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5723 ImmN = N->getOperand(N->getNumOperands()-1);
5724 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5725 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5726 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5728 case X86ISD::INSERTPS:
5729 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5730 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5731 ImmN = N->getOperand(N->getNumOperands()-1);
5732 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5733 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5735 case X86ISD::EXTRQI:
5736 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5737 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5738 isa<ConstantSDNode>(N->getOperand(2))) {
5739 int BitLen = N->getConstantOperandVal(1);
5740 int BitIdx = N->getConstantOperandVal(2);
5741 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5746 case X86ISD::INSERTQI:
5747 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5748 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5749 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5750 isa<ConstantSDNode>(N->getOperand(3))) {
5751 int BitLen = N->getConstantOperandVal(2);
5752 int BitIdx = N->getConstantOperandVal(3);
5753 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5755 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5758 case X86ISD::UNPCKH:
5759 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5760 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5761 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5762 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5764 case X86ISD::UNPCKL:
5765 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5766 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5767 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5768 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5770 case X86ISD::MOVHLPS:
5771 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5772 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5773 DecodeMOVHLPSMask(NumElems, Mask);
5774 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5776 case X86ISD::MOVLHPS:
5777 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5778 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5779 DecodeMOVLHPSMask(NumElems, Mask);
5780 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5782 case X86ISD::PALIGNR:
5783 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5784 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5785 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5786 ImmN = N->getOperand(N->getNumOperands()-1);
5787 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5789 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5790 Ops.push_back(N->getOperand(1));
5791 Ops.push_back(N->getOperand(0));
5793 case X86ISD::VSHLDQ:
5794 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5795 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5796 ImmN = N->getOperand(N->getNumOperands() - 1);
5797 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5801 case X86ISD::VSRLDQ:
5802 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5803 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5804 ImmN = N->getOperand(N->getNumOperands() - 1);
5805 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5809 case X86ISD::PSHUFD:
5810 case X86ISD::VPERMILPI:
5811 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5812 ImmN = N->getOperand(N->getNumOperands()-1);
5813 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5814 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5817 case X86ISD::PSHUFHW:
5818 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5819 ImmN = N->getOperand(N->getNumOperands()-1);
5820 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5824 case X86ISD::PSHUFLW:
5825 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5826 ImmN = N->getOperand(N->getNumOperands()-1);
5827 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5831 case X86ISD::VZEXT_MOVL:
5832 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5833 DecodeZeroMoveLowMask(NumElems, Mask);
5836 case X86ISD::VBROADCAST: {
5837 SDValue N0 = N->getOperand(0);
5838 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5839 // add the pre-extracted value to the Ops vector.
5840 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5841 N0.getOperand(0).getValueType() == VT &&
5842 N0.getConstantOperandVal(1) == 0)
5843 Ops.push_back(N0.getOperand(0));
5845 // We only decode broadcasts of same-sized vectors, unless the broadcast
5846 // came from an extract from the original width. If we found one, we
5847 // pushed it onto the Ops vector above.
5848 if (N0.getValueType() == VT || !Ops.empty()) {
5849 DecodeVectorBroadcast(NumElems, Mask);
5855 case X86ISD::VPERMILPV: {
5856 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5858 SDValue MaskNode = N->getOperand(1);
5859 unsigned MaskEltSize = VT.getScalarSizeInBits();
5860 SmallVector<uint64_t, 32> RawMask;
5861 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5862 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5865 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5866 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5871 case X86ISD::PSHUFB: {
5872 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5873 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5874 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5876 SDValue MaskNode = N->getOperand(1);
5877 SmallVector<uint64_t, 32> RawMask;
5878 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5879 DecodePSHUFBMask(RawMask, Mask);
5882 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5883 DecodePSHUFBMask(C, Mask);
5888 case X86ISD::VPERMI:
5889 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5890 ImmN = N->getOperand(N->getNumOperands()-1);
5891 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5896 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5897 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5898 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5900 case X86ISD::VPERM2X128:
5901 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5902 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5903 ImmN = N->getOperand(N->getNumOperands()-1);
5904 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5906 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5908 case X86ISD::MOVSLDUP:
5909 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5910 DecodeMOVSLDUPMask(NumElems, Mask);
5913 case X86ISD::MOVSHDUP:
5914 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5915 DecodeMOVSHDUPMask(NumElems, Mask);
5918 case X86ISD::MOVDDUP:
5919 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5920 DecodeMOVDDUPMask(NumElems, Mask);
5923 case X86ISD::MOVLPD:
5924 case X86ISD::MOVLPS:
5925 // Not yet implemented
5927 case X86ISD::VPERMIL2: {
5928 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5929 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5930 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5931 unsigned MaskEltSize = VT.getScalarSizeInBits();
5932 SDValue MaskNode = N->getOperand(2);
5933 SDValue CtrlNode = N->getOperand(3);
5934 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5935 unsigned CtrlImm = CtrlOp->getZExtValue();
5936 SmallVector<uint64_t, 32> RawMask;
5937 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5938 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5942 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5943 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5949 case X86ISD::VPPERM: {
5950 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5951 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5952 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5953 SDValue MaskNode = N->getOperand(2);
5954 SmallVector<uint64_t, 32> RawMask;
5955 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5956 DecodeVPPERMMask(RawMask, Mask);
5959 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5960 DecodeVPPERMMask(C, Mask);
5965 case X86ISD::VPERMV: {
5966 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5968 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5969 Ops.push_back(N->getOperand(1));
5970 SDValue MaskNode = N->getOperand(0);
5971 SmallVector<uint64_t, 32> RawMask;
5972 unsigned MaskEltSize = VT.getScalarSizeInBits();
5973 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5974 DecodeVPERMVMask(RawMask, Mask);
5977 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5978 DecodeVPERMVMask(C, MaskEltSize, Mask);
5983 case X86ISD::VPERMV3: {
5984 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5985 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5986 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5987 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5988 Ops.push_back(N->getOperand(0));
5989 Ops.push_back(N->getOperand(2));
5990 SDValue MaskNode = N->getOperand(1);
5991 unsigned MaskEltSize = VT.getScalarSizeInBits();
5992 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5993 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5998 default: llvm_unreachable("unknown target shuffle node");
6001 // Empty mask indicates the decode failed.
6005 // Check if we're getting a shuffle mask with zero'd elements.
6006 if (!AllowSentinelZero)
6007 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6010 // If we have a fake unary shuffle, the shuffle mask is spread across two
6011 // inputs that are actually the same node. Re-map the mask to always point
6012 // into the first input.
6015 if (M >= (int)Mask.size())
6018 // If we didn't already add operands in the opcode-specific code, default to
6019 // adding 1 or 2 operands starting at 0.
6021 Ops.push_back(N->getOperand(0));
6022 if (!IsUnary || IsFakeUnary)
6023 Ops.push_back(N->getOperand(1));
6029 /// Check a target shuffle mask's inputs to see if we can set any values to
6030 /// SM_SentinelZero - this is for elements that are known to be zero
6031 /// (not just zeroable) from their inputs.
6032 /// Returns true if the target shuffle mask was decoded.
6033 static bool setTargetShuffleZeroElements(SDValue N,
6034 SmallVectorImpl<int> &Mask,
6035 SmallVectorImpl<SDValue> &Ops) {
6037 if (!isTargetShuffle(N.getOpcode()))
6040 MVT VT = N.getSimpleValueType();
6041 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6044 SDValue V1 = Ops[0];
6045 SDValue V2 = IsUnary ? V1 : Ops[1];
6047 V1 = peekThroughBitcasts(V1);
6048 V2 = peekThroughBitcasts(V2);
6050 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6051 "Illegal split of shuffle value type");
6052 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6054 // Extract known constant input data.
6055 APInt UndefSrcElts[2];
6056 SmallVector<APInt, 32> SrcEltBits[2];
6057 bool IsSrcConstant[2] = {
6058 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6059 SrcEltBits[0], true, false),
6060 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6061 SrcEltBits[1], true, false)};
6063 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6066 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6070 // Determine shuffle input and normalize the mask.
6071 unsigned SrcIdx = M / Size;
6072 SDValue V = M < Size ? V1 : V2;
6075 // We are referencing an UNDEF input.
6077 Mask[i] = SM_SentinelUndef;
6081 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6082 // TODO: We currently only set UNDEF for integer types - floats use the same
6083 // registers as vectors and many of the scalar folded loads rely on the
6084 // SCALAR_TO_VECTOR pattern.
6085 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6086 (Size % V.getValueType().getVectorNumElements()) == 0) {
6087 int Scale = Size / V.getValueType().getVectorNumElements();
6088 int Idx = M / Scale;
6089 if (Idx != 0 && !VT.isFloatingPoint())
6090 Mask[i] = SM_SentinelUndef;
6091 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6092 Mask[i] = SM_SentinelZero;
6096 // Attempt to extract from the source's constant bits.
6097 if (IsSrcConstant[SrcIdx]) {
6098 if (UndefSrcElts[SrcIdx][M])
6099 Mask[i] = SM_SentinelUndef;
6100 else if (SrcEltBits[SrcIdx][M] == 0)
6101 Mask[i] = SM_SentinelZero;
6105 assert(VT.getVectorNumElements() == Mask.size() &&
6106 "Different mask size from vector size!");
6110 // Attempt to decode ops that could be represented as a shuffle mask.
6111 // The decoded shuffle mask may contain a different number of elements than the
6112 // destination value type.
6113 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6114 SmallVectorImpl<SDValue> &Ops,
6115 const SelectionDAG &DAG) {
6119 MVT VT = N.getSimpleValueType();
6120 unsigned NumElts = VT.getVectorNumElements();
6121 unsigned NumSizeInBits = VT.getSizeInBits();
6122 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6123 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6124 "Expected byte aligned value types");
6126 unsigned Opcode = N.getOpcode();
6128 case ISD::VECTOR_SHUFFLE: {
6129 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6130 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6131 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6132 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6133 Ops.push_back(N.getOperand(0));
6134 Ops.push_back(N.getOperand(1));
6140 case X86ISD::ANDNP: {
6141 // Attempt to decode as a per-byte mask.
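// For example (illustrative, assuming a constant byte mask on the non-decoded
// operand): an AND with the byte mask <0xFF,0x00,0xFF,0xFF> decodes to the
// byte shuffle <0, Z, 2, 3>, where Z denotes SM_SentinelZero.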
6143 SmallVector<APInt, 32> EltBits;
6144 SDValue N0 = N.getOperand(0);
6145 SDValue N1 = N.getOperand(1);
6146 bool IsAndN = (X86ISD::ANDNP == Opcode);
6147 uint64_t ZeroMask = IsAndN ? 255 : 0;
6148 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6150 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6152 Mask.push_back(SM_SentinelUndef);
6155 uint64_t ByteBits = EltBits[i].getZExtValue();
6156 if (ByteBits != 0 && ByteBits != 255)
6158 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6160 Ops.push_back(IsAndN ? N1 : N0);
6163 case ISD::SCALAR_TO_VECTOR: {
6164 // Match against a scalar_to_vector of an extract from a vector;
6165 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6166 SDValue N0 = N.getOperand(0);
6169 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6170 N0.getOperand(0).getValueType() == VT) ||
6171 (N0.getOpcode() == X86ISD::PEXTRW &&
6172 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6173 (N0.getOpcode() == X86ISD::PEXTRB &&
6174 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6178 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6181 SDValue SrcVec = SrcExtract.getOperand(0);
6182 EVT SrcVT = SrcVec.getValueType();
6183 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6184 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6186 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6187 if (NumSrcElts <= SrcIdx)
6190 Ops.push_back(SrcVec);
6191 Mask.push_back(SrcIdx);
6192 Mask.append(NumZeros, SM_SentinelZero);
6193 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6196 case X86ISD::PINSRB:
6197 case X86ISD::PINSRW: {
6198 SDValue InVec = N.getOperand(0);
6199 SDValue InScl = N.getOperand(1);
6200 SDValue InIndex = N.getOperand(2);
6201 if (!isa<ConstantSDNode>(InIndex) ||
6202 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6204 uint64_t InIdx = N.getConstantOperandVal(2);
6206 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6207 if (X86::isZeroNode(InScl)) {
6208 Ops.push_back(InVec);
6209 for (unsigned i = 0; i != NumElts; ++i)
6210 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6214 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6215 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
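// For example, PINSRW(X, PEXTRW(Y, 3), 1) on v8i16 decodes to the two-input
// shuffle mask <0, 8+3, 2, 3, 4, 5, 6, 7> with inputs { X, Y }.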
6217 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6218 if (InScl.getOpcode() != ExOp)
6221 SDValue ExVec = InScl.getOperand(0);
6222 SDValue ExIndex = InScl.getOperand(1);
6223 if (!isa<ConstantSDNode>(ExIndex) ||
6224 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6226 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6228 Ops.push_back(InVec);
6229 Ops.push_back(ExVec);
6230 for (unsigned i = 0; i != NumElts; ++i)
6231 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6234 case X86ISD::PACKSS:
6235 case X86ISD::PACKUS: {
6236 SDValue N0 = N.getOperand(0);
6237 SDValue N1 = N.getOperand(1);
6238 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6239 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6240 "Unexpected input value type");
6242 // If we know input saturation won't happen we can treat this
6243 // as a truncation shuffle.
6244 if (Opcode == X86ISD::PACKSS) {
6245 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6246 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6249 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6250 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6251 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6255 bool IsUnary = (N0 == N1);
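// e.g. when saturation cannot occur, a 128-bit PACKSSDW behaves like a
// truncation that keeps the low i16 half of every i32 input element, i.e.
// the v8i16 shuffle mask <0, 2, 4, 6, 8, 10, 12, 14> over the two inputs
// (applied per 128-bit lane for wider types).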
6261 createPackShuffleMask(VT, Mask, IsUnary);
6265 case X86ISD::VSRLI: {
6266 uint64_t ShiftVal = N.getConstantOperandVal(1);
6267 // Out of range bit shifts are guaranteed to be zero.
6268 if (NumBitsPerElt <= ShiftVal) {
6269 Mask.append(NumElts, SM_SentinelZero);
6273 // We can only decode 'whole byte' bit shifts as shuffles.
6274 if ((ShiftVal % 8) != 0)
6277 uint64_t ByteShift = ShiftVal / 8;
6278 unsigned NumBytes = NumSizeInBits / 8;
6279 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6280 Ops.push_back(N.getOperand(0));
6282 // Clear mask to all zeros and insert the shifted byte indices.
6283 Mask.append(NumBytes, SM_SentinelZero);
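// e.g. a v2i64 VSRLI by 16 bits has ByteShift == 2 and produces the byte
// mask <2,3,4,5,6,7,Z,Z, 10,11,12,13,14,15,Z,Z>, where Z is SM_SentinelZero.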
6285 if (X86ISD::VSHLI == Opcode) {
6286 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6287 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6288 Mask[i + j] = i + j - ByteShift;
6290 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6291 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6292 Mask[i + j - ByteShift] = i + j;
6296 case ISD::ZERO_EXTEND_VECTOR_INREG:
6297 case X86ISD::VZEXT: {
6298 // TODO - add support for VPMOVZX with smaller input vector types.
6299 SDValue Src = N.getOperand(0);
6300 MVT SrcVT = Src.getSimpleValueType();
6301 if (NumSizeInBits != SrcVT.getSizeInBits())
6303 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6304 VT.getVectorNumElements(), Mask);
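// e.g. a v8i16 -> v4i32 zero extension decodes to the source-granularity
// mask <0,Z,1,Z,2,Z,3,Z>, where Z marks a known-zero element.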
6313 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6314 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6315 SmallVectorImpl<int> &Mask) {
6316 int MaskWidth = Mask.size();
6317 SmallVector<SDValue, 16> UsedInputs;
6318 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6319 int lo = UsedInputs.size() * MaskWidth;
6320 int hi = lo + MaskWidth;
6322 // Strip UNDEF input usage.
6323 if (Inputs[i].isUndef())
6325 if ((lo <= M) && (M < hi))
6326 M = SM_SentinelUndef;
6328 // Check for unused inputs.
6329 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6330 UsedInputs.push_back(Inputs[i]);
6337 Inputs = UsedInputs;
6340 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6341 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6342 /// remaining input indices in case we now have a unary shuffle and adjusts the
6343 /// inputs accordingly.
6344 /// Returns true if the target shuffle mask was decoded.
6345 static bool resolveTargetShuffleInputs(SDValue Op,
6346 SmallVectorImpl<SDValue> &Inputs,
6347 SmallVectorImpl<int> &Mask,
6348 const SelectionDAG &DAG) {
6349 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6350 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6353 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6357 /// Returns the scalar element that will make up the ith
6358 /// element of the result of the vector shuffle.
6359 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6362 return SDValue(); // Limit search depth.
6364 SDValue V = SDValue(N, 0);
6365 EVT VT = V.getValueType();
6366 unsigned Opcode = V.getOpcode();
6368 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6369 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6370 int Elt = SV->getMaskElt(Index);
6373 return DAG.getUNDEF(VT.getVectorElementType());
6375 unsigned NumElems = VT.getVectorNumElements();
6376 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6377 : SV->getOperand(1);
6378 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6381 // Recurse into target specific vector shuffles to find scalars.
6382 if (isTargetShuffle(Opcode)) {
6383 MVT ShufVT = V.getSimpleValueType();
6384 MVT ShufSVT = ShufVT.getVectorElementType();
6385 int NumElems = (int)ShufVT.getVectorNumElements();
6386 SmallVector<int, 16> ShuffleMask;
6387 SmallVector<SDValue, 16> ShuffleOps;
6390 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6393 int Elt = ShuffleMask[Index];
6394 if (Elt == SM_SentinelZero)
6395 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6396 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6397 if (Elt == SM_SentinelUndef)
6398 return DAG.getUNDEF(ShufSVT);
6400 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6401 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6402 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6406 // Actual nodes that may contain scalar elements
6407 if (Opcode == ISD::BITCAST) {
6408 V = V.getOperand(0);
6409 EVT SrcVT = V.getValueType();
6410 unsigned NumElems = VT.getVectorNumElements();
6412 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6416 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6417 return (Index == 0) ? V.getOperand(0)
6418 : DAG.getUNDEF(VT.getVectorElementType());
6420 if (V.getOpcode() == ISD::BUILD_VECTOR)
6421 return V.getOperand(Index);
6426 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6427 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6428 unsigned NumNonZero, unsigned NumZero,
6430 const X86Subtarget &Subtarget) {
6431 MVT VT = Op.getSimpleValueType();
6432 unsigned NumElts = VT.getVectorNumElements();
6433 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6434 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6435 "Illegal vector insertion");
6441 for (unsigned i = 0; i < NumElts; ++i) {
6442 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6446 // If the build vector contains zeros or our first insertion is not the
6447 // first index, then insert into a zero vector to break any register
6448 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6451 if (NumZero || 0 != i)
6452 V = getZeroVector(VT, Subtarget, DAG, dl);
6454 assert(0 == i && "Expected insertion into zero-index");
6455 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6456 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6457 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6458 V = DAG.getBitcast(VT, V);
6462 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6463 DAG.getIntPtrConstant(i, dl));
6469 /// Custom lower build_vector of v16i8.
6470 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6471 unsigned NumNonZero, unsigned NumZero,
6473 const X86Subtarget &Subtarget) {
6474 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6477 // SSE4.1 - use PINSRB to insert each byte directly.
6478 if (Subtarget.hasSSE41())
6479 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6486 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
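// Bytes are combined in pairs: the odd byte (i) is zero-extended, shifted
// left by 8 and, when the even byte (i-1) is non-zero, OR'd with its
// zero-extended value; the resulting i16 is inserted at word index i/2.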
6487 for (unsigned i = 0; i < 16; ++i) {
6488 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6489 if (ThisIsNonZero && First) {
6491 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6493 V = DAG.getUNDEF(MVT::v8i16);
6498 // FIXME: Investigate extending to i32 instead of just i16.
6499 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6500 SDValue ThisElt, LastElt;
6501 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6502 if (LastIsNonZero) {
6504 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6506 if (ThisIsNonZero) {
6507 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6508 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6509 DAG.getConstant(8, dl, MVT::i8));
6511 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6517 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6518 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6519 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6520 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6521 V = DAG.getBitcast(MVT::v8i16, V);
6523 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6524 DAG.getIntPtrConstant(i / 2, dl));
6530 return DAG.getBitcast(MVT::v16i8, V);
6533 /// Custom lower build_vector of v8i16.
6534 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6535 unsigned NumNonZero, unsigned NumZero,
6537 const X86Subtarget &Subtarget) {
6538 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6541 // Use PINSRW to insert each element directly.
6542 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6546 /// Custom lower build_vector of v4i32 or v4f32.
6547 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6548 const X86Subtarget &Subtarget) {
6549 // Find all zeroable elements.
6550 std::bitset<4> Zeroable;
6551 for (int i=0; i < 4; ++i) {
6552 SDValue Elt = Op->getOperand(i);
6553 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6555 assert(Zeroable.size() - Zeroable.count() > 1 &&
6556 "We expect at least two non-zero elements!");
6558 // We only know how to deal with build_vector nodes where elements are either
6559 // zeroable or extract_vector_elt with constant index.
6560 SDValue FirstNonZero;
6561 unsigned FirstNonZeroIdx;
6562 for (unsigned i=0; i < 4; ++i) {
6565 SDValue Elt = Op->getOperand(i);
6566 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6567 !isa<ConstantSDNode>(Elt.getOperand(1)))
6569 // Make sure that this node is extracting from a 128-bit vector.
6570 MVT VT = Elt.getOperand(0).getSimpleValueType();
6571 if (!VT.is128BitVector())
6573 if (!FirstNonZero.getNode()) {
6575 FirstNonZeroIdx = i;
6579 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6580 SDValue V1 = FirstNonZero.getOperand(0);
6581 MVT VT = V1.getSimpleValueType();
6583 // See if this build_vector can be lowered as a blend with zero.
6585 unsigned EltMaskIdx, EltIdx;
6587 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6588 if (Zeroable[EltIdx]) {
6589 // The zero vector will be on the right hand side.
6590 Mask[EltIdx] = EltIdx+4;
6594 Elt = Op->getOperand(EltIdx);
6595 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6596 EltMaskIdx = Elt.getConstantOperandVal(1);
6597 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6599 Mask[EltIdx] = EltIdx;
6603 // Let the shuffle legalizer deal with blend operations.
6604 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6605 if (V1.getSimpleValueType() != VT)
6606 V1 = DAG.getBitcast(VT, V1);
6607 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6610 // See if we can lower this build_vector to an INSERTPS.
6611 if (!Subtarget.hasSSE41())
6614 SDValue V2 = Elt.getOperand(0);
6615 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6618 bool CanFold = true;
6619 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6623 SDValue Current = Op->getOperand(i);
6624 SDValue SrcVector = Current->getOperand(0);
6627 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6633 assert(V1.getNode() && "Expected at least two non-zero elements!");
6634 if (V1.getSimpleValueType() != MVT::v4f32)
6635 V1 = DAG.getBitcast(MVT::v4f32, V1);
6636 if (V2.getSimpleValueType() != MVT::v4f32)
6637 V2 = DAG.getBitcast(MVT::v4f32, V2);
6639 // Ok, we can emit an INSERTPS instruction.
6640 unsigned ZMask = Zeroable.to_ulong();
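// INSERTPS imm8 layout: bits [7:6] select the source element, bits [5:4]
// select the destination slot, and bits [3:0] are the zero mask.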
6642 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6643 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6645 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6646 DAG.getIntPtrConstant(InsertPSMask, DL));
6647 return DAG.getBitcast(VT, Result);
6650 /// Return a vector logical shift node.
6651 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6652 SelectionDAG &DAG, const TargetLowering &TLI,
6654 assert(VT.is128BitVector() && "Unknown type for VShift");
6655 MVT ShVT = MVT::v16i8;
6656 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6657 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6658 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6659 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
6660 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6663 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6664 SelectionDAG &DAG) {
6666 // Check if the scalar load can be widened into a vector load, and if
6667 // the address is "base + cst", see if the cst can be "absorbed" into
6668 // the shuffle mask.
6669 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6670 SDValue Ptr = LD->getBasePtr();
6671 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6673 EVT PVT = LD->getValueType(0);
6674 if (PVT != MVT::i32 && PVT != MVT::f32)
6679 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6680 FI = FINode->getIndex();
6682 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6683 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6684 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6685 Offset = Ptr.getConstantOperandVal(1);
6686 Ptr = Ptr.getOperand(0);
6691 // FIXME: 256-bit vector instructions don't require a strict alignment;
6692 // improve this code to support it better.
6693 unsigned RequiredAlign = VT.getSizeInBits()/8;
6694 SDValue Chain = LD->getChain();
6695 // Make sure the stack object alignment is at least 16 or 32.
6696 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6697 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6698 if (MFI.isFixedObjectIndex(FI)) {
6699 // Can't change the alignment. FIXME: It's possible to compute
6700 // the exact stack offset and reference FI + adjust offset instead,
6701 // if someone *really* cares about this. That's the way to implement it.
6704 MFI.setObjectAlignment(FI, RequiredAlign);
6708 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6709 // Ptr + (Offset & ~15).
6712 if ((Offset % RequiredAlign) & 3)
6714 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
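// e.g. with RequiredAlign == 16 and Offset == 20, StartOffset == 16 and the
// splatted element below is EltNo == (20 - 16) >> 2 == 1.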
6717 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6718 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6721 int EltNo = (Offset - StartOffset) >> 2;
6722 unsigned NumElems = VT.getVectorNumElements();
6724 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6725 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6726 LD->getPointerInfo().getWithOffset(StartOffset));
6728 SmallVector<int, 8> Mask(NumElems, EltNo);
6730 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6736 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6737 /// elements can be replaced by a single large load which has the same value as
6738 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6740 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6741 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6742 const SDLoc &DL, SelectionDAG &DAG,
6743 const X86Subtarget &Subtarget,
6744 bool isAfterLegalize) {
6745 unsigned NumElems = Elts.size();
6747 int LastLoadedElt = -1;
6748 SmallBitVector LoadMask(NumElems, false);
6749 SmallBitVector ZeroMask(NumElems, false);
6750 SmallBitVector UndefMask(NumElems, false);
6752 // For each element in the initializer, see if we've found a load, zero or an undef.
6754 for (unsigned i = 0; i < NumElems; ++i) {
6755 SDValue Elt = peekThroughBitcasts(Elts[i]);
6760 UndefMask[i] = true;
6761 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6763 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6766 // Each loaded element must be the correct fractional portion of the
6767 // requested vector load.
6768 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6773 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6774 "Incomplete element masks");
6776 // Handle Special Cases - all undef or undef/zero.
6777 if (UndefMask.count() == NumElems)
6778 return DAG.getUNDEF(VT);
6780 // FIXME: Should we return this as a BUILD_VECTOR instead?
6781 if ((ZeroMask | UndefMask).count() == NumElems)
6782 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6783 : DAG.getConstantFP(0.0, DL, VT);
6785 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6786 int FirstLoadedElt = LoadMask.find_first();
6787 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6788 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6789 EVT LDBaseVT = EltBase.getValueType();
6791 // Consecutive loads can contain UNDEFs but not ZERO elements.
6792 // Consecutive loads with UNDEF and ZERO elements require an
6793 // additional shuffle stage to clear the ZERO elements.
6794 bool IsConsecutiveLoad = true;
6795 bool IsConsecutiveLoadWithZeros = true;
6796 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6798 SDValue Elt = peekThroughBitcasts(Elts[i]);
6799 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6800 if (!DAG.areNonVolatileConsecutiveLoads(
6801 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6802 i - FirstLoadedElt)) {
6803 IsConsecutiveLoad = false;
6804 IsConsecutiveLoadWithZeros = false;
6807 } else if (ZeroMask[i]) {
6808 IsConsecutiveLoad = false;
6812 SmallVector<LoadSDNode *, 8> Loads;
6813 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6815 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6817 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6818 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6819 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6820 "Cannot merge volatile loads.");
6822 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6823 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6824 for (auto *LD : Loads)
6825 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6829 // LOAD - all consecutive load/undefs (must start/end with a load).
6830 // If we have found an entire vector of loads and undefs, then return a large
6831 // load of the entire vector width starting at the base pointer.
6832 // If the vector contains zeros, then attempt to shuffle those elements.
6833 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6834 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6835 assert(LDBase && "Did not find base load for merging consecutive loads");
6836 EVT EltVT = LDBase->getValueType(0);
6837 // Ensure that the input vector size for the merged loads matches the
6838 // cumulative size of the input elements.
6839 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6842 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6845 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6846 // will lower to regular temporal loads and use the cache.
6847 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6848 VT.is256BitVector() && !Subtarget.hasInt256())
6851 if (IsConsecutiveLoad)
6852 return CreateLoad(VT, LDBase);
6854 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6855 // vector and a zero vector to clear out the zero elements.
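// e.g. for NumElems == 4 with a zero in element 2, ClearMask becomes
// <0, 1, 6, 3>, selecting element 2 from the all-zeros vector.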
6856 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6857 SmallVector<int, 4> ClearMask(NumElems, -1);
6858 for (unsigned i = 0; i < NumElems; ++i) {
6860 ClearMask[i] = i + NumElems;
6861 else if (LoadMask[i])
6864 SDValue V = CreateLoad(VT, LDBase);
6865 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6866 : DAG.getConstantFP(0.0, DL, VT);
6867 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6872 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6874 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6875 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6876 (LoadSize == 32 || LoadSize == 64) &&
6877 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6878 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6879 : MVT::getIntegerVT(LoadSize);
6880 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6881 if (TLI.isTypeLegal(VecVT)) {
6882 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6883 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6885 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6886 LDBase->getPointerInfo(),
6887 LDBase->getAlignment(),
6888 MachineMemOperand::MOLoad);
6889 for (auto *LD : Loads)
6890 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6891 return DAG.getBitcast(VT, ResNode);
6898 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6899 unsigned SplatBitSize, LLVMContext &C) {
6900 unsigned ScalarSize = VT.getScalarSizeInBits();
6901 unsigned NumElm = SplatBitSize / ScalarSize;
6903 SmallVector<Constant *, 32> ConstantVec;
6904 for (unsigned i = 0; i < NumElm; i++) {
6905 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6907 if (VT.isFloatingPoint()) {
6908 if (ScalarSize == 32) {
6909 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6911 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6912 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6915 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6916 ConstantVec.push_back(Const);
6918 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6921 static bool isUseOfShuffle(SDNode *N) {
6922 for (auto *U : N->uses()) {
6923 if (isTargetShuffle(U->getOpcode()))
6925 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6926 return isUseOfShuffle(U);
6931 // Check if the current node of the build vector is a zero-extended vector.
6932 // If so, return the value extended.
6933 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6934 // NumElt - return the number of zero-extended identical values.
6935 // EltType - return the type of the value, including the zero extend.
6936 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6937 unsigned &NumElt, MVT &EltType) {
6938 SDValue ExtValue = Op->getOperand(0);
6939 unsigned NumElts = Op->getNumOperands();
6940 unsigned Delta = NumElts;
6942 for (unsigned i = 1; i < NumElts; i++) {
6943 if (Op->getOperand(i) == ExtValue) {
6947 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6950 if (!isPowerOf2_32(Delta) || Delta == 1)
6953 for (unsigned i = Delta; i < NumElts; i++) {
6954 if (i % Delta == 0) {
6955 if (Op->getOperand(i) != ExtValue)
6957 } else if (!(isNullConstant(Op->getOperand(i)) ||
6958 Op->getOperand(i).isUndef()))
6961 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6962 unsigned ExtVTSize = EltSize * Delta;
6963 EltType = MVT::getIntegerVT(ExtVTSize);
6964 NumElt = NumElts / Delta;
6968 /// Attempt to use the vbroadcast instruction to generate a splat value
6969 /// from a splat BUILD_VECTOR which uses:
6970 /// a. A single scalar load, or a constant.
6971 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6973 /// The VBROADCAST node is returned when a pattern is found,
6974 /// or SDValue() otherwise.
6975 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6976 const X86Subtarget &Subtarget,
6977 SelectionDAG &DAG) {
6978 // VBROADCAST requires AVX.
6979 // TODO: Splats could be generated for non-AVX CPUs using SSE
6980 // instructions, but there's less potential gain for only 128-bit vectors.
6981 if (!Subtarget.hasAVX())
6984 MVT VT = BVOp->getSimpleValueType(0);
6987 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6988 "Unsupported vector type for broadcast.");
6990 BitVector UndefElements;
6991 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6993 // Attempt to use VBROADCASTM
6994 // From this pattern:
6995 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6996 // b. t1 = (build_vector t0 t0)
6998 // Create (VBROADCASTM v2i1 X)
6999 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7000 MVT EltType = VT.getScalarType();
7001 unsigned NumElts = VT.getVectorNumElements();
7003 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
7004 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7005 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7006 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7008 BOperand = ZeroExtended.getOperand(0);
7010 BOperand = Ld.getOperand(0).getOperand(0);
7011 MVT MaskVT = BOperand.getSimpleValueType();
7012 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7013 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7015 DAG.getNode(X86ISD::VBROADCASTM, dl,
7016 MVT::getVectorVT(EltType, NumElts), BOperand);
7017 return DAG.getBitcast(VT, Brdcst);
7022 // We need a splat of a single value to use broadcast, and it doesn't
7023 // make any sense if the value is only in one element of the vector.
7024 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
7025 APInt SplatValue, Undef;
7026 unsigned SplatBitSize;
7028 // Check if this is a repeated constant pattern suitable for broadcasting.
7029 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7030 SplatBitSize > VT.getScalarSizeInBits() &&
7031 SplatBitSize < VT.getSizeInBits()) {
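// e.g. a v8i32 build_vector repeating the pair <0, 1> is a constant splat
// with SplatBitSize == 64, wider than the 32-bit scalar but narrower than
// the 256-bit vector.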
7032 // Avoid replacing with broadcast when it's a use of a shuffle
7033 // instruction to preserve the present custom lowering of shuffles.
7034 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7036 // replace BUILD_VECTOR with broadcast of the repeated constants.
7037 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7038 LLVMContext *Ctx = DAG.getContext();
7039 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7040 if (Subtarget.hasAVX()) {
7041 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7042 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7043 // Splatted value can fit in one INTEGER constant in constant pool.
7044 // Load the constant and broadcast it.
7045 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7046 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7047 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7048 SDValue CP = DAG.getConstantPool(C, PVT);
7049 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7051 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7053 CVT, dl, DAG.getEntryNode(), CP,
7054 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7056 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7057 MVT::getVectorVT(CVT, Repeat), Ld);
7058 return DAG.getBitcast(VT, Brdcst);
7059 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7060 // Splatted value can fit in one FLOAT constant in constant pool.
7061 // Load the constant and broadcast it.
7062 // AVX has support for 32 and 64 bit broadcasts for floats only.
7063 // No 64bit integer in 32bit subtarget.
7064 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7065 // Lower the splat via APFloat directly, to avoid any conversion.
7068 ? ConstantFP::get(*Ctx,
7069 APFloat(APFloat::IEEEsingle(), SplatValue))
7070 : ConstantFP::get(*Ctx,
7071 APFloat(APFloat::IEEEdouble(), SplatValue));
7072 SDValue CP = DAG.getConstantPool(C, PVT);
7073 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7075 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7077 CVT, dl, DAG.getEntryNode(), CP,
7078 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7080 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7081 MVT::getVectorVT(CVT, Repeat), Ld);
7082 return DAG.getBitcast(VT, Brdcst);
7083 } else if (SplatBitSize > 64) {
7084 // Load the vector of constants and broadcast it.
7085 MVT CVT = VT.getScalarType();
7086 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7088 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7089 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7090 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7092 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7093 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7095 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7096 return DAG.getBitcast(VT, Brdcst);
7103 bool ConstSplatVal =
7104 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7106 // Make sure that all of the users of a non-constant load are from the
7107 // BUILD_VECTOR node.
7108 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7111 unsigned ScalarSize = Ld.getValueSizeInBits();
7112 bool IsGE256 = (VT.getSizeInBits() >= 256);
7114 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7115 // instruction to save 8 or more bytes of constant pool data.
7116 // TODO: If multiple splats are generated to load the same constant,
7117 // it may be detrimental to overall size. There needs to be a way to detect
7118 // that condition to know if this is truly a size win.
7119 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7121 // Handle broadcasting a single constant scalar from the constant pool
7123 // On Sandybridge (no AVX2), it is still better to load a constant vector
7124 // from the constant pool and not to broadcast it from a scalar.
7125 // But override that restriction when optimizing for size.
7126 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7127 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7128 EVT CVT = Ld.getValueType();
7129 assert(!CVT.isVector() && "Must not broadcast a vector type");
7131 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7132 // For size optimization, also splat v2f64 and v2i64, and for size opt
7133 // with AVX2, also splat i8 and i16.
7134 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7135 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7136 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7137 const Constant *C = nullptr;
7138 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7139 C = CI->getConstantIntValue();
7140 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7141 C = CF->getConstantFPValue();
7143 assert(C && "Invalid constant type");
7145 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7147 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7148 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7150 CVT, dl, DAG.getEntryNode(), CP,
7151 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7154 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7158 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7160 // Handle AVX2 in-register broadcasts.
7161 if (!IsLoad && Subtarget.hasInt256() &&
7162 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7163 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7165 // The scalar source must be a normal load.
7169 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7170 (Subtarget.hasVLX() && ScalarSize == 64))
7171 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7173 // The integer check is needed for the 64-bit-into-128-bit case so it doesn't
7174 // match double, since there is no vbroadcastsd xmm instruction.
7175 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7176 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7177 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7180 // Unsupported broadcast.
7184 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7185 /// underlying vector and index.
7187 /// Modifies \p ExtractedFromVec to the real vector and returns the real index.
7189 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7191 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7192 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7195 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7197 // (extract_vector_elt (v8f32 %1), Constant<6>)
7199 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7200 // (extract_subvector (v8f32 %0), Constant<4>),
7203 // In this case the vector is the extract_subvector expression and the index
7204 // is 2, as specified by the shuffle.
7205 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7206 SDValue ShuffleVec = SVOp->getOperand(0);
7207 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7208 assert(ShuffleVecVT.getVectorElementType() ==
7209 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7211 int ShuffleIdx = SVOp->getMaskElt(Idx);
7212 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7213 ExtractedFromVec = ShuffleVec;
7219 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7220 MVT VT = Op.getSimpleValueType();
7222 // Skip if insert_vec_elt is not supported.
7223 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7224 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7228 unsigned NumElems = Op.getNumOperands();
7232 SmallVector<unsigned, 4> InsertIndices;
7233 SmallVector<int, 8> Mask(NumElems, -1);
7235 for (unsigned i = 0; i != NumElems; ++i) {
7236 unsigned Opc = Op.getOperand(i).getOpcode();
7238 if (Opc == ISD::UNDEF)
7241 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7242 // Quit if more than 1 element needs inserting.
7243 if (InsertIndices.size() > 1)
7246 InsertIndices.push_back(i);
7250 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7251 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7253 // Quit if non-constant index.
7254 if (!isa<ConstantSDNode>(ExtIdx))
7256 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7258 // Quit if extracted from vector of different type.
7259 if (ExtractedFromVec.getValueType() != VT)
7262 if (!VecIn1.getNode())
7263 VecIn1 = ExtractedFromVec;
7264 else if (VecIn1 != ExtractedFromVec) {
7265 if (!VecIn2.getNode())
7266 VecIn2 = ExtractedFromVec;
7267 else if (VecIn2 != ExtractedFromVec)
7268 // Quit if more than 2 vectors to shuffle
7272 if (ExtractedFromVec == VecIn1)
7274 else if (ExtractedFromVec == VecIn2)
7275 Mask[i] = Idx + NumElems;
7278 if (!VecIn1.getNode())
7281 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7282 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7284 for (unsigned Idx : InsertIndices)
7285 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7286 DAG.getIntPtrConstant(Idx, DL));
7291 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7292 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7293 Op.getScalarValueSizeInBits() == 1 &&
7294 "Can not convert non-constant vector");
7295 uint64_t Immediate = 0;
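// e.g. the v8i1 build_vector <1,0,1,1,0,0,0,0> packs into the i8 immediate
// 0b00001101 (bit 0 holds element 0).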
7296 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7297 SDValue In = Op.getOperand(idx);
7299 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7302 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7303 return DAG.getConstant(Immediate, dl, VT);
7305 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7306 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7307 const X86Subtarget &Subtarget) {
7309 MVT VT = Op.getSimpleValueType();
7310 assert((VT.getVectorElementType() == MVT::i1) &&
7311 "Unexpected type in LowerBUILD_VECTORvXi1!");
7314 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7317 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7320 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7321 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7322 // Split the pieces.
7324 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7326 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7327 // We have to manually lower both halves so getNode doesn't try to
7328 // reassemble the build_vector.
7329 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7330 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7331 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7333 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7334 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7335 return DAG.getBitcast(VT, Imm);
7336 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7338 DAG.getIntPtrConstant(0, dl));
7341 // Vector has one or more non-const elements
7342 uint64_t Immediate = 0;
7343 SmallVector<unsigned, 16> NonConstIdx;
7344 bool IsSplat = true;
7345 bool HasConstElts = false;
7347 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7348 SDValue In = Op.getOperand(idx);
7351 if (!isa<ConstantSDNode>(In))
7352 NonConstIdx.push_back(idx);
7354 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7355 HasConstElts = true;
7359 else if (In != Op.getOperand(SplatIdx))
7363 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7365 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7366 DAG.getConstant(1, dl, VT),
7367 DAG.getConstant(0, dl, VT));
7369 // insert elements one by one
7373 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7374 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7376 else if (HasConstElts)
7377 Imm = DAG.getConstant(0, dl, VT);
7379 Imm = DAG.getUNDEF(VT);
7380 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7381 DstVec = DAG.getBitcast(VT, Imm);
7383 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7384 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7385 DAG.getIntPtrConstant(0, dl));
7388 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7389 unsigned InsertIdx = NonConstIdx[i];
7390 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7391 Op.getOperand(InsertIdx),
7392 DAG.getIntPtrConstant(InsertIdx, dl));
7397 /// Return true if \p N implements a horizontal binop and return the
7398 /// operands for the horizontal binop into V0 and V1.
7400 /// This is a helper function of LowerToHorizontalOp().
7401 /// This function checks whether the input build_vector \p N implements a
7402 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7403 /// operation to match.
7404 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7405 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7406 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7409 /// This function only analyzes elements of \p N whose indices are
7410 /// in range [BaseIdx, LastIdx).
7411 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7413 unsigned BaseIdx, unsigned LastIdx,
7414 SDValue &V0, SDValue &V1) {
7415 EVT VT = N->getValueType(0);
7417 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7418 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7419 "Invalid Vector in input!");
7421 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7422 bool CanFold = true;
7423 unsigned ExpectedVExtractIdx = BaseIdx;
7424 unsigned NumElts = LastIdx - BaseIdx;
7425 V0 = DAG.getUNDEF(VT);
7426 V1 = DAG.getUNDEF(VT);
7428 // Check if N implements a horizontal binop.
7429 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7430 SDValue Op = N->getOperand(i + BaseIdx);
7433 if (Op->isUndef()) {
7434 // Update the expected vector extract index.
7435 if (i * 2 == NumElts)
7436 ExpectedVExtractIdx = BaseIdx;
7437 ExpectedVExtractIdx += 2;
7441 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7446 SDValue Op0 = Op.getOperand(0);
7447 SDValue Op1 = Op.getOperand(1);
7449 // Try to match the following pattern:
7450 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7451 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7452 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7453 Op0.getOperand(0) == Op1.getOperand(0) &&
7454 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7455 isa<ConstantSDNode>(Op1.getOperand(1)));
7459 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7460 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7462 if (i * 2 < NumElts) {
7464 V0 = Op0.getOperand(0);
7465 if (V0.getValueType() != VT)
7470 V1 = Op0.getOperand(0);
7471 if (V1.getValueType() != VT)
7474 if (i * 2 == NumElts)
7475 ExpectedVExtractIdx = BaseIdx;
7478 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7479 if (I0 == ExpectedVExtractIdx)
7480 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7481 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7482 // Try to match the following dag sequence:
7483 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7484 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7488 ExpectedVExtractIdx += 2;
7494 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7495 /// a concat_vector.
7497 /// This is a helper function of LowerToHorizontalOp().
7498 /// This function expects two 256-bit vectors called V0 and V1.
7499 /// At first, each vector is split into two separate 128-bit vectors.
7500 /// Then, the resulting 128-bit vectors are used to implement two
7501 /// horizontal binary operations.
7503 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7505 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
7506 /// the two new horizontal binops.
7507 /// When Mode is set, the first horizontal binop dag node would take as input
7508 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7509 /// horizontal binop dag node would take as input the lower 128-bit of V1
7510 /// and the upper 128-bit of V1.
7512 /// HADD V0_LO, V0_HI
7513 /// HADD V1_LO, V1_HI
7515 /// Otherwise, the first horizontal binop dag node takes as input the lower
7516 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7517 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7519 /// HADD V0_LO, V1_LO
7520 /// HADD V0_HI, V1_HI
7522 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7523 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7524 /// the upper 128-bits of the result.
7525 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7526 const SDLoc &DL, SelectionDAG &DAG,
7527 unsigned X86Opcode, bool Mode,
7528 bool isUndefLO, bool isUndefHI) {
7529 MVT VT = V0.getSimpleValueType();
7530 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7531 "Invalid nodes in input!");
7533 unsigned NumElts = VT.getVectorNumElements();
7534 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7535 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7536 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7537 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7538 MVT NewVT = V0_LO.getSimpleValueType();
7540 SDValue LO = DAG.getUNDEF(NewVT);
7541 SDValue HI = DAG.getUNDEF(NewVT);
7544 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7545 if (!isUndefLO && !V0->isUndef())
7546 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7547 if (!isUndefHI && !V1->isUndef())
7548 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7550 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7551 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7552 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7554 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7555 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7558 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7561 /// Returns true iff \p BV builds a vector with the result equivalent to
7562 /// the result of an ADDSUB/SUBADD operation.
7563 /// If true is returned, then the operands of the ADDSUB (Opnd0 +- Opnd1)
7564 /// or SUBADD (Opnd0 -+ Opnd1) operation are written to the parameters
7565 /// \p Opnd0 and \p Opnd1.
7566 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7567 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7568 SDValue &Opnd0, SDValue &Opnd1,
7569 unsigned &NumExtracts,
7572 MVT VT = BV->getSimpleValueType(0);
7573 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7576 unsigned NumElts = VT.getVectorNumElements();
7577 SDValue InVec0 = DAG.getUNDEF(VT);
7578 SDValue InVec1 = DAG.getUNDEF(VT);
7582 // Odd-numbered elements in the input build vector are obtained from
7583 // adding/subtracting two integer/float elements.
7584 // Even-numbered elements in the input build vector are obtained from
7585 // subtracting/adding two integer/float elements.
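// e.g. the v4f32 build_vector <a0-b0, a1+b1, a2-b2, a3+b3> matches ADDSUB,
// while <a0+b0, a1-b1, a2+b2, a3-b3> matches SUBADD.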
7586 unsigned Opc[2] {0, 0};
7587 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7588 SDValue Op = BV->getOperand(i);
7590 // Skip 'undef' values.
7591 unsigned Opcode = Op.getOpcode();
7592 if (Opcode == ISD::UNDEF)
7595 // Early exit if we found an unexpected opcode.
7596 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7599 SDValue Op0 = Op.getOperand(0);
7600 SDValue Op1 = Op.getOperand(1);
7602 // Try to match the following pattern:
7603 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7604 // Early exit if we cannot match that sequence.
7605 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7606 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7607 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7608 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7609 Op0.getOperand(1) != Op1.getOperand(1))
7612 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7616 // We found a valid add/sub node; make sure it's the same opcode as previous
7617 // elements for this parity.
7618 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7620 Opc[i % 2] = Opcode;
7622 // Update InVec0 and InVec1.
7623 if (InVec0.isUndef()) {
7624 InVec0 = Op0.getOperand(0);
7625 if (InVec0.getSimpleValueType() != VT)
7628 if (InVec1.isUndef()) {
7629 InVec1 = Op1.getOperand(0);
7630 if (InVec1.getSimpleValueType() != VT)
7634 // Make sure that the operands of each add/sub node always
7635 // come from the same pair of vectors.
7636 if (InVec0 != Op0.getOperand(0)) {
7637 if (Opcode == ISD::FSUB)
7640 // FADD is commutable. Try to commute the operands
7641 // and then test again.
7642 std::swap(Op0, Op1);
7643 if (InVec0 != Op0.getOperand(0))
7647 if (InVec1 != Op1.getOperand(0))
7650 // Increment the number of extractions done.
7654 // Ensure we have found an opcode for both parities and that they are
7655 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7656 // inputs are undef.
7657 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7658 InVec0.isUndef() || InVec1.isUndef())
7661 IsSubAdd = Opc[0] == ISD::FADD;
7668 /// Returns true if it is possible to fold MUL and an idiom that has already been
7669 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7670 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7671 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7673 /// Prior to calling this function it should be known that there is some
7674 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7675 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7676 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7677 /// of \p Opnd0 uses is expected to be equal to 2.
7678 /// For example, this function may be called for the following IR:
7679 /// %AB = fmul fast <2 x double> %A, %B
7680 /// %Sub = fsub fast <2 x double> %AB, %C
7681 /// %Add = fadd fast <2 x double> %AB, %C
7682 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7683 /// <2 x i32> <i32 0, i32 3>
7684 /// There is a def for %Addsub here, which potentially can be replaced by
7685 /// X86ISD::ADDSUB operation:
7686 /// %Addsub = X86ISD::ADDSUB %AB, %C
7687 /// and such ADDSUB can further be replaced with FMADDSUB:
7688 /// %Addsub = FMADDSUB %A, %B, %C.
7690 /// The main reason why this method is called before the replacement of the
7691 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7692 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
7694 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7696 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7697 unsigned ExpectedUses) {
7698 if (Opnd0.getOpcode() != ISD::FMUL ||
7699 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7702 // FIXME: These checks must match the similar ones in
7703 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7704 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7705 // or MUL + ADDSUB to FMADDSUB.
7706 const TargetOptions &Options = DAG.getTarget().Options;
7708 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7713 Opnd1 = Opnd0.getOperand(1);
7714 Opnd0 = Opnd0.getOperand(0);
7719 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
7720 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
7721 /// X86ISD::FMSUBADD node accordingly.
7722 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7723 const X86Subtarget &Subtarget,
7724 SelectionDAG &DAG) {
7725 SDValue Opnd0, Opnd1;
7726 unsigned NumExtracts;
7728 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7732 MVT VT = BV->getSimpleValueType(0);
7735 // Try to generate X86ISD::FMADDSUB node here.
7737 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7738 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7739 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7742 // We only support ADDSUB.
7746 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7747 // the ADDSUB idiom has been successfully recognized. There are no known
7748 // X86 targets with 512-bit ADDSUB instructions!
7749 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom recognition.
7751 if (VT.is512BitVector())
7754 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7757 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
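/// For example (illustrative): a v4f32 build_vector of
///   { a0+a1, a2+a3, b0+b1, b2+b3 }
/// where the a* elements come from vector A and the b* elements from vector B
/// can be lowered to X86ISD::FHADD(A, B), i.e. a single haddps on SSE3 targets.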
7758 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7759 const X86Subtarget &Subtarget,
7760 SelectionDAG &DAG) {
7761 MVT VT = BV->getSimpleValueType(0);
7762 unsigned NumElts = VT.getVectorNumElements();
7763 unsigned NumUndefsLO = 0;
7764 unsigned NumUndefsHI = 0;
7765 unsigned Half = NumElts/2;
7767 // Count the number of UNDEF operands in the input build_vector.
7768 for (unsigned i = 0, e = Half; i != e; ++i)
7769 if (BV->getOperand(i)->isUndef())
7772 for (unsigned i = Half, e = NumElts; i != e; ++i)
7773 if (BV->getOperand(i)->isUndef())
7776 // Early exit if this is either a build_vector of all UNDEFs or if all but
7777 // one of the operands are UNDEF.
7778 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7782 SDValue InVec0, InVec1;
7783 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7784 // Try to match an SSE3 float HADD/HSUB.
7785 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7786 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7788 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7789 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7790 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7791 // Try to match an SSSE3 integer HADD/HSUB.
7792 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7793 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7795 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7796 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7799 if (!Subtarget.hasAVX())
7802 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7803 // Try to match an AVX horizontal add/sub of packed single/double
7804 // precision floating point values from 256-bit vectors.
7805 SDValue InVec2, InVec3;
7806 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7807 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7808 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7809 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7810 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7812 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7813 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7814 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7815 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7816 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7817 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7818 // Try to match an AVX2 horizontal add/sub of signed integers.
7819 SDValue InVec2, InVec3;
7821 bool CanFold = true;
7823 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7824 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7825 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7826 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7827 X86Opcode = X86ISD::HADD;
7828 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7829 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7830 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7831 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7832 X86Opcode = X86ISD::HSUB;
7837 // Fold this build_vector into a single horizontal add/sub.
7838 // Do this only if the target has AVX2.
7839 if (Subtarget.hasAVX2())
7840 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7842 // Do not try to expand this build_vector into a pair of horizontal
7843 // add/sub if we can emit a pair of scalar add/sub.
7844 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7847 // Convert this build_vector into a pair of horizontal binops followed by a concat vector.
7849 bool isUndefLO = NumUndefsLO == Half;
7850 bool isUndefHI = NumUndefsHI == Half;
7851 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7852 isUndefLO, isUndefHI);
7856 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7857 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7859 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7860 X86Opcode = X86ISD::HADD;
7861 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7862 X86Opcode = X86ISD::HSUB;
7863 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7864 X86Opcode = X86ISD::FHADD;
7865 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7866 X86Opcode = X86ISD::FHSUB;
7870 // Don't try to expand this build_vector into a pair of horizontal add/sub
7871 // if we can simply emit a pair of scalar add/sub.
7872 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7875 // Convert this build_vector into two horizontal add/sub operations followed by a concat vector.
7877 bool isUndefLO = NumUndefsLO == Half;
7878 bool isUndefHI = NumUndefsHI == Half;
7879 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7880 isUndefLO, isUndefHI);
7886 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7887 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
7888 /// apply the bit operation to the two new vectors.
7889 /// NOTE: It's not in our interest to make a general-purpose vectorizer
7890 /// from this, but enough scalar bit operations are created by the later
7891 /// legalization + scalarization stages to need basic support.
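/// For example (illustrative):
///   (build_vector (and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
/// becomes
///   (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8))
/// so only the variable operands need to be assembled into a vector.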
7892 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7893 SelectionDAG &DAG) {
7895 MVT VT = Op->getSimpleValueType(0);
7896 unsigned NumElems = VT.getVectorNumElements();
7897 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7899 // Check that all elements have the same opcode.
7900 // TODO: Should we allow UNDEFS and if so how many?
7901 unsigned Opcode = Op->getOperand(0).getOpcode();
7902 for (unsigned i = 1; i < NumElems; ++i)
7903 if (Opcode != Op->getOperand(i).getOpcode())
7906 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7913 // Don't do this if the buildvector is a splat - we'd replace one
7914 // constant with an entire vector.
7915 if (Op->getSplatValue())
7917 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7922 SmallVector<SDValue, 4> LHSElts, RHSElts;
7923 for (SDValue Elt : Op->ops()) {
7924 SDValue LHS = Elt.getOperand(0);
7925 SDValue RHS = Elt.getOperand(1);
7927 // We expect the canonicalized RHS operand to be the constant.
7928 if (!isa<ConstantSDNode>(RHS))
7930 LHSElts.push_back(LHS);
7931 RHSElts.push_back(RHS);
7934 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7935 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7936 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7939 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7940 /// functionality to do this, so it's all zeros, all ones, or some derivation
7941 /// that is cheap to calculate.
7942 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7943 const X86Subtarget &Subtarget) {
7945 MVT VT = Op.getSimpleValueType();
7947 // Vectors containing all zeros can be matched by pxor and xorps.
7948 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7949 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7950 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7951 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7954 return getZeroVector(VT, Subtarget, DAG, DL);
7957 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7958 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7959 // vpcmpeqd on 256-bit vectors.
7960 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7961 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7962 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7965 return getOnesVector(VT, DAG, DL);
7971 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
7972 /// from a vector of source values and a vector of extraction indices.
7973 /// The vectors might be manipulated to match the type of the permute op.
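/// For example (illustrative): a v4i32/v4f32 result where element i is
/// SrcVec[IndicesVec[i]] can be emitted as a single VPERMILPS (X86ISD::VPERMILPV)
/// on AVX targets, or as a PSHUFB on SSSE3 after scaling the indices to bytes.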
7974 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
7975 SDLoc &DL, SelectionDAG &DAG,
7976 const X86Subtarget &Subtarget) {
7978 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
7979 unsigned NumElts = VT.getVectorNumElements();
7980 unsigned SizeInBits = VT.getSizeInBits();
7982 // Adjust IndicesVec to match VT size.
7983 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
7984 "Illegal variable permute mask size");
7985 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
7986 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
7987 NumElts * VT.getScalarSizeInBits());
7988 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7990 // Handle a SrcVec whose type doesn't match VT.
7991 if (SrcVec.getValueSizeInBits() != SizeInBits) {
7992 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
7993 // Handle larger SrcVec by treating it as a larger permute.
7994 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
7995 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
7996 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
7997 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
7998 Subtarget, DAG, SDLoc(IndicesVec));
7999 return extractSubVector(
8000 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
8001 DAG, DL, SizeInBits);
8002 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8003 // Widen smaller SrcVec to match VT.
8004 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8009 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8010 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8011 EVT SrcVT = Idx.getValueType();
8012 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8013 uint64_t IndexScale = 0;
8014 uint64_t IndexOffset = 0;
8016 // If we're scaling a smaller permute op, then we need to repeat the
8017 // indices, scaling and offsetting them as well.
8018 // e.g. v4i32 -> v16i8 (Scale = 4)
8019 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8020 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
8021 for (uint64_t i = 0; i != Scale; ++i) {
8022 IndexScale |= Scale << (i * NumDstBits);
8023 IndexOffset |= i << (i * NumDstBits);
8026 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8027 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8028 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8029 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8033 unsigned Opcode = 0;
8034 switch (VT.SimpleTy) {
8038 if (Subtarget.hasSSSE3())
8039 Opcode = X86ISD::PSHUFB;
8042 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8043 Opcode = X86ISD::VPERMV;
8044 else if (Subtarget.hasSSSE3()) {
8045 Opcode = X86ISD::PSHUFB;
8046 ShuffleVT = MVT::v16i8;
8051 if (Subtarget.hasAVX()) {
8052 Opcode = X86ISD::VPERMILPV;
8053 ShuffleVT = MVT::v4f32;
8054 } else if (Subtarget.hasSSSE3()) {
8055 Opcode = X86ISD::PSHUFB;
8056 ShuffleVT = MVT::v16i8;
8061 if (Subtarget.hasAVX()) {
8062 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8063 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8064 Opcode = X86ISD::VPERMILPV;
8065 ShuffleVT = MVT::v2f64;
8066 } else if (Subtarget.hasSSE41()) {
8067 // SSE41 can compare v2i64 - select between indices 0 and 1.
8068 return DAG.getSelectCC(
8070 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8071 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8072 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8073 ISD::CondCode::SETEQ);
8077 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8078 Opcode = X86ISD::VPERMV;
8079 else if (Subtarget.hasXOP()) {
8080 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8081 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8082 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8083 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8085 ISD::CONCAT_VECTORS, DL, VT,
8086 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8087 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8088 } else if (Subtarget.hasAVX()) {
8089 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8090 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8091 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8092 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8093 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8094 ArrayRef<SDValue> Ops) {
8095 // Permute Lo and Hi and then select based on index range.
8096 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8097 // care about bit[7] as it's just an index vector.
8098 SDValue Idx = Ops[2];
8099 EVT VT = Idx.getValueType();
8100 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8101 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8102 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8103 ISD::CondCode::SETGT);
8105 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8106 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8111 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8112 Opcode = X86ISD::VPERMV;
8113 else if (Subtarget.hasAVX()) {
8114 // Scale to v32i8 and perform as v32i8.
8115 IndicesVec = ScaleIndices(IndicesVec, 2);
8116 return DAG.getBitcast(
8117 VT, createVariablePermute(
8118 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8119 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8124 if (Subtarget.hasAVX2())
8125 Opcode = X86ISD::VPERMV;
8126 else if (Subtarget.hasAVX()) {
8127 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8128 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8129 {0, 1, 2, 3, 0, 1, 2, 3});
8130 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8131 {4, 5, 6, 7, 4, 5, 6, 7});
8132 if (Subtarget.hasXOP())
8133 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8134 LoLo, HiHi, IndicesVec,
8135 DAG.getConstant(0, DL, MVT::i8)));
8136 // Permute Lo and Hi and then select based on index range.
8137 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8138 SDValue Res = DAG.getSelectCC(
8139 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8140 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8141 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8142 ISD::CondCode::SETGT);
8143 return DAG.getBitcast(VT, Res);
8148 if (Subtarget.hasAVX512()) {
8149 if (!Subtarget.hasVLX()) {
8150 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8151 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8153 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8154 DAG, SDLoc(IndicesVec));
8155 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8157 return extract256BitVector(Res, 0, DAG, DL);
8159 Opcode = X86ISD::VPERMV;
8160 } else if (Subtarget.hasAVX()) {
8161 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8163 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8165 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8166 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8167 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8168 if (Subtarget.hasXOP())
8169 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8170 LoLo, HiHi, IndicesVec,
8171 DAG.getConstant(0, DL, MVT::i8)));
8172 // Permute Lo and Hi and then select based on index range.
8173 // This works as VPERMILPD only uses index bit[1] to permute elements.
8174 SDValue Res = DAG.getSelectCC(
8175 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8176 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8177 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8178 ISD::CondCode::SETGT);
8179 return DAG.getBitcast(VT, Res);
8183 if (Subtarget.hasVBMI())
8184 Opcode = X86ISD::VPERMV;
8187 if (Subtarget.hasBWI())
8188 Opcode = X86ISD::VPERMV;
8194 if (Subtarget.hasAVX512())
8195 Opcode = X86ISD::VPERMV;
8201 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8202 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8203 "Illegal variable permute shuffle type");
8205 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8207 IndicesVec = ScaleIndices(IndicesVec, Scale);
8209 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8210 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8212 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8213 SDValue Res = Opcode == X86ISD::VPERMV
8214 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8215 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8216 return DAG.getBitcast(VT, Res);
8219 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8220 // shown to be a permutation of a vector by indices in a non-constant vector.
8221 // (build_vector (extract_elt V, (extract_elt I, 0)),
8222 //               (extract_elt V, (extract_elt I, 1)), ...) -> a variable permute of V indexed by I.
8227 // TODO: Handle undefs
8228 // TODO: Utilize pshufb and zero mask blending to support more efficient
8229 // construction of vectors with constant-0 elements.
8231 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8232 const X86Subtarget &Subtarget) {
8233 SDValue SrcVec, IndicesVec;
8234 // Check for a match of the permute source vector and permute index elements.
8235 // This is done by checking that the i-th build_vector operand is of the form:
8236 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
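  // For example (illustrative IR; %V and %I are placeholder values):
  //   %i0 = extractelement <4 x i32> %I, i32 0
  //   %e0 = extractelement <4 x i32> %V, i32 %i0
  //   ... and likewise for elements 1..3 ...
  // A build_vector of %e0..%e3 matches with SrcVec = %V and IndicesVec = %I.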
8237 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8238 SDValue Op = V.getOperand(Idx);
8239 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8242 // If this is the first extract encountered in V, set the source vector,
8243 // otherwise verify the extract is from the previously defined source vector.
8246 SrcVec = Op.getOperand(0);
8247 else if (SrcVec != Op.getOperand(0))
8249 SDValue ExtractedIndex = Op->getOperand(1);
8250 // Peek through extends.
8251 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8252 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8253 ExtractedIndex = ExtractedIndex.getOperand(0);
8254 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8257 // If this is the first extract from the index vector candidate, set the
8258 // indices vector, otherwise verify the extract is from the previously
8259 // defined indices vector.
8261 IndicesVec = ExtractedIndex.getOperand(0);
8262 else if (IndicesVec != ExtractedIndex.getOperand(0))
8265 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8266 if (!PermIdx || PermIdx->getZExtValue() != Idx)
8271 MVT VT = V.getSimpleValueType();
8272 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8276 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8279 MVT VT = Op.getSimpleValueType();
8280 MVT EltVT = VT.getVectorElementType();
8281 unsigned NumElems = Op.getNumOperands();
8283 // Lower build_vectors of i1 predicate elements with the dedicated vXi1 path.
8284 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8285 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8287 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8288 return VectorConstant;
8290 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8291 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8293 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8294 return HorizontalOp;
8295 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8297 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8300 unsigned EVTBits = EltVT.getSizeInBits();
8302 unsigned NumZero = 0;
8303 unsigned NumNonZero = 0;
8304 uint64_t NonZeros = 0;
8305 bool IsAllConstants = true;
8306 SmallSet<SDValue, 8> Values;
8307 unsigned NumConstants = NumElems;
8308 for (unsigned i = 0; i < NumElems; ++i) {
8309 SDValue Elt = Op.getOperand(i);
8313 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8314 IsAllConstants = false;
8317 if (X86::isZeroNode(Elt))
8320 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8321 NonZeros |= ((uint64_t)1 << i);
8326 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8327 if (NumNonZero == 0)
8328 return DAG.getUNDEF(VT);
8330 // If we are inserting one variable into a vector of non-zero constants, try
8331 // to avoid loading each constant element as a scalar. Load the constants as a
8332 // vector and then insert the variable scalar element. If insertion is not
8333 // supported, we assume that we will fall back to a shuffle to get the scalar
8334 // blended with the constants. Insertion into a zero vector is handled as a
8335 // special-case somewhere below here.
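  // For example (illustrative): <4 x i32> { 1, 2, %x, 4 } is lowered to a
  // constant-pool load of <1, 2, undef, 4> followed by an insertelement of %x
  // at index 2, instead of scalar-loading every constant element.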
8336 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8337 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8338 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8339 // Create an all-constant vector. The variable element in the old
8340 // build vector is replaced by undef in the constant vector. Save the
8341 // variable scalar element and its index for use in the insertelement.
8342 LLVMContext &Context = *DAG.getContext();
8343 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8344 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8347 for (unsigned i = 0; i != NumElems; ++i) {
8348 SDValue Elt = Op.getOperand(i);
8349 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8350 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8351 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8352 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8353 else if (!Elt.isUndef()) {
8354 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8355 "Expected one variable element in this vector");
8357 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8360 Constant *CV = ConstantVector::get(ConstVecOps);
8361 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8363 // The constants we just created may not be legal (e.g., floating point). We
8364 // must lower the vector right here because we cannot guarantee that we'll
8365 // legalize it before loading it. This is also why we could not just create
8366 // a new build vector here. If the build vector contains illegal constants,
8367 // it could get split back up into a series of insert elements.
8368 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8369 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8370 MachineFunction &MF = DAG.getMachineFunction();
8371 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8372 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8373 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8376 // Special case for single non-zero, non-undef, element.
8377 if (NumNonZero == 1) {
8378 unsigned Idx = countTrailingZeros(NonZeros);
8379 SDValue Item = Op.getOperand(Idx);
8381 // If we have a constant or non-constant insertion into the low element of
8382 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8383 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8384 // depending on what the source datatype is.
8387 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8389 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8390 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8391 assert((VT.is128BitVector() || VT.is256BitVector() ||
8392 VT.is512BitVector()) &&
8393 "Expected an SSE value type!");
8394 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8395 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8396 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8399 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
8401 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8402 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8403 if (VT.getSizeInBits() >= 256) {
8404 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8405 if (Subtarget.hasAVX()) {
8406 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8407 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8409 // Without AVX, we need to extend to a 128-bit vector and then
8410 // insert into the 256-bit vector.
8411 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8412 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8413 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8416 assert(VT.is128BitVector() && "Expected an SSE value type!");
8417 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8418 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8420 return DAG.getBitcast(VT, Item);
8424 // Is it a vector logical left shift?
8425 if (NumElems == 2 && Idx == 1 &&
8426 X86::isZeroNode(Op.getOperand(0)) &&
8427 !X86::isZeroNode(Op.getOperand(1))) {
8428 unsigned NumBits = VT.getSizeInBits();
8429 return getVShift(true, VT,
8430 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8431 VT, Op.getOperand(1)),
8432 NumBits/2, DAG, *this, dl);
8435 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8438 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8439 // is a non-constant being inserted into an element other than the low one,
8440 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8441 // movd/movss) to move this into the low element, then shuffle it into place.
8443 if (EVTBits == 32) {
8444 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8445 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8449 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8450 if (Values.size() == 1) {
8451 if (EVTBits == 32) {
8452 // Instead of a shuffle like this:
8453 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8454 // Check if it's possible to issue this instead.
8455 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8456 unsigned Idx = countTrailingZeros(NonZeros);
8457 SDValue Item = Op.getOperand(Idx);
8458 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8459 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8464 // A vector full of immediates; various special cases are already
8465 // handled, so this is best done with a single constant-pool load.
8469 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8472 // See if we can use a vector load to get all of the elements.
8474 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8476 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8480 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8481 // build_vector and broadcast it.
8482 // TODO: We could probably generalize this more.
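  // For example (illustrative): a v8f32 build_vector of { a, b, a, b, a, b, a, b }
  // is rebuilt as the narrower v4f32 { a, b, undef, undef }, bitcast to v2f64,
  // broadcast with X86ISD::VBROADCAST as f64 elements, and bitcast back to v8f32.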
8483 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8484 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8485 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8486 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8487 // Make sure all the even/odd operands match.
8488 for (unsigned i = 2; i != NumElems; ++i)
8489 if (Ops[i % 2] != Op.getOperand(i))
8493 if (CanSplat(Op, NumElems, Ops)) {
8494 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8495 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8496 // Create a new build vector and cast to v2i64/v2f64.
8497 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8498 DAG.getBuildVector(NarrowVT, dl, Ops));
8499 // Broadcast from v2i64/v2f64 and cast to final VT.
8500 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
8501 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8506 // For AVX-length vectors, build the individual 128-bit pieces and use
8507 // shuffles to put them in place.
8508 if (VT.getSizeInBits() > 128) {
8509 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
8511 // Build both the lower and upper subvector.
8513 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8514 SDValue Upper = DAG.getBuildVector(
8515 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8517 // Recreate the wider vector with the lower and upper part.
8518 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
8519 VT.getSizeInBits() / 2);
8522 // Let legalizer expand 2-wide build_vectors.
8523 if (EVTBits == 64) {
8524 if (NumNonZero == 1) {
8525 // One half is zero or undef.
8526 unsigned Idx = countTrailingZeros(NonZeros);
8527 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8528 Op.getOperand(Idx));
8529 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8534 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8535 if (EVTBits == 8 && NumElems == 16)
8536 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8540 if (EVTBits == 16 && NumElems == 8)
8541 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8545 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8546 if (EVTBits == 32 && NumElems == 4)
8547 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8550 // If element VT is == 32 bits, turn it into a number of shuffles.
8551 if (NumElems == 4 && NumZero > 0) {
8552 SmallVector<SDValue, 8> Ops(NumElems);
8553 for (unsigned i = 0; i < 4; ++i) {
8554 bool isZero = !(NonZeros & (1ULL << i));
8556 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8558 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8561 for (unsigned i = 0; i < 2; ++i) {
8562 switch ((NonZeros >> (i*2)) & 0x3) {
8563 default: llvm_unreachable("Unexpected NonZero count");
8565 Ops[i] = Ops[i*2]; // Must be a zero vector.
8568 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8571 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8574 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8579 bool Reverse1 = (NonZeros & 0x3) == 2;
8580 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8584 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8585 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8587 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8590 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8592 // Check for a build vector from mostly shuffle plus few inserting.
8593 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8596 // For SSE 4.1, use insertps to put the high elements into the low element.
8597 if (Subtarget.hasSSE41()) {
8599 if (!Op.getOperand(0).isUndef())
8600 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8602 Result = DAG.getUNDEF(VT);
8604 for (unsigned i = 1; i < NumElems; ++i) {
8605 if (Op.getOperand(i).isUndef()) continue;
8606 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8607 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8612 // Otherwise, expand into a number of unpckl*, start by extending each of
8613 // our (non-undef) elements to the full vector width with the element in the
8614 // bottom slot of the vector (which generates no code for SSE).
8615 SmallVector<SDValue, 8> Ops(NumElems);
8616 for (unsigned i = 0; i < NumElems; ++i) {
8617 if (!Op.getOperand(i).isUndef())
8618 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8620 Ops[i] = DAG.getUNDEF(VT);
8623 // Next, we iteratively mix elements, e.g. for v4f32:
8624 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8625 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8626 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8627 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8628 // Generate scaled UNPCKL shuffle mask.
8629 SmallVector<int, 16> Mask;
8630 for(unsigned i = 0; i != Scale; ++i)
8632 for (unsigned i = 0; i != Scale; ++i)
8633 Mask.push_back(NumElems+i);
8634 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8636 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8637 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8642 // 256-bit AVX can use the vinsertf128 instruction
8643 // to create 256-bit vectors from two other 128-bit ones.
8644 // TODO: Detect subvector broadcast here instead of DAG combine?
8645 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8646 const X86Subtarget &Subtarget) {
8648 MVT ResVT = Op.getSimpleValueType();
8650 assert((ResVT.is256BitVector() ||
8651 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8653 unsigned NumOperands = Op.getNumOperands();
8654 unsigned NumZero = 0;
8655 unsigned NumNonZero = 0;
8656 unsigned NonZeros = 0;
8657 for (unsigned i = 0; i != NumOperands; ++i) {
8658 SDValue SubVec = Op.getOperand(i);
8659 if (SubVec.isUndef())
8661 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8664 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8670 // If we have more than 2 non-zeros, build each half separately.
8671 if (NumNonZero > 2) {
8672 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8673 ResVT.getVectorNumElements()/2);
8674 ArrayRef<SDUse> Ops = Op->ops();
8675 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8676 Ops.slice(0, NumOperands/2));
8677 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8678 Ops.slice(NumOperands/2));
8679 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8682 // Otherwise, build it up through insert_subvectors.
8683 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8684 : DAG.getUNDEF(ResVT);
8686 MVT SubVT = Op.getOperand(0).getSimpleValueType();
8687 unsigned NumSubElems = SubVT.getVectorNumElements();
8688 for (unsigned i = 0; i != NumOperands; ++i) {
8689 if ((NonZeros & (1 << i)) == 0)
8692 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
8694 DAG.getIntPtrConstant(i * NumSubElems, dl));
8700 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8701 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8702 static bool isExpandWithZeros(const SDValue &Op) {
8703 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8704 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8706 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8707 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8713 // If the given node is a type promotion (by concatenating i1 zeros) of the result
8714 // of a node that already zeros all upper bits of a k-register, return that node.
8716 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8717 unsigned Opc = Op.getOpcode();
8719 assert(Opc == ISD::CONCAT_VECTORS &&
8720 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8721 "Unexpected node to check for type promotion!");
8723 // As long as we are concatenating zeros to the upper part of a previous node
8724 // result, climb up the tree until a node with a different opcode is encountered.
8726 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8727 if (Opc == ISD::INSERT_SUBVECTOR) {
8728 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8729 Op.getConstantOperandVal(2) == 0)
8730 Op = Op.getOperand(1);
8733 } else { // Opc == ISD::CONCAT_VECTORS
8734 if (isExpandWithZeros(Op))
8735 Op = Op.getOperand(0);
8739 Opc = Op.getOpcode();
8742 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8743 // of a node that zeros the upper bits (its masked version).
8744 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8745 (Op.getOpcode() == ISD::AND &&
8746 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8747 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8754 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
8755 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8756 const X86Subtarget &Subtarget,
8757 SelectionDAG & DAG) {
8759 MVT ResVT = Op.getSimpleValueType();
8760 unsigned NumOperands = Op.getNumOperands();
8762 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8763 "Unexpected number of operands in CONCAT_VECTORS");
8765 // If this node promotes - by concatenating zeroes - the type of the result
8766 // of a node whose instruction already zeroes all upper (irrelevant) bits of the
8767 // output register, mark it as legal and catch the pattern in instruction
8768 // selection to avoid emitting extra instructions (for zeroing upper bits).
8769 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
8770 return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
8772 unsigned NumZero = 0;
8773 unsigned NumNonZero = 0;
8774 uint64_t NonZeros = 0;
8775 for (unsigned i = 0; i != NumOperands; ++i) {
8776 SDValue SubVec = Op.getOperand(i);
8777 if (SubVec.isUndef())
8779 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8782 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8783 NonZeros |= (uint64_t)1 << i;
8789 // If there are zero or one non-zeros we can handle this very simply.
8790 if (NumNonZero <= 1) {
8791 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8792 : DAG.getUNDEF(ResVT);
8795 unsigned Idx = countTrailingZeros(NonZeros);
8796 SDValue SubVec = Op.getOperand(Idx);
8797 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8798 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8799 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8802 if (NumOperands > 2) {
8803 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8804 ResVT.getVectorNumElements()/2);
8805 ArrayRef<SDUse> Ops = Op->ops();
8806 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8807 Ops.slice(0, NumOperands/2));
8808 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8809 Ops.slice(NumOperands/2));
8810 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8813 assert(NumNonZero == 2 && "Simple cases not handled?");
8815 if (ResVT.getVectorNumElements() >= 16)
8816 return Op; // The operation is legal with KUNPCK
8818 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8819 DAG.getUNDEF(ResVT), Op.getOperand(0),
8820 DAG.getIntPtrConstant(0, dl));
8821 unsigned NumElems = ResVT.getVectorNumElements();
8822 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8823 DAG.getIntPtrConstant(NumElems/2, dl));
8826 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8827 const X86Subtarget &Subtarget,
8828 SelectionDAG &DAG) {
8829 MVT VT = Op.getSimpleValueType();
8830 if (VT.getVectorElementType() == MVT::i1)
8831 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8833 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8834 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8835 Op.getNumOperands() == 4)));
8837 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8838 // from two other 128-bit ones.
8840 // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8841 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
8844 //===----------------------------------------------------------------------===//
8845 // Vector shuffle lowering
8847 // This is an experimental code path for lowering vector shuffles on x86. It is
8848 // designed to handle arbitrary vector shuffles and blends, gracefully
8849 // degrading performance as necessary. It works hard to recognize idiomatic
8850 // shuffles and lower them to optimal instruction patterns without leaving
8851 // a framework that allows reasonably efficient handling of all vector shuffle patterns.
8853 //===----------------------------------------------------------------------===//
8855 /// Tiny helper function to identify a no-op mask.
8857 /// This is a somewhat boring predicate function. It checks whether the mask
8858 /// array input, which is assumed to be a single-input shuffle mask of the kind
8859 /// used by the X86 shuffle instructions (not a fully general
8860 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8861 /// in-place shuffle are 'no-op's.
8862 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8863 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8864 assert(Mask[i] >= -1 && "Out of bound mask element!");
8865 if (Mask[i] >= 0 && Mask[i] != i)
8871 /// Test whether there are elements crossing 128-bit lanes in this shuffle mask.
8874 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8875 /// and we routinely test for these.
8876 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8877 int LaneSize = 128 / VT.getScalarSizeInBits();
8878 int Size = Mask.size();
8879 for (int i = 0; i < Size; ++i)
8880 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8885 /// Test whether a shuffle mask is equivalent within each sub-lane.
8887 /// This checks a shuffle mask to see if it is performing the same
8888 /// lane-relative shuffle in each sub-lane. This trivially implies
8889 /// that it is also not lane-crossing. It may however involve a blend from the
8890 /// same lane of a second vector.
8892 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8893 /// non-trivial to compute in the face of undef lanes. The representation is
8894 /// suitable for use with existing 128-bit shuffles as entries from the second
8895 /// vector have been remapped to [LaneSize, 2*LaneSize).
8896 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8898 SmallVectorImpl<int> &RepeatedMask) {
8899 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8900 RepeatedMask.assign(LaneSize, -1);
8901 int Size = Mask.size();
8902 for (int i = 0; i < Size; ++i) {
8903 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8906 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8907 // This entry crosses lanes, so there is no way to model this shuffle.
8910 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8911 // Adjust second vector indices to start at LaneSize instead of Size.
8912 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8913 : Mask[i] % LaneSize + LaneSize;
8914 if (RepeatedMask[i % LaneSize] < 0)
8915 // This is the first non-undef entry in this slot of a 128-bit lane.
8916 RepeatedMask[i % LaneSize] = LocalM;
8917 else if (RepeatedMask[i % LaneSize] != LocalM)
8918 // Found a mismatch with the repeated mask.
8924 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8926 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8927 SmallVectorImpl<int> &RepeatedMask) {
8928 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8931 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8933 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8934 SmallVectorImpl<int> &RepeatedMask) {
8935 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8938 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8939 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8940 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8942 SmallVectorImpl<int> &RepeatedMask) {
8943 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8944 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8945 int Size = Mask.size();
8946 for (int i = 0; i < Size; ++i) {
8947 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8948 if (Mask[i] == SM_SentinelUndef)
8950 if (Mask[i] == SM_SentinelZero) {
8951 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8953 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8956 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8957 // This entry crosses lanes, so there is no way to model this shuffle.
8960 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8961 // Adjust second vector indices to start at LaneSize instead of Size.
8963 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8964 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8965 // This is the first non-undef entry in this slot of a 128-bit lane.
8966 RepeatedMask[i % LaneSize] = LocalM;
8967 else if (RepeatedMask[i % LaneSize] != LocalM)
8968 // Found a mismatch with the repeated mask.
8974 /// Checks whether a shuffle mask is equivalent to an explicit list of mask elements.
8977 /// This is a fast way to test a shuffle mask against a fixed pattern:
8979 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8981 /// It returns true if the mask is exactly as wide as ExpectedMask, and each
8982 /// element of the mask is either -1 (signifying undef) or the value given
8983 /// in ExpectedMask.
8984 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8985 ArrayRef<int> ExpectedMask) {
8986 if (Mask.size() != ExpectedMask.size())
8989 int Size = Mask.size();
8991 // If the values are build vectors, we can look through them to find
8992 // equivalent inputs that make the shuffles equivalent.
8993 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8994 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8996 for (int i = 0; i < Size; ++i) {
8997 assert(Mask[i] >= -1 && "Out of bound mask element!");
8998 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8999 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
9000 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
9001 if (!MaskBV || !ExpectedBV ||
9002 MaskBV->getOperand(Mask[i] % Size) !=
9003 ExpectedBV->getOperand(ExpectedMask[i] % Size))
9011 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9013 /// The masks must be exactly the same width.
9015 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9016 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9018 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
9019 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
9020 ArrayRef<int> ExpectedMask) {
9021 int Size = Mask.size();
9022 if (Size != (int)ExpectedMask.size())
9025 for (int i = 0; i < Size; ++i)
9026 if (Mask[i] == SM_SentinelUndef)
9028 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
9030 else if (Mask[i] != ExpectedMask[i])
9036 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle mask.
9038 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
9039 const APInt &Zeroable) {
9040 int NumElts = Mask.size();
9041 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
9043 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
9044 for (int i = 0; i != NumElts; ++i) {
9046 if (M == SM_SentinelUndef)
9048 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
9049 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
9054 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd instructions.
9056 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
9057 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9060 SmallVector<int, 8> Unpcklwd;
9061 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9062 /* Unary = */ false);
9063 SmallVector<int, 8> Unpckhwd;
9064 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9065 /* Unary = */ false);
9066 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
9067 isTargetShuffleEquivalent(Mask, Unpckhwd));
9068 return IsUnpackwdMask;
9071 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9073 /// This helper function produces an 8-bit shuffle immediate corresponding to
9074 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9075 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for example.
9078 /// NB: We rely heavily on "undef" masks preserving the input lane.
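/// For example (illustrative): the reversing mask <3, 2, 1, 0> encodes as
/// (3 << 0) | (2 << 2) | (1 << 4) | (0 << 6) = 0x1B.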
9079 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9080 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9081 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9082 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9083 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9084 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9087 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9088 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9089 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9090 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9094 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9095 SelectionDAG &DAG) {
9096 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9099 /// Compute whether each element of a shuffle is zeroable.
9101 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
9102 /// Either it is an undef element in the shuffle mask, the element of the input
9103 /// referenced is undef, or the element of the input referenced is known to be
9104 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
9105 /// as many lanes with this technique as possible to simplify the remaining shuffle.
9107 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
9108 SDValue V1, SDValue V2) {
9109 APInt Zeroable(Mask.size(), 0);
9110 V1 = peekThroughBitcasts(V1);
9111 V2 = peekThroughBitcasts(V2);
9113 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
9114 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
9116 int VectorSizeInBits = V1.getValueSizeInBits();
9117 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
9118 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
9120 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9122 // Handle the easy cases.
9123 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
9128 // Determine shuffle input and normalize the mask.
9129 SDValue V = M < Size ? V1 : V2;
9132 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
9133 if (V.getOpcode() != ISD::BUILD_VECTOR)
9136 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
9137 // the (larger) source element must be UNDEF/ZERO.
9138 if ((Size % V.getNumOperands()) == 0) {
9139 int Scale = Size / V->getNumOperands();
9140 SDValue Op = V.getOperand(M / Scale);
9141 if (Op.isUndef() || X86::isZeroNode(Op))
9143 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
9144 APInt Val = Cst->getAPIntValue();
9145 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9146 Val = Val.getLoBits(ScalarSizeInBits);
9149 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
9150 APInt Val = Cst->getValueAPF().bitcastToAPInt();
9151 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9152 Val = Val.getLoBits(ScalarSizeInBits);
9159 // If the BUILD_VECTOR has more elements, then all the (smaller) source
9160 // elements must be UNDEF or ZERO.
9161 if ((V.getNumOperands() % Size) == 0) {
9162 int Scale = V->getNumOperands() / Size;
9163 bool AllZeroable = true;
9164 for (int j = 0; j < Scale; ++j) {
9165 SDValue Op = V.getOperand((M * Scale) + j);
9166 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
9177 // The shuffle result is as follows:
9178 // 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
9179 // Each element of Zeroable corresponds to a particular Mask element, as
9180 // described in computeZeroableShuffleElements.
9182 // The function looks for a sub-mask whose nonzero elements are in
9183 // increasing order; if such a sub-mask exists, the function returns true.
9184 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9185 ArrayRef<int> Mask, const EVT &VectorType,
9186 bool &IsZeroSideLeft) {
9187 int NextElement = -1;
9188 // Check if the Mask's nonzero elements are in increasing order.
9189 for (int i = 0, e = Mask.size(); i < e; i++) {
9190 // Check that the mask's zero elements are built only from zeros.
9191 assert(Mask[i] >= -1 && "Out of bound mask element!");
9196 // Find the lowest nonzero element.
9197 if (NextElement < 0) {
9198 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9199 IsZeroSideLeft = NextElement != 0;
9201 // Exit if the mask's nonzero elements are not in increasing order.
9202 if (NextElement != Mask[i])
9209 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
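/// For example (illustrative): a v8i16 single-input shuffle with mask
/// <0, 0, 1, 1, 2, 2, 3, 3> becomes one PSHUFB whose byte mask is
/// <0,1, 0,1, 2,3, 2,3, 4,5, 4,5, 6,7, 6,7>.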
9210 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9211 ArrayRef<int> Mask, SDValue V1,
9213 const APInt &Zeroable,
9214 const X86Subtarget &Subtarget,
9215 SelectionDAG &DAG) {
9216 int Size = Mask.size();
9217 int LaneSize = 128 / VT.getScalarSizeInBits();
9218 const int NumBytes = VT.getSizeInBits() / 8;
9219 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9221 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9222 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9223 (Subtarget.hasBWI() && VT.is512BitVector()));
9225 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9226 // Sign bit set in i8 mask means zero element.
9227 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9230 for (int i = 0; i < NumBytes; ++i) {
9231 int M = Mask[i / NumEltBytes];
9233 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9236 if (Zeroable[i / NumEltBytes]) {
9237 PSHUFBMask[i] = ZeroMask;
9241 // We can only use a single input of V1 or V2.
9242 SDValue SrcV = (M >= Size ? V2 : V1);
9248 // PSHUFB can't cross lanes, ensure this doesn't happen.
9249 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9253 M = M * NumEltBytes + (i % NumEltBytes);
9254 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9256 assert(V && "Failed to find a source input");
9258 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9259 return DAG.getBitcast(
9260 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9261 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9264 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9265 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9268 // Certain shuffles can be lowered to X86's dedicated VEXPAND instruction.
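// For example (illustrative): a v8f32 shuffle with mask <0, z, 1, z, 2, z, 3, z>,
// where 'z' marks zeroable elements, can be emitted as a zero-masked VEXPANDPS of
// V1 with the k-mask 0b01010101.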
9269 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
9270 const APInt &Zeroable,
9271 ArrayRef<int> Mask, SDValue &V1,
9272 SDValue &V2, SelectionDAG &DAG,
9273 const X86Subtarget &Subtarget) {
9274 bool IsLeftZeroSide = true;
9275 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9278 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9280 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9281 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9282 unsigned NumElts = VT.getVectorNumElements();
9283 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9284 "Unexpected number of vector elements");
9285 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9286 Subtarget, DAG, DL);
9287 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9288 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9289 return DAG.getSelect(DL, VT, VMask,
9290 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
9294 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9295 unsigned &UnpackOpcode, bool IsUnary,
9296 ArrayRef<int> TargetMask,
9297 const SDLoc &DL, SelectionDAG &DAG,
9298 const X86Subtarget &Subtarget) {
9299 int NumElts = VT.getVectorNumElements();
9301 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9302 for (int i = 0; i != NumElts; i += 2) {
9303 int M1 = TargetMask[i + 0];
9304 int M2 = TargetMask[i + 1];
9305 Undef1 &= (SM_SentinelUndef == M1);
9306 Undef2 &= (SM_SentinelUndef == M2);
9307 Zero1 &= isUndefOrZero(M1);
9308 Zero2 &= isUndefOrZero(M2);
9310 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9311 "Zeroable shuffle detected");
9313 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9314 SmallVector<int, 64> Unpckl, Unpckh;
9315 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9316 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9317 UnpackOpcode = X86ISD::UNPCKL;
9318 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9319 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9323 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9324 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9325 UnpackOpcode = X86ISD::UNPCKH;
9326 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9327 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9331 // If a unary shuffle, attempt to match it as an unpack lo/hi with zero.
9332 if (IsUnary && (Zero1 || Zero2)) {
9333 // Don't bother if we can blend instead.
9334 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9335 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9338 bool MatchLo = true, MatchHi = true;
9339 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9340 int M = TargetMask[i];
9342 // Ignore if the input is known to be zero or the index is undef.
9343 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9344 (M == SM_SentinelUndef))
9347 MatchLo &= (M == Unpckl[i]);
9348 MatchHi &= (M == Unpckh[i]);
9351 if (MatchLo || MatchHi) {
9352 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9353 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9354 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9359 // If a binary shuffle, commute and try again.
9361 ShuffleVectorSDNode::commuteMask(Unpckl);
9362 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9363 UnpackOpcode = X86ISD::UNPCKL;
9368 ShuffleVectorSDNode::commuteMask(Unpckh);
9369 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9370 UnpackOpcode = X86ISD::UNPCKH;
9379 // X86 has dedicated unpack instructions that can handle specific blend
9380 // operations: UNPCKH and UNPCKL.
9381 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9382 ArrayRef<int> Mask, SDValue V1,
9383 SDValue V2, SelectionDAG &DAG) {
9384 SmallVector<int, 8> Unpckl;
9385 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9386 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9387 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9389 SmallVector<int, 8> Unpckh;
9390 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9391 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9392 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9394 // Commute and try again.
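// Unpack operands are not interchangeable, so also try the mask with the inputs swapped.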
9395 ShuffleVectorSDNode::commuteMask(Unpckl);
9396 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9397 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9399 ShuffleVectorSDNode::commuteMask(Unpckh);
9400 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9401 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
9406 static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
9408 int Size = (int)Mask.size();
9409 int Split = Size / Delta;
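// Delta is the stride between the kept elements: the matched mask keeps every Delta-th element from the start of the truncated (and bitcast) vector.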
9410 int TruncatedVectorStart = SwappedOps ? Size : 0;
9412 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
9413 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
9416 // The rest of the mask should not refer to the truncated vector's elements.
9417 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
9418 TruncatedVectorStart + Size))
9424 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9426 // An example is the following:
9428 // t0: ch = EntryToken
9429 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9430 // t25: v4i32 = truncate t2
9431 // t41: v8i16 = bitcast t25
9432 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9433 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9434 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9435 // t18: v2i64 = bitcast t51
9437 // Without avx512vl, this is lowered to:
9439 // vpmovqd %zmm0, %ymm0
9440 // vpshufb {{.*#+}} xmm0 =
9441 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
9443 // But when avx512vl is available, one can just use a single vpmovdw instruction.
9445 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
9446 MVT VT, SDValue V1, SDValue V2,
9448 const X86Subtarget &Subtarget) {
9449 if (VT != MVT::v16i8 && VT != MVT::v8i16)
9452 if (Mask.size() != VT.getVectorNumElements())
9455 bool SwappedOps = false;
9457 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
9458 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
9467 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
9468 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
9470 // and similar ones.
9471 if (V1.getOpcode() != ISD::BITCAST)
9473 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
9476 SDValue Src = V1.getOperand(0).getOperand(0);
9477 MVT SrcVT = Src.getSimpleValueType();
9479 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
9480 // are only available with avx512vl.
9481 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
9484 // Down Convert Word to Byte is only available with avx512bw. The case with
9485 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
9486 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
9487 !Subtarget.hasBWI())
9490 // The first half/quarter of the mask should refer to every second/fourth
9491 // element of the truncated and bitcast vector.
9492 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
9493 !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
9496 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
9499 // X86 has dedicated pack instructions that can handle specific truncation
9500 // operations: PACKSS and PACKUS.
9501 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9502 SDValue &V2, unsigned &PackOpcode,
9503 ArrayRef<int> TargetMask,
9505 const X86Subtarget &Subtarget) {
9506 unsigned NumElts = VT.getVectorNumElements();
9507 unsigned BitSize = VT.getScalarSizeInBits();
9508 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9509 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9511 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9512 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9513 SDValue VV2 = DAG.getBitcast(PackVT, N2);
9514 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9515 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
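// PACKUS saturates unsigned values, so it is lossless only when the upper half of every source element is already zero; PACKSS (checked below via ComputeNumSignBits) instead needs each element to be a sign-extension of its low half.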
9516 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9517 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
9521 PackOpcode = X86ISD::PACKUS;
9525 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9526 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
9530 PackOpcode = X86ISD::PACKSS;
9536 // Try binary shuffle.
9537 SmallVector<int, 32> BinaryMask;
9538 createPackShuffleMask(VT, BinaryMask, false);
9539 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9540 if (MatchPACK(V1, V2))
9543 // Try unary shuffle.
9544 SmallVector<int, 32> UnaryMask;
9545 createPackShuffleMask(VT, UnaryMask, true);
9546 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
9547 if (MatchPACK(V1, V1))
9553 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9554 ArrayRef<int> Mask, SDValue V1,
9555 SDValue V2, SelectionDAG &DAG,
9556 const X86Subtarget &Subtarget) {
9558 unsigned PackOpcode;
9559 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9561 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9562 DAG.getBitcast(PackVT, V2));
9567 /// Try to emit a bitmask instruction for a shuffle.
9569 /// This handles cases where we can model a blend exactly as a bitmask due to
9570 /// one of the inputs being zeroable.
9571 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9572 SDValue V2, ArrayRef<int> Mask,
9573 const APInt &Zeroable,
9574 SelectionDAG &DAG) {
9575 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9576 MVT EltVT = VT.getVectorElementType();
9577 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9578 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9579 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9581 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9584 if (Mask[i] % Size != i)
9585 return SDValue(); // Not a blend.
9587 V = Mask[i] < Size ? V1 : V2;
9588 else if (V != (Mask[i] < Size ? V1 : V2))
9589 return SDValue(); // Can only let one input through the mask.
9591 VMaskOps[i] = AllOnes;
9594 return SDValue(); // No non-zeroable elements!
9596 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9597 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9600 /// Try to emit a blend instruction for a shuffle using bit math.
9602 /// This is used as a fallback approach when first class blend instructions are
9603 /// unavailable. Currently it is only suitable for integer vectors, but could
9604 /// be generalized for floating point vectors if desirable.
9605 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9606 SDValue V2, ArrayRef<int> Mask,
9607 SelectionDAG &DAG) {
9608 assert(VT.isInteger() && "Only supports integer vector types!");
9609 MVT EltVT = VT.getVectorElementType();
9610 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9611 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9612 SmallVector<SDValue, 16> MaskOps;
9613 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9614 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9615 return SDValue(); // Shuffled input!
9616 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9619 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9620 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9621 // We have to cast V2 around.
9622 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9623 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9624 DAG.getBitcast(MaskVT, V1Mask),
9625 DAG.getBitcast(MaskVT, V2)));
9626 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9629 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9630 SDValue PreservedSrc,
9631 const X86Subtarget &Subtarget,
9634 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9635 MutableArrayRef<int> TargetMask,
9636 bool &ForceV1Zero, bool &ForceV2Zero,
9637 uint64_t &BlendMask) {
9638 bool V1IsZeroOrUndef =
9639 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9640 bool V2IsZeroOrUndef =
9641 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9644 ForceV1Zero = false, ForceV2Zero = false;
9645 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9647 // Attempt to generate the binary blend mask. If an input is zero then
9648 // we can use any lane.
9649 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9650 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9651 int M = TargetMask[i];
9652 if (M == SM_SentinelUndef)
9656 if (M == i + Size) {
9657 BlendMask |= 1ull << i;
9660 if (M == SM_SentinelZero) {
9661 if (V1IsZeroOrUndef) {
9666 if (V2IsZeroOrUndef) {
9668 BlendMask |= 1ull << i;
9669 TargetMask[i] = i + Size;
9678 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9680 uint64_t ScaledMask = 0;
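// e.g. Size == 4, Scale == 2: BlendMask 0b0101 becomes 0b00110011.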
9681 for (int i = 0; i != Size; ++i)
9682 if (BlendMask & (1ull << i))
9683 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9687 /// Try to emit a blend instruction for a shuffle.
9689 /// This doesn't do any checks for the availability of instructions for blending
9690 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9691 /// be matched in the backend with the type given. What it does check for is
9692 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9693 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9694 SDValue V2, ArrayRef<int> Original,
9695 const APInt &Zeroable,
9696 const X86Subtarget &Subtarget,
9697 SelectionDAG &DAG) {
9698 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9700 uint64_t BlendMask = 0;
9701 bool ForceV1Zero = false, ForceV2Zero = false;
9702 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9706 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9708 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9710 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9712 switch (VT.SimpleTy) {
9717 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9718 DAG.getConstant(BlendMask, DL, MVT::i8));
9722 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9726 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9727 // that instruction.
9728 if (Subtarget.hasAVX2()) {
9729 // Scale the blend by the number of 32-bit dwords per element.
9730 int Scale = VT.getScalarSizeInBits() / 32;
9731 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
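// e.g. a v4i64 blend mask 0b0101 becomes the v8i32 VPBLENDD mask 0b00110011.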
9732 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9733 V1 = DAG.getBitcast(BlendVT, V1);
9734 V2 = DAG.getBitcast(BlendVT, V2);
9735 return DAG.getBitcast(
9736 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9737 DAG.getConstant(BlendMask, DL, MVT::i8)));
9741 // For integer shuffles we need to expand the mask and cast the inputs to
9742 // v8i16s prior to blending.
9743 int Scale = 8 / VT.getVectorNumElements();
9744 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
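// e.g. a 4-element blend mask 0b0011 expands to the v8i16 PBLENDW mask 0b00001111.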
9745 V1 = DAG.getBitcast(MVT::v8i16, V1);
9746 V2 = DAG.getBitcast(MVT::v8i16, V2);
9747 return DAG.getBitcast(VT,
9748 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9749 DAG.getConstant(BlendMask, DL, MVT::i8)));
9753 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9754 SmallVector<int, 8> RepeatedMask;
9755 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9756 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9757 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9759 for (int i = 0; i < 8; ++i)
9760 if (RepeatedMask[i] >= 8)
9761 BlendMask |= 1ull << i;
9762 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9763 DAG.getConstant(BlendMask, DL, MVT::i8));
9769 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9770 "256-bit byte-blends require AVX2 support!");
9772 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9774 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9775 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9776 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9779 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9780 if (SDValue Masked =
9781 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9784 // Scale the blend by the number of bytes per element.
9785 int Scale = VT.getScalarSizeInBits() / 8;
9787 // This form of blend is always done on bytes. Compute the byte vector type.
9789 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9791 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9792 // mix of LLVM's code generator and the x86 backend. We tell the code
9793 // generator that boolean values in the elements of an x86 vector register
9794 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9795 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9796 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9797 // of the element (the remaining are ignored) and 0 in that high bit would
9798 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9799 // the LLVM model for boolean values in vector elements gets the relevant
9800 // bit set, it is set backwards and over-constrained relative to x86's actual semantics.
9802 SmallVector<SDValue, 32> VSELECTMask;
9803 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9804 for (int j = 0; j < Scale; ++j)
9805 VSELECTMask.push_back(
9806 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9807 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9810 V1 = DAG.getBitcast(BlendVT, V1);
9811 V2 = DAG.getBitcast(BlendVT, V2);
9812 return DAG.getBitcast(
9814 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9824 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9825 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9826 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9829 llvm_unreachable("Not a supported integer vector type!");
9833 /// Try to lower as a blend of elements from two inputs followed by
9834 /// a single-input permutation.
9836 /// This matches the pattern where we can blend elements from two inputs and
9837 /// then reduce the shuffle to a single-input permutation.
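/// For example, the v4i32 mask <2,7,0,5> becomes a blend with mask <0,5,2,7> followed by the single-input permute <2,3,0,1>.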
9838 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9839 SDValue V1, SDValue V2,
9841 SelectionDAG &DAG) {
9842 // We build up the blend mask while checking whether a blend is a viable way
9843 // to reduce the shuffle.
9844 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9845 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9847 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9851 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9853 if (BlendMask[Mask[i] % Size] < 0)
9854 BlendMask[Mask[i] % Size] = Mask[i];
9855 else if (BlendMask[Mask[i] % Size] != Mask[i])
9856 return SDValue(); // Can't blend in the needed input!
9858 PermuteMask[i] = Mask[i] % Size;
9861 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9862 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9865 /// Generic routine to decompose a shuffle and blend into independent
9866 /// blends and permutes.
9868 /// This matches the extremely common pattern for handling combined
9869 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9870 /// operations. It will try to pick the best arrangement of shuffles and blends.
9872 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9876 SelectionDAG &DAG) {
9877 // Shuffle the input elements into the desired positions in V1 and V2 and
9878 // blend them together.
9879 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9880 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9881 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9882 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9883 if (Mask[i] >= 0 && Mask[i] < Size) {
9884 V1Mask[i] = Mask[i];
9886 } else if (Mask[i] >= Size) {
9887 V2Mask[i] = Mask[i] - Size;
9888 BlendMask[i] = i + Size;
9891 // Try to lower with the simpler initial blend strategy unless one of the
9892 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9893 // shuffle may be able to fold with a load or other benefit. However, when
9894 // we'll have to do 2x as many shuffles in order to achieve this, blending
9895 // first is a better strategy.
9896 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9897 if (SDValue BlendPerm =
9898 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9901 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9902 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9903 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9906 /// Try to lower a vector shuffle as a rotation.
9908 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9909 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9910 ArrayRef<int> Mask) {
9911 int NumElts = Mask.size();
9913 // We need to detect various ways of spelling a rotation:
9914 // [11, 12, 13, 14, 15, 0, 1, 2]
9915 // [-1, 12, 13, 14, -1, -1, 1, -1]
9916 // [-1, -1, -1, -1, -1, -1, 1, 2]
9917 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9918 // [-1, 4, 5, 6, -1, -1, 9, -1]
9919 // [-1, 4, 5, 6, -1, -1, -1, -1]
9922 for (int i = 0; i < NumElts; ++i) {
9924 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9925 "Unexpected mask index.");
9929 // Determine where a rotated vector would have started.
9930 int StartIdx = i - (M % NumElts);
9932 // The identity rotation isn't interesting, stop.
9935 // If we found the tail of a vector the rotation must be the missing
9936 // front. If we found the head of a vector, it must be how much of the head.
9938 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
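// e.g. with NumElts == 8, mask element 11 (or 3) at position 0 gives StartIdx == -3 and a candidate rotation of 3.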
9941 Rotation = CandidateRotation;
9942 else if (Rotation != CandidateRotation)
9943 // The rotations don't match, so we can't match this mask.
9946 // Compute which value this mask is pointing at.
9947 SDValue MaskV = M < NumElts ? V1 : V2;
9949 // Compute which of the two target values this index should be assigned
9950 // to. This reflects whether the high elements are remaining or the low
9951 // elements are remaining.
9952 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9954 // Either set up this value if we've not encountered it before, or check
9955 // that it remains consistent.
9958 else if (TargetV != MaskV)
9959 // This may be a rotation, but it pulls from the inputs in some
9960 // unsupported interleaving.
9964 // Check that we successfully analyzed the mask, and normalize the results.
9965 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9966 assert((Lo || Hi) && "Failed to find a rotated input vector!");
9978 /// Try to lower a vector shuffle as a byte rotation.
9980 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9981 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9982 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
9983 /// try to generically lower a vector shuffle through such a pattern. It
9984 /// does not check for the profitability of lowering either as PALIGNR or
9985 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9986 /// This matches shuffle vectors that look like:
9988 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9990 /// Essentially it concatenates V1 and V2, shifts right by some number of
9991 /// elements, and takes the low elements as the result. Note that while this is
9992 /// specified as a *right shift* because x86 is little-endian, it is a *left
9993 /// rotate* of the vector lanes.
9994 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9995 ArrayRef<int> Mask) {
9996 // Don't accept any shuffles with zero elements.
9997 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
10000 // PALIGNR works on 128-bit lanes.
10001 SmallVector<int, 16> RepeatedMask;
10002 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
10005 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
10009 // PALIGNR rotates bytes, so we need to scale the
10010 // rotation based on how many bytes are in the vector lane.
10011 int NumElts = RepeatedMask.size();
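// e.g. a v8i16 rotation of 3 elements scales to a byte rotation of 6.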
10012 int Scale = 16 / NumElts;
10013 return Rotation * Scale;
10016 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
10017 SDValue V1, SDValue V2,
10018 ArrayRef<int> Mask,
10019 const X86Subtarget &Subtarget,
10020 SelectionDAG &DAG) {
10021 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
10023 SDValue Lo = V1, Hi = V2;
10024 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
10025 if (ByteRotation <= 0)
10028 // Cast the inputs to i8 vector of correct length to match PALIGNR or
10030 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10031 Lo = DAG.getBitcast(ByteVT, Lo);
10032 Hi = DAG.getBitcast(ByteVT, Hi);
10034 // SSSE3 targets can use the palignr instruction.
10035 if (Subtarget.hasSSSE3()) {
10036 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
10037 "512-bit PALIGNR requires BWI instructions");
10038 return DAG.getBitcast(
10039 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
10040 DAG.getConstant(ByteRotation, DL, MVT::i8)));
10043 assert(VT.is128BitVector() &&
10044 "Rotate-based lowering only supports 128-bit lowering!");
10045 assert(Mask.size() <= 16 &&
10046 "Can shuffle at most 16 bytes in a 128-bit vector!");
10047 assert(ByteVT == MVT::v16i8 &&
10048 "SSE2 rotate lowering only needed for v16i8!");
10050 // Default SSE2 implementation
10051 int LoByteShift = 16 - ByteRotation;
10052 int HiByteShift = ByteRotation;
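// Emulate PALIGNR by shifting the low input left, the high input right, and ORing the two partial results together.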
10054 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
10055 DAG.getConstant(LoByteShift, DL, MVT::i8));
10056 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
10057 DAG.getConstant(HiByteShift, DL, MVT::i8));
10058 return DAG.getBitcast(VT,
10059 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
10062 /// Try to lower a vector shuffle as a dword/qword rotation.
10064 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
10065 /// rotation of the concatenation of two vectors; this routine will
10066 /// try to generically lower a vector shuffle through such a pattern.
10068 /// Essentially it concatenates V1 and V2, shifts right by some number of
10069 /// elements, and takes the low elements as the result. Note that while this is
10070 /// specified as a *right shift* because x86 is little-endian, it is a *left
10071 /// rotate* of the vector lanes.
10072 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
10073 SDValue V1, SDValue V2,
10074 ArrayRef<int> Mask,
10075 const X86Subtarget &Subtarget,
10076 SelectionDAG &DAG) {
10077 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
10078 "Only 32-bit and 64-bit elements are supported!");
10080 // 128/256-bit vectors are only supported with VLX.
10081 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
10082 && "VLX required for 128/256-bit vectors");
10084 SDValue Lo = V1, Hi = V2;
10085 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
10089 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
10090 DAG.getConstant(Rotation, DL, MVT::i8));
10093 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
10095 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
10096 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
10097 /// matches elements from one of the input vectors shuffled to the left or
10098 /// right with zeroable elements 'shifted in'. It handles both the strictly
10099 /// bit-wise element shifts and the byte shift across an entire 128-bit double
10100 /// quad word lane.
10102 /// PSLL : (little-endian) left bit shift.
10103 /// [ zz, 0, zz, 2 ]
10104 /// [ -1, 4, zz, -1 ]
10105 /// PSRL : (little-endian) right bit shift.
10106 /// [ 1, zz, 3, zz]
10107 /// [ -1, -1, 7, zz]
10108 /// PSLLDQ : (little-endian) left byte shift
10109 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
10110 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
10111 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
10112 /// PSRLDQ : (little-endian) right byte shift
10113 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
10114 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
10115 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
10116 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
10117 unsigned ScalarSizeInBits,
10118 ArrayRef<int> Mask, int MaskOffset,
10119 const APInt &Zeroable,
10120 const X86Subtarget &Subtarget) {
10121 int Size = Mask.size();
10122 unsigned SizeInBits = Size * ScalarSizeInBits;
10124 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
10125 for (int i = 0; i < Size; i += Scale)
10126 for (int j = 0; j < Shift; ++j)
10127 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
10133 auto MatchShift = [&](int Shift, int Scale, bool Left) {
10134 for (int i = 0; i != Size; i += Scale) {
10135 unsigned Pos = Left ? i + Shift : i;
10136 unsigned Low = Left ? i : i + Shift;
10137 unsigned Len = Scale - Shift;
10138 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
10142 int ShiftEltBits = ScalarSizeInBits * Scale;
10143 bool ByteShift = ShiftEltBits > 64;
10144 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
10145 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
10146 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
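// Per-element shifts (VSHLI/VSRLI) take the amount in bits; the whole-lane byte shifts (VSHLDQ/VSRLDQ) take it in bytes.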
10148 // Normalize the scale for byte shifts to still produce an i64 element
10150 Scale = ByteShift ? Scale / 2 : Scale;
10152 // We need to round trip through the appropriate type for the shift.
10153 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10154 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10155 : MVT::getVectorVT(ShiftSVT, Size / Scale);
10156 return (int)ShiftAmt;
10159 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10160 // keep doubling the size of the integer elements up to that. We can
10161 // then shift the elements of the integer vector by whole multiples of
10162 // their width within the elements of the larger integer vector. Test each
10163 // multiple to see if we can find a match with the moved element indices
10164 // and that the shifted in elements are all zeroable.
10165 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
10166 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
10167 for (int Shift = 1; Shift != Scale; ++Shift)
10168 for (bool Left : {true, false})
10169 if (CheckZeros(Shift, Scale, Left)) {
10170 int ShiftAmt = MatchShift(Shift, Scale, Left);
10179 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10180 SDValue V2, ArrayRef<int> Mask,
10181 const APInt &Zeroable,
10182 const X86Subtarget &Subtarget,
10183 SelectionDAG &DAG) {
10184 int Size = Mask.size();
10185 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10191 // Try to match shuffle against V1 shift.
10192 int ShiftAmt = matchVectorShuffleAsShift(
10193 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
10195 // If V1 failed, try to match shuffle against V2 shift.
10196 if (ShiftAmt < 0) {
10198 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
10199 Mask, Size, Zeroable, Subtarget);
10206 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
10207 "Illegal integer vector type");
10208 V = DAG.getBitcast(ShiftVT, V);
10209 V = DAG.getNode(Opcode, DL, ShiftVT, V,
10210 DAG.getConstant(ShiftAmt, DL, MVT::i8));
10211 return DAG.getBitcast(VT, V);
10214 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
10215 // Remainder of lower half result is zero and upper half is all undef.
10216 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
10217 ArrayRef<int> Mask, uint64_t &BitLen,
10218 uint64_t &BitIdx, const APInt &Zeroable) {
10219 int Size = Mask.size();
10220 int HalfSize = Size / 2;
10221 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10222 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
10224 // Upper half must be undefined.
10225 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10228 // Determine the extraction length from the part of the
10229 // lower half that isn't zeroable.
10230 int Len = HalfSize;
10231 for (; Len > 0; --Len)
10232 if (!Zeroable[Len - 1])
10234 assert(Len > 0 && "Zeroable shuffle mask");
10236 // Attempt to match first Len sequential elements from the lower half.
10239 for (int i = 0; i != Len; ++i) {
10241 if (M == SM_SentinelUndef)
10243 SDValue &V = (M < Size ? V1 : V2);
10246 // The extracted elements must start at a valid index and all mask
10247 // elements must be in the lower half.
10248 if (i > M || M >= HalfSize)
10251 if (Idx < 0 || (Src == V && Idx == (M - i))) {
10259 if (!Src || Idx < 0)
10262 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
10263 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10264 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10269 // INSERTQ: Extract lowest Len elements from lower half of second source and
10270 // insert over first source, starting at Idx.
10271 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
10272 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
10273 ArrayRef<int> Mask, uint64_t &BitLen,
10274 uint64_t &BitIdx) {
10275 int Size = Mask.size();
10276 int HalfSize = Size / 2;
10277 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10279 // Upper half must be undefined.
10280 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10283 for (int Idx = 0; Idx != HalfSize; ++Idx) {
10286 // Attempt to match first source from mask before insertion point.
10287 if (isUndefInRange(Mask, 0, Idx)) {
10289 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
10291 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
10297 // Extend the extraction length looking to match both the insertion of
10298 // the second source and the remaining elements of the first.
10299 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
10301 int Len = Hi - Idx;
10303 // Match insertion.
10304 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
10306 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
10312 // Match the remaining elements of the lower half.
10313 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
10315 } else if ((!Base || (Base == V1)) &&
10316 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
10318 } else if ((!Base || (Base == V2)) &&
10319 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
10326 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10327 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10337 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
10338 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
10339 SDValue V2, ArrayRef<int> Mask,
10340 const APInt &Zeroable,
10341 SelectionDAG &DAG) {
10342 uint64_t BitLen, BitIdx;
10343 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
10344 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
10345 DAG.getConstant(BitLen, DL, MVT::i8),
10346 DAG.getConstant(BitIdx, DL, MVT::i8));
10348 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
10349 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
10350 V2 ? V2 : DAG.getUNDEF(VT),
10351 DAG.getConstant(BitLen, DL, MVT::i8),
10352 DAG.getConstant(BitIdx, DL, MVT::i8));
10357 /// Lower a vector shuffle as a zero or any extension.
10359 /// Given a specific number of elements, element bit width, and extension
10360 /// stride, produce either a zero or any extension based on the available
10361 /// features of the subtarget. The extended elements are consecutive and
10362 /// can start from an offset element index in the input; to avoid excess
10363 /// shuffling, the offset must either be in the bottom lane or at the start
10364 /// of a higher lane. All extended elements must come from the same input vector.
10366 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10367 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
10368 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10369 assert(Scale > 1 && "Need a scale to extend.");
10370 int EltBits = VT.getScalarSizeInBits();
10371 int NumElements = VT.getVectorNumElements();
10372 int NumEltsPerLane = 128 / EltBits;
10373 int OffsetLane = Offset / NumEltsPerLane;
10374 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
10375 "Only 8, 16, and 32 bit elements can be extended.");
10376 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
10377 assert(0 <= Offset && "Extension offset must be positive.");
10378 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
10379 "Extension offset must be in the first lane or start an upper lane.");
10381 // Check that an index is in the same lane as the base offset.
10382 auto SafeOffset = [&](int Idx) {
10383 return OffsetLane == (Idx / NumEltsPerLane);
10386 // Shift along an input so that the offset base moves to the first element.
10387 auto ShuffleOffset = [&](SDValue V) {
10391 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10392 for (int i = 0; i * Scale < NumElements; ++i) {
10393 int SrcIdx = i + Offset;
10394 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10396 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10399 // Found a valid zext mask! Try various lowering strategies based on the
10400 // input type and available ISA extensions.
10401 if (Subtarget.hasSSE41()) {
10402 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10403 // PUNPCK will catch this in a later shuffle match.
10404 if (Offset && Scale == 2 && VT.is128BitVector())
10406 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10407 NumElements / Scale);
10408 InputV = ShuffleOffset(InputV);
10409 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10410 return DAG.getBitcast(VT, InputV);
10413 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10415 // For any extends we can cheat for larger element sizes and use shuffle
10416 // instructions that can fold with a load and/or copy.
10417 if (AnyExt && EltBits == 32) {
10418 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10420 return DAG.getBitcast(
10421 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10422 DAG.getBitcast(MVT::v4i32, InputV),
10423 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10425 if (AnyExt && EltBits == 16 && Scale > 2) {
10426 int PSHUFDMask[4] = {Offset / 2, -1,
10427 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10428 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10429 DAG.getBitcast(MVT::v4i32, InputV),
10430 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10431 int PSHUFWMask[4] = {1, -1, -1, -1};
10432 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10433 return DAG.getBitcast(
10434 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10435 DAG.getBitcast(MVT::v8i16, InputV),
10436 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10439 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
10441 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10442 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10443 assert(VT.is128BitVector() && "Unexpected vector width!");
10445 int LoIdx = Offset * EltBits;
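// EXTRQI takes the field length and start position as bit counts.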
10446 SDValue Lo = DAG.getBitcast(
10447 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10448 DAG.getConstant(EltBits, DL, MVT::i8),
10449 DAG.getConstant(LoIdx, DL, MVT::i8)));
10451 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10452 !SafeOffset(Offset + 1))
10453 return DAG.getBitcast(VT, Lo);
10455 int HiIdx = (Offset + 1) * EltBits;
10456 SDValue Hi = DAG.getBitcast(
10457 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10458 DAG.getConstant(EltBits, DL, MVT::i8),
10459 DAG.getConstant(HiIdx, DL, MVT::i8)));
10460 return DAG.getBitcast(VT,
10461 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10464 // If this would require more than 2 unpack instructions to expand, use
10465 // pshufb when available. We can only use more than 2 unpack instructions
10466 // when zero extending i8 elements which also makes it easier to use pshufb.
10467 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10468 assert(NumElements == 16 && "Unexpected byte vector width!");
10469 SDValue PSHUFBMask[16];
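// A PSHUFB control byte with its high bit set (0x80) zeroes the destination byte, which provides the zero extension for free.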
10470 for (int i = 0; i < 16; ++i) {
10471 int Idx = Offset + (i / Scale);
10472 PSHUFBMask[i] = DAG.getConstant(
10473 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10475 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10476 return DAG.getBitcast(
10477 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10478 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10481 // If we are extending from an offset, ensure we start on a boundary that
10482 // we can unpack from.
10483 int AlignToUnpack = Offset % (NumElements / Scale);
10484 if (AlignToUnpack) {
10485 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10486 for (int i = AlignToUnpack; i < NumElements; ++i)
10487 ShMask[i - AlignToUnpack] = i;
10488 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10489 Offset -= AlignToUnpack;
10492 // Otherwise emit a sequence of unpacks.
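// Each unpack interleaves with zeros (or undef for an any-extend), doubling the element width, until the requested scale is reached.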
10494 unsigned UnpackLoHi = X86ISD::UNPCKL;
10495 if (Offset >= (NumElements / 2)) {
10496 UnpackLoHi = X86ISD::UNPCKH;
10497 Offset -= (NumElements / 2);
10500 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10501 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10502 : getZeroVector(InputVT, Subtarget, DAG, DL);
10503 InputV = DAG.getBitcast(InputVT, InputV);
10504 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
10508 } while (Scale > 1);
10509 return DAG.getBitcast(VT, InputV);
10512 /// Try to lower a vector shuffle as a zero extension on any microarch.
10514 /// This routine will try to do everything in its power to cleverly lower
10515 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10516 /// check for the profitability of this lowering, it tries to aggressively
10517 /// match this pattern. It will use all of the micro-architectural details it
10518 /// can to emit an efficient lowering. It handles both blends with all-zero
10519 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10520 /// masking out later).
10522 /// The reason we have dedicated lowering for zext-style shuffles is that they
10523 /// are both incredibly common and often quite performance sensitive.
10524 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10525 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10526 const APInt &Zeroable, const X86Subtarget &Subtarget,
10527 SelectionDAG &DAG) {
10528 int Bits = VT.getSizeInBits();
10529 int NumLanes = Bits / 128;
10530 int NumElements = VT.getVectorNumElements();
10531 int NumEltsPerLane = NumElements / NumLanes;
10532 assert(VT.getScalarSizeInBits() <= 32 &&
10533 "Exceeds 32-bit integer zero extension limit");
10534 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10536 // Define a helper function to check a particular ext-scale and lower to it if
10538 auto Lower = [&](int Scale) -> SDValue {
10540 bool AnyExt = true;
10543 for (int i = 0; i < NumElements; ++i) {
10546 continue; // Valid anywhere but doesn't tell us anything.
10547 if (i % Scale != 0) {
10548 // Each of the extended elements need to be zeroable.
10552 // We no longer are in the anyext case.
10557 // Each of the base elements needs to be consecutive indices into the
10558 // same input vector.
10559 SDValue V = M < NumElements ? V1 : V2;
10560 M = M % NumElements;
10563 Offset = M - (i / Scale);
10564 } else if (InputV != V)
10565 return SDValue(); // Flip-flopping inputs.
10567 // Offset must start in the lowest 128-bit lane or at the start of an
10569 // FIXME: Is it ever worth allowing a negative base offset?
10570 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10571 (Offset % NumEltsPerLane) == 0))
10574 // If we are offsetting, all referenced entries must come from the same
10576 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10579 if ((M % NumElements) != (Offset + (i / Scale)))
10580 return SDValue(); // Non-consecutive strided elements.
10584 // If we fail to find an input, we have a zero-shuffle which should always
10585 // have already been handled.
10586 // FIXME: Maybe handle this here in case during blending we end up with one?
10590 // If we are offsetting, don't extend if we only match a single input, we
10591 // can always do better by using a basic PSHUF or PUNPCK.
10592 if (Offset != 0 && Matches < 2)
10595 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10596 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10599 // The widest scale possible for extending is to a 64-bit integer.
10600 assert(Bits % 64 == 0 &&
10601 "The number of bits in a vector must be divisible by 64 on x86!");
10602 int NumExtElements = Bits / 64;
10604 // Each iteration, try extending the elements half as much, but into twice as
10606 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10607 assert(NumElements % NumExtElements == 0 &&
10608 "The input vector size must be divisible by the extended size.");
10609 if (SDValue V = Lower(NumElements / NumExtElements))
10613 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10617 // Returns one of the source operands if the shuffle can be reduced to a
10618 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10619 auto CanZExtLowHalf = [&]() {
10620 for (int i = NumElements / 2; i != NumElements; ++i)
10623 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10625 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10630 if (SDValue V = CanZExtLowHalf()) {
10631 V = DAG.getBitcast(MVT::v2i64, V);
10632 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10633 return DAG.getBitcast(VT, V);
10636 // No viable ext lowering found.
10640 /// Try to get a scalar value for a specific element of a vector.
10642 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10643 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10644 SelectionDAG &DAG) {
10645 MVT VT = V.getSimpleValueType();
10646 MVT EltVT = VT.getVectorElementType();
10647 V = peekThroughBitcasts(V);
10649 // If the bitcasts shift the element size, we can't extract an equivalent
10650 // element from it.
10651 MVT NewVT = V.getSimpleValueType();
10652 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10655 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10656 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10657 // Ensure the scalar operand is the same size as the destination.
10658 // FIXME: Add support for scalar truncation where possible.
10659 SDValue S = V.getOperand(Idx);
10660 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10661 return DAG.getBitcast(EltVT, S);
10667 /// Helper to test for a load that can be folded with x86 shuffles.
10669 /// This is particularly important because the set of instructions varies
10670 /// significantly based on whether the operand is a load or not.
10671 static bool isShuffleFoldableLoad(SDValue V) {
10672 V = peekThroughBitcasts(V);
10673 return ISD::isNON_EXTLoad(V.getNode());
10676 /// Try to lower insertion of a single element into a zero vector.
10678 /// This is a common pattern for which we have especially efficient lowerings
10679 /// across all subtarget feature sets.
10680 static SDValue lowerVectorShuffleAsElementInsertion(
10681 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10682 const APInt &Zeroable, const X86Subtarget &Subtarget,
10683 SelectionDAG &DAG) {
10685 MVT EltVT = VT.getVectorElementType();
10688 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
10690 bool IsV1Zeroable = true;
10691 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10692 if (i != V2Index && !Zeroable[i]) {
10693 IsV1Zeroable = false;
10697 // Check for a single input from a SCALAR_TO_VECTOR node.
10698 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10699 // all the smarts here sunk into that routine. However, the current
10700 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10701 // vector shuffle lowering is dead.
10702 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10704 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10705 // We need to zext the scalar if it is smaller than an i32.
10706 V2S = DAG.getBitcast(EltVT, V2S);
10707 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10708 // Using zext to expand a narrow element won't work for non-zero insertions.
10713 // Zero-extend directly to i32.
10714 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10715 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10717 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10718 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10719 EltVT == MVT::i16) {
10720 // Either not inserting from the low element of the input or the input
10721 // element size is too small to use VZEXT_MOVL to clear the high bits.
10725 if (!IsV1Zeroable) {
10726 // If V1 can't be treated as a zero vector we have fewer options to lower
10727 // this. We can't support integer vectors or non-zero targets cheaply, and
10728 // the V1 elements can't be permuted in any way.
10729 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10730 if (!VT.isFloatingPoint() || V2Index != 0)
10732 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10733 V1Mask[V2Index] = -1;
10734 if (!isNoopShuffleMask(V1Mask))
10736 if (!VT.is128BitVector())
10739 // Otherwise, use MOVSD or MOVSS.
10740 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10741 "Only two types of floating point element types to handle!");
10742 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10746 // This lowering only works for the low element with floating point vectors.
10747 if (VT.isFloatingPoint() && V2Index != 0)
10750 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10752 V2 = DAG.getBitcast(VT, V2);
10754 if (V2Index != 0) {
10755 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10756 // the desired position. Otherwise it is more efficient to do a vector
10757 // shift left. We know that we can do a vector shift left because all
10758 // the inputs are zero.
10759 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10760 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10761 V2Shuffle[V2Index] = 0;
10762 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10764 V2 = DAG.getBitcast(MVT::v16i8, V2);
10766 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10767 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
10768 V2 = DAG.getBitcast(VT, V2);
10774 /// Try to lower a broadcast of a single (truncated) integer element,
10775 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10777 /// This assumes we have AVX2.
10778 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10779 SDValue V0, int BroadcastIdx,
10780 const X86Subtarget &Subtarget,
10781 SelectionDAG &DAG) {
10782 assert(Subtarget.hasAVX2() &&
10783 "We can only lower integer broadcasts with AVX2!");
10785 EVT EltVT = VT.getVectorElementType();
10786 EVT V0VT = V0.getValueType();
10788 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10789 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10791 EVT V0EltVT = V0VT.getVectorElementType();
10792 if (!V0EltVT.isInteger())
10795 const unsigned EltSize = EltVT.getSizeInBits();
10796 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10798 // This is only a truncation if the original element type is larger.
10799 if (V0EltSize <= EltSize)
10802 assert(((V0EltSize % EltSize) == 0) &&
10803 "Scalar type sizes must all be powers of 2 on x86!");
10805 const unsigned V0Opc = V0.getOpcode();
10806 const unsigned Scale = V0EltSize / EltSize;
10807 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
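// Map the index of the narrow broadcast element onto the wider source element that contains it.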
10809 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10810 V0Opc != ISD::BUILD_VECTOR)
10813 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10815 // If we're extracting non-least-significant bits, shift so we can truncate.
10816 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10817 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10818 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10819 if (const int OffsetIdx = BroadcastIdx % Scale)
10820 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10821 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
10823 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10824 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10827 /// Try to lower broadcast of a single element.
10829 /// For convenience, this code also bundles all of the subtarget feature set
10830 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10831 /// a convenient way to factor it out.
10832 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10833 SDValue V1, SDValue V2,
10834 ArrayRef<int> Mask,
10835 const X86Subtarget &Subtarget,
10836 SelectionDAG &DAG) {
10837 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10838 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10839 (Subtarget.hasAVX2() && VT.isInteger())))
10842 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10843 // we can only broadcast from a register with AVX2.
10844 unsigned NumElts = Mask.size();
10845 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10847 : X86ISD::VBROADCAST;
10848 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10850 // Check that the mask is a broadcast.
10851 int BroadcastIdx = -1;
10852 for (int i = 0; i != (int)NumElts; ++i) {
10853 SmallVector<int, 8> BroadcastMask(NumElts, i);
10854 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10860 if (BroadcastIdx < 0)
10862 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10863 "a sorted mask where the broadcast "
10866 // Go up the chain of (vector) values to find a scalar load that we can
10867 // combine with the broadcast.
10870 switch (V.getOpcode()) {
10871 case ISD::BITCAST: {
10872 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10873 SDValue VSrc = V.getOperand(0);
10874 unsigned NumEltBits = V.getScalarValueSizeInBits();
10875 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10876 if ((NumEltBits % NumSrcBits) == 0)
10877 BroadcastIdx *= (NumEltBits / NumSrcBits);
10878 else if ((NumSrcBits % NumEltBits) == 0 &&
10879 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10880 BroadcastIdx /= (NumSrcBits / NumEltBits);
10886 case ISD::CONCAT_VECTORS: {
10887 int OperandSize = Mask.size() / V.getNumOperands();
10888 V = V.getOperand(BroadcastIdx / OperandSize);
10889 BroadcastIdx %= OperandSize;
10892 case ISD::INSERT_SUBVECTOR: {
10893 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10894 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10898 int BeginIdx = (int)ConstantIdx->getZExtValue();
10900 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10901 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10902 BroadcastIdx -= BeginIdx;
10913 // Ensure the source vector and BroadcastIdx are for a suitable type.
10914 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10915 unsigned NumEltBits = VT.getScalarSizeInBits();
10916 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10917 if ((NumSrcBits % NumEltBits) == 0)
10918 BroadcastIdx *= (NumSrcBits / NumEltBits);
10919 else if ((NumEltBits % NumSrcBits) == 0 &&
10920 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10921 BroadcastIdx /= (NumEltBits / NumSrcBits);
10925 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10926 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10927 V = DAG.getBitcast(SrcVT, V);
10930 // Check if this is a broadcast of a scalar. We special case lowering
10931 // for scalars so that we can more effectively fold with loads.
10932 // First, look through bitcast: if the original value has a larger element
10933 // type than the shuffle, the broadcast element is in essence truncated.
10934 // Make that explicit to ease folding.
10935 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10936 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10937 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10938 return TruncBroadcast;
10940 MVT BroadcastVT = VT;
10942 // Peek through any bitcast (only useful for loads).
10943 SDValue BC = peekThroughBitcasts(V);
10945 // Also check the simpler case, where we can directly reuse the scalar.
10946 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10947 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10948 V = V.getOperand(BroadcastIdx);
10950 // If we can't broadcast from a register, check that the input is a load.
10951 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10953 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10954 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10955 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10956 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10957 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10962 // If we are broadcasting a load that is only used by the shuffle
10963 // then we can reduce the vector load to the broadcasted scalar load.
10964 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10965 SDValue BaseAddr = Ld->getOperand(1);
10966 EVT SVT = BroadcastVT.getScalarType();
10967 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
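// Narrow the whole-vector load down to a scalar load of just the broadcast element and broadcast from that.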
10968 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10969 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10970 DAG.getMachineFunction().getMachineMemOperand(
10971 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10972 DAG.makeEquivalentMemoryOrdering(Ld, V);
10973 } else if (!BroadcastFromReg) {
10974 // We can't broadcast from a vector register.
10976 } else if (BroadcastIdx != 0) {
10977 // We can only broadcast from the zero-element of a vector register,
10978 // but it can be advantageous to broadcast from the zero-element of a subvector.
10980 if (!VT.is256BitVector() && !VT.is512BitVector())
10983 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10984 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10987 // Only broadcast the zero-element of a 128-bit subvector.
10988 unsigned EltSize = VT.getScalarSizeInBits();
10989 if (((BroadcastIdx * EltSize) % 128) != 0)
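// Illustrative example (editor addition): with v8i32 the element size is 32
// bits, so only BroadcastIdx 0 or 4 satisfies (BroadcastIdx * 32) % 128 == 0;
// an index such as 2 sits in the middle of a 128-bit subvector and is
// rejected here.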
10992 // The shuffle input might have been a bitcast we looked through; look at
10993 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10994 // later bitcast it to BroadcastVT.
10995 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10996 "Unexpected vector element size");
10997 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
10998 "Unexpected vector size");
10999 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
11002 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
11003 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
11004 DAG.getBitcast(MVT::f64, V));
11006 // Bitcast back to the same scalar type as BroadcastVT.
11007 MVT SrcVT = V.getSimpleValueType();
11008 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
11009 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11010 "Unexpected vector element size");
11011 if (SrcVT.isVector()) {
11012 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11013 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
11014 } else {
11015 SrcVT = BroadcastVT.getScalarType();
11016 }
11017 V = DAG.getBitcast(SrcVT, V);
11018 }
11020 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
11021 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
11022 V = DAG.getBitcast(MVT::f64, V);
11023 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
11024 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
11027 // We only support broadcasting from 128-bit vectors to minimize the
11028 // number of patterns we need to deal with in isel. So extract down to
11029 // 128-bits, removing as many bitcasts as possible.
11030 if (SrcVT.getSizeInBits() > 128) {
11031 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
11032 128 / SrcVT.getScalarSizeInBits());
11033 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
11034 V = DAG.getBitcast(ExtVT, V);
11037 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
11040 // Check for whether we can use INSERTPS to perform the shuffle. We only use
11041 // INSERTPS when the V1 elements are already in the correct locations
11042 // because otherwise we can just always use two SHUFPS instructions which
11043 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
11044 // perform INSERTPS if a single V1 element is out of place and all V2
11045 // elements are zeroable.
11046 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
11047 unsigned &InsertPSMask,
11048 const APInt &Zeroable,
11049 ArrayRef<int> Mask,
11050 SelectionDAG &DAG) {
11051 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
11052 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
11053 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11055 // Attempt to match INSERTPS with one element from VA or VB being
11056 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
11057 // will be updated.
11058 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
11059 ArrayRef<int> CandidateMask) {
11060 unsigned ZMask = 0;
11061 int VADstIndex = -1;
11062 int VBDstIndex = -1;
11063 bool VAUsedInPlace = false;
11065 for (int i = 0; i < 4; ++i) {
11066 // Synthesize a zero mask from the zeroable elements (includes undefs).
11072 // Flag if we use any VA inputs in place.
11073 if (i == CandidateMask[i]) {
11074 VAUsedInPlace = true;
11078 // We can only insert a single non-zeroable element.
11079 if (VADstIndex >= 0 || VBDstIndex >= 0)
11082 if (CandidateMask[i] < 4) {
11083 // VA input out of place for insertion.
11086 // VB input for insertion.
11091 // Don't bother if we have no (non-zeroable) element for insertion.
11092 if (VADstIndex < 0 && VBDstIndex < 0)
11095 // Determine element insertion src/dst indices. The src index is from the
11096 // start of the inserted vector, not the start of the concatenated vector.
11097 unsigned VBSrcIndex = 0;
11098 if (VADstIndex >= 0) {
11099 // If we have a VA input out of place, we use VA as the V2 element
11100 // insertion and don't use the original V2 at all.
11101 VBSrcIndex = CandidateMask[VADstIndex];
11102 VBDstIndex = VADstIndex;
11103 VB = VA;
11104 } else {
11105 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
11106 }
11108 // If no V1 inputs are used in place, then the result is created only from
11109 // the zero mask and the V2 insertion - so remove V1 dependency.
11110 if (!VAUsedInPlace)
11111 VA = DAG.getUNDEF(MVT::v4f32);
11113 // Update V1, V2 and InsertPSMask accordingly.
11114 V1 = VA;
11115 V2 = VB;
11117 // Insert the V2 element into the desired position.
11118 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
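// Editor's note, illustrating the INSERTPS immediate built above: bits [7:6]
// select the source element of V2, bits [5:4] the destination slot, and bits
// [3:0] are the zero mask. For example, inserting V2 element 2 into slot 1
// while zeroing slots 0 and 3 gives (2 << 6) | (1 << 4) | 0b1001 == 0x99.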
11119 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
11120 return true;
11121 };
11123 if (matchAsInsertPS(V1, V2, Mask))
11124 return true;
11126 // Commute and try again.
11127 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11128 ShuffleVectorSDNode::commuteMask(CommutedMask);
11129 if (matchAsInsertPS(V2, V1, CommutedMask))
11130 return true;
11132 return false;
11133 }
11135 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
11136 SDValue V2, ArrayRef<int> Mask,
11137 const APInt &Zeroable,
11138 SelectionDAG &DAG) {
11139 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11140 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11142 // Attempt to match the insertps pattern.
11143 unsigned InsertPSMask;
11144 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11145 return SDValue();
11147 // Insert the V2 element into the desired position.
11148 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11149 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11152 /// Try to lower a shuffle as a permute of the inputs followed by an
11153 /// UNPCK instruction.
11155 /// This specifically targets cases where we end up with alternating between
11156 /// the two inputs, and so can permute them into something that feeds a single
11157 /// UNPCK instruction. Note that this routine only targets integer vectors
11158 /// because for floating point vectors we have a generalized SHUFPS lowering
11159 /// strategy that handles everything that doesn't *exactly* match an unpack,
11160 /// making this clever lowering unnecessary.
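/// Illustrative example (editor addition): for a v4i32 mask <0, 4, 1, 5> the
/// two inputs already alternate, so a single UNPCKLDQ of V1 and V2 suffices;
/// a mask such as <0, 4, 2, 6> first needs each input permuted so that its
/// contributing elements occupy the low half, after which one unpack finishes
/// the shuffle.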
11161 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11162 SDValue V1, SDValue V2,
11163 ArrayRef<int> Mask,
11164 SelectionDAG &DAG) {
11165 assert(!VT.isFloatingPoint() &&
11166 "This routine only supports integer vectors.");
11167 assert(VT.is128BitVector() &&
11168 "This routine only works on 128-bit vectors.");
11169 assert(!V2.isUndef() &&
11170 "This routine should only be used when blending two inputs.");
11171 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11173 int Size = Mask.size();
11175 int NumLoInputs =
11176 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11177 int NumHiInputs =
11178 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11180 bool UnpackLo = NumLoInputs >= NumHiInputs;
11182 auto TryUnpack = [&](int ScalarSize, int Scale) {
11183 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11184 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11186 for (int i = 0; i < Size; ++i) {
11187 if (Mask[i] < 0)
11188 continue;
11190 // Each element of the unpack contains Scale elements from this mask.
11191 int UnpackIdx = i / Scale;
11193 // We only handle the case where V1 feeds the first slots of the unpack.
11194 // We rely on canonicalization to ensure this is the case.
11195 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11198 // Setup the mask for this input. The indexing is tricky as we have to
11199 // handle the unpack stride.
11200 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11201 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11202 Mask[i] % Size;
11203 }
11205 // If we will have to shuffle both inputs to use the unpack, check whether
11206 // we can just unpack first and shuffle the result. If so, skip this unpack.
11207 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11208 !isNoopShuffleMask(V2Mask))
11211 // Shuffle the inputs into place.
11212 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11213 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11215 // Cast the inputs to the type we will use to unpack them.
11216 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11217 V1 = DAG.getBitcast(UnpackVT, V1);
11218 V2 = DAG.getBitcast(UnpackVT, V2);
11220 // Unpack the inputs and cast the result back to the desired type.
11221 return DAG.getBitcast(
11222 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11223 UnpackVT, V1, V2));
11226 // We try each unpack from the largest to the smallest to try and find one
11227 // that fits this mask.
11228 int OrigScalarSize = VT.getScalarSizeInBits();
11229 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11230 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11231 return Unpack;
11233 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11234 // unpack followed by a permute.
11235 if (NumLoInputs == 0 || NumHiInputs == 0) {
11236 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11237 "We have to have *some* inputs!");
11238 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11240 // FIXME: We could consider the total complexity of the permute of each
11241 // possible unpacking. Or at the least we should consider how many
11242 // half-crossings are created.
11243 // FIXME: We could consider commuting the unpacks.
11245 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11246 for (int i = 0; i < Size; ++i) {
11247 if (Mask[i] < 0)
11248 continue;
11250 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11252 PermMask[i] =
11253 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11254 }
11255 return DAG.getVectorShuffle(
11256 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
11257 DL, VT, V1, V2),
11258 DAG.getUNDEF(VT), PermMask);
11264 /// Handle lowering of 2-lane 64-bit floating point shuffles.
11266 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
11267 /// support for floating point shuffles but not integer shuffles. These
11268 /// instructions will incur a domain crossing penalty on some chips though so
11269 /// it is better to avoid lowering through this for integer vectors where
11270 /// possible.
11271 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11272 const APInt &Zeroable,
11273 SDValue V1, SDValue V2,
11274 const X86Subtarget &Subtarget,
11275 SelectionDAG &DAG) {
11276 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11277 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11278 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11280 if (V2.isUndef()) {
11281 // Check for being able to broadcast a single element.
11282 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11283 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
11286 // Straight shuffle of a single input vector. Simulate this by using the
11287 // single input as both of the "inputs" to this instruction.
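// Illustrative example (editor addition): the single-input mask <1, 1> yields
// SHUFPDMask == 0b11 below, taking the high double for both result elements,
// while <0, 1> is the identity and yields 0b10.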
11288 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
11290 if (Subtarget.hasAVX()) {
11291 // If we have AVX, we can use VPERMILPS which will allow folding a load
11292 // into the shuffle.
11293 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
11294 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11297 return DAG.getNode(
11298 X86ISD::SHUFP, DL, MVT::v2f64,
11299 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11300 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11301 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11303 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
11304 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
11306 // If we have a single input, insert that into V1 if we can do so cheaply.
11307 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
11308 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11309 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11311 // Try inverting the insertion since for v2 masks it is easy to do and we
11312 // can't reliably sort the mask one way or the other.
11313 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
11314 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
11315 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11316 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11320 // Try to use one of the special instruction patterns to handle two common
11321 // blend patterns if a zero-blend above didn't work.
11322 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
11323 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
11324 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
11325 // We can either use a special instruction to load over the low double or
11326 // to move just the low double.
11327 return DAG.getNode(
11328 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
11329 DL, MVT::v2f64, V2,
11330 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
11332 if (Subtarget.hasSSE41())
11333 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
11334 Zeroable, Subtarget, DAG))
11337 // Use dedicated unpack instructions for masks that match their pattern.
11338 if (SDValue V =
11339 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
11340 return V;
11342 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
11343 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
11344 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11347 /// Handle lowering of 2-lane 64-bit integer shuffles.
11349 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
11350 /// the integer unit to minimize domain crossing penalties. However, for blends
11351 /// it falls back to the floating point shuffle operation with appropriate bit
11352 /// twiddling.
11353 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11354 const APInt &Zeroable,
11355 SDValue V1, SDValue V2,
11356 const X86Subtarget &Subtarget,
11357 SelectionDAG &DAG) {
11358 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11359 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11360 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11362 if (V2.isUndef()) {
11363 // Check for being able to broadcast a single element.
11364 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11365 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11368 // Straight shuffle of a single input vector. For everything from SSE2
11369 // onward this has a single fast instruction with no scary immediates.
11370 // We have to map the mask as it is actually a v4i32 shuffle instruction.
11371 V1 = DAG.getBitcast(MVT::v4i32, V1);
11372 int WidenedMask[4] = {
11373 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
11374 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
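// Illustrative example (editor addition): the v2i64 mask <1, 0> widens to the
// v4i32 PSHUFD mask <2, 3, 0, 1>, swapping the two 64-bit halves one pair of
// 32-bit elements at a time.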
11375 return DAG.getBitcast(
11377 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11378 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
11380 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
11381 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
11382 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11383 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11385 // Try to use shift instructions.
11386 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
11387 Zeroable, Subtarget, DAG))
11390 // When loading a scalar and then shuffling it into a vector we can often do
11391 // the insertion cheaply.
11392 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11393 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11395 // Try inverting the insertion since for v2 masks it is easy to do and we
11396 // can't reliably sort the mask one way or the other.
11397 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
11398 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11399 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11402 // We have different paths for blend lowering, but they all must use the
11403 // *exact* same predicate.
11404 bool IsBlendSupported = Subtarget.hasSSE41();
11405 if (IsBlendSupported)
11406 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11407 Zeroable, Subtarget, DAG))
11410 // Use dedicated unpack instructions for masks that match their pattern.
11411 if (SDValue V =
11412 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11413 return V;
11415 // Try to use byte rotation instructions.
11416 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11417 if (Subtarget.hasSSSE3()) {
11418 if (Subtarget.hasVLX())
11419 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11420 Mask, Subtarget, DAG))
11423 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11424 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11428 // If we have direct support for blends, we should lower by decomposing into
11429 // a permute. That will be faster than the domain cross.
11430 if (IsBlendSupported)
11431 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11432 Mask, DAG);
11434 // We implement this with SHUFPD which is pretty lame because it will likely
11435 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11436 // However, all the alternatives are still more cycles and newer chips don't
11437 // have this problem. It would be really nice if x86 had better shuffles here.
11438 V1 = DAG.getBitcast(MVT::v2f64, V1);
11439 V2 = DAG.getBitcast(MVT::v2f64, V2);
11440 return DAG.getBitcast(MVT::v2i64,
11441 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11444 /// Test whether this can be lowered with a single SHUFPS instruction.
11446 /// This is used to disable more specialized lowerings when the shufps lowering
11447 /// will happen to be efficient.
11448 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11449 // This routine only handles 128-bit shufps.
11450 assert(Mask.size() == 4 && "Unsupported mask size!");
11451 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11452 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11453 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11454 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11456 // To lower with a single SHUFPS we need to have the low half and high half
11457 // each requiring a single input.
11458 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11459 return false;
11460 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11461 return false;
11463 return true;
11464 }
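// Illustrative example (editor addition): <0, 1, 6, 7> keeps each half on a
// single source (low half from V1, high half from V2), so one SHUFPS can do
// it; <0, 5, 2, 7> mixes sources within the low half and fails this test.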
11466 /// Lower a vector shuffle using the SHUFPS instruction.
11468 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11469 /// It makes no assumptions about whether this is the *best* lowering, it simply
11470 /// uses it.
11471 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11472 ArrayRef<int> Mask, SDValue V1,
11473 SDValue V2, SelectionDAG &DAG) {
11474 SDValue LowV = V1, HighV = V2;
11475 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11477 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11479 if (NumV2Elements == 1) {
11480 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11482 // Compute the index adjacent to V2Index and in the same half by toggling
11484 int V2AdjIndex = V2Index ^ 1;
11486 if (Mask[V2AdjIndex] < 0) {
11487 // Handles all the cases where we have a single V2 element and an undef.
11488 // This will only ever happen in the high lanes because we commute the
11489 // vector otherwise.
11490 if (V2Index < 2)
11491 std::swap(LowV, HighV);
11492 NewMask[V2Index] -= 4;
11494 // Handle the case where the V2 element ends up adjacent to a V1 element.
11495 // To make this work, blend them together as the first step.
11496 int V1Index = V2AdjIndex;
11497 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11498 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11499 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11501 // Now proceed to reconstruct the final blend as we have the necessary
11502 // high or low half formed.
11503 if (V2Index < 2) {
11504 LowV = V2;
11505 HighV = V1;
11506 } else {
11507 HighV = V2;
11508 }
11509 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11510 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11512 } else if (NumV2Elements == 2) {
11513 if (Mask[0] < 4 && Mask[1] < 4) {
11514 // Handle the easy case where we have V1 in the low lanes and V2 in the
11515 // high lanes.
11516 NewMask[2] -= 4;
11517 NewMask[3] -= 4;
11518 } else if (Mask[2] < 4 && Mask[3] < 4) {
11519 // We also handle the reversed case because this utility may get called
11520 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11521 // arrange things in the right direction.
11522 NewMask[0] -= 4;
11523 NewMask[1] -= 4;
11524 HighV = V1;
11525 LowV = V2;
11526 } else {
11527 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11528 // trying to place elements directly, just blend them and set up the final
11529 // shuffle to place them.
11531 // The first two blend mask elements are for V1, the second two are for
11533 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11534 Mask[2] < 4 ? Mask[2] : Mask[3],
11535 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11536 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11537 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11538 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11540 // Now we do a normal shuffle of V1 by giving V1 as both operands to
11541 // the blend.
11542 LowV = HighV = V1;
11543 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11544 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11545 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11546 NewMask[3] = Mask[2] < 4 ? 3 : 1;
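// Illustrative example (editor addition): for the mask <0, 5, 2, 7> the blend
// above produces [V1[0], V1[2], V2[1], V2[3]] and NewMask becomes
// <0, 2, 1, 3>, so the final SHUFPS of that temporary with itself yields the
// requested <0, 5, 2, 7>.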
11549 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11550 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11553 /// Lower 4-lane 32-bit floating point shuffles.
11555 /// Uses instructions exclusively from the floating point unit to minimize
11556 /// domain crossing penalties, as these are sufficient to implement all v4f32
11557 /// shuffles.
11558 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11559 const APInt &Zeroable,
11560 SDValue V1, SDValue V2,
11561 const X86Subtarget &Subtarget,
11562 SelectionDAG &DAG) {
11563 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11564 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11565 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11567 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11569 if (NumV2Elements == 0) {
11570 // Check for being able to broadcast a single element.
11571 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11572 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11575 // Use even/odd duplicate instructions for masks that match their pattern.
11576 if (Subtarget.hasSSE3()) {
11577 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11578 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11579 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11580 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11583 if (Subtarget.hasAVX()) {
11584 // If we have AVX, we can use VPERMILPS which will allow folding a load
11585 // into the shuffle.
11586 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11587 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11590 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11591 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11592 if (!Subtarget.hasSSE2()) {
11593 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11594 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11595 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11596 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11599 // Otherwise, use a straight shuffle of a single input vector. We pass the
11600 // input vector to both operands to simulate this with a SHUFPS.
11601 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11602 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11605 // There are special ways we can lower some single-element blends. However, we
11606 // have custom ways we can lower more complex single-element blends below that
11607 // we defer to if both this and BLENDPS fail to match, so restrict this to
11608 // when the V2 input is targeting element 0 of the mask -- that is the fast
11609 // case here.
11610 if (NumV2Elements == 1 && Mask[0] >= 4)
11611 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11612 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11615 if (Subtarget.hasSSE41()) {
11616 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11617 Zeroable, Subtarget, DAG))
11620 // Use INSERTPS if we can complete the shuffle efficiently.
11621 if (SDValue V =
11622 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11623 return V;
11625 if (!isSingleSHUFPSMask(Mask))
11626 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11627 DL, MVT::v4f32, V1, V2, Mask, DAG))
11631 // Use low/high mov instructions. These are only valid in SSE1 because
11632 // otherwise they are widened to v2f64 and never get here.
11633 if (!Subtarget.hasSSE2()) {
11634 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11635 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11636 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11637 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11640 // Use dedicated unpack instructions for masks that match their pattern.
11641 if (SDValue V =
11642 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11643 return V;
11645 // Otherwise fall back to a SHUFPS lowering strategy.
11646 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11649 /// Lower 4-lane i32 vector shuffles.
11651 /// We try to handle these with integer-domain shuffles where we can, but for
11652 /// blends we use the floating point domain blend instructions.
11653 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11654 const APInt &Zeroable,
11655 SDValue V1, SDValue V2,
11656 const X86Subtarget &Subtarget,
11657 SelectionDAG &DAG) {
11658 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11659 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11660 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11662 // Whenever we can lower this as a zext, that instruction is strictly faster
11663 // than any alternative. It also allows us to fold memory operands into the
11664 // shuffle in many cases.
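// Illustrative example (editor addition): a v4i32 mask <0, Z, 1, Z>, where Z
// marks zeroable lanes, is exactly a zero-extension of the two low 32-bit
// elements to 64 bits and can be emitted as a single PMOVZXDQ-style node.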
11665 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11666 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11669 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11671 if (NumV2Elements == 0) {
11672 // Check for being able to broadcast a single element.
11673 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11674 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11677 // Straight shuffle of a single input vector. For everything from SSE2
11678 // onward this has a single fast instruction with no scary immediates.
11679 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11680 // but we aren't actually going to use the UNPCK instruction because doing
11681 // so prevents folding a load into this instruction or making a copy.
11682 const int UnpackLoMask[] = {0, 0, 1, 1};
11683 const int UnpackHiMask[] = {2, 2, 3, 3};
11684 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11685 Mask = UnpackLoMask;
11686 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11687 Mask = UnpackHiMask;
11689 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11690 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11693 // Try to use shift instructions.
11694 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11695 Zeroable, Subtarget, DAG))
11698 // There are special ways we can lower some single-element blends.
11699 if (NumV2Elements == 1)
11700 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11701 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11704 // We have different paths for blend lowering, but they all must use the
11705 // *exact* same predicate.
11706 bool IsBlendSupported = Subtarget.hasSSE41();
11707 if (IsBlendSupported)
11708 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11709 Zeroable, Subtarget, DAG))
11712 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11713 Zeroable, DAG))
11714 return Masked;
11716 // Use dedicated unpack instructions for masks that match their pattern.
11717 if (SDValue V =
11718 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11719 return V;
11721 // Try to use byte rotation instructions.
11722 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11723 if (Subtarget.hasSSSE3()) {
11724 if (Subtarget.hasVLX())
11725 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11726 Mask, Subtarget, DAG))
11729 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11730 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11734 // Assume that a single SHUFPS is faster than an alternative sequence of
11735 // multiple instructions (even if the CPU has a domain penalty).
11736 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11737 if (!isSingleSHUFPSMask(Mask)) {
11738 // If we have direct support for blends, we should lower by decomposing into
11739 // a permute. That will be faster than the domain cross.
11740 if (IsBlendSupported)
11741 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11742 Mask, DAG);
11744 // Try to lower by permuting the inputs into an unpack instruction.
11745 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11746 DL, MVT::v4i32, V1, V2, Mask, DAG))
11747 return Unpack;
11748 }
11750 // We implement this with SHUFPS because it can blend from two vectors.
11751 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11752 // up the inputs, bypassing domain shift penalties that we would incur if we
11753 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11754 // relevant.
11755 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11756 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11757 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11758 return DAG.getBitcast(MVT::v4i32, ShufPS);
11761 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11762 /// shuffle lowering, and the most complex part.
11764 /// The lowering strategy is to try to form pairs of input lanes which are
11765 /// targeted at the same half of the final vector, and then use a dword shuffle
11766 /// to place them onto the right half, and finally unpack the paired lanes into
11767 /// their final position.
11769 /// The exact breakdown of how to form these dword pairs and align them on the
11770 /// correct sides is really tricky. See the comments within the function for
11771 /// more of the details.
11773 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11774 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11775 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11776 /// vector, form the analogous 128-bit 8-element Mask.
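/// Illustrative example (editor addition): for the v8i16 mask
/// <2, 3, 6, 7, 0, 1, 4, 5> the words heading to each half already form whole
/// dwords, so a single PSHUFD with mask <1, 3, 0, 2> completes the shuffle; a
/// mask like <0, 2, 4, 6, 1, 3, 5, 7> first needs PSHUFLW/PSHUFHW to pair up
/// words within each half before the dword shuffle can route them.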
11777 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11778 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11779 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11780 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11781 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11783 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11784 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11785 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11787 // Attempt to directly match PSHUFLW or PSHUFHW.
11788 if (isUndefOrInRange(LoMask, 0, 4) &&
11789 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11790 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11791 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11793 if (isUndefOrInRange(HiMask, 4, 8) &&
11794 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11795 for (int i = 0; i != 4; ++i)
11796 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11797 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11798 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11801 SmallVector<int, 4> LoInputs;
11802 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11803 array_pod_sort(LoInputs.begin(), LoInputs.end());
11804 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11805 SmallVector<int, 4> HiInputs;
11806 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11807 array_pod_sort(HiInputs.begin(), HiInputs.end());
11808 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11809 int NumLToL =
11810 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11811 int NumHToL = LoInputs.size() - NumLToL;
11812 int NumLToH =
11813 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11814 int NumHToH = HiInputs.size() - NumLToH;
11815 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11816 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11817 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11818 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11820 // If we are shuffling values from one half - check how many different DWORD
11821 // pairs we need to create. If only 1 or 2 then we can perform this as a
11822 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
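// Illustrative example (editor addition): with all inputs from the low half,
// the mask <0, 1, 0, 1, 2, 3, 2, 3> only uses the word pairs (0,1) and (2,3),
// so an identity PSHUFLW followed by PSHUFD <0, 0, 1, 1> reproduces it without
// the longer three-shuffle chain.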
11823 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11824 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11825 V = DAG.getNode(ShufWOp, DL, VT, V,
11826 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11827 V = DAG.getBitcast(PSHUFDVT, V);
11828 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11829 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11830 return DAG.getBitcast(VT, V);
11833 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11834 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11835 SmallVector<std::pair<int, int>, 4> DWordPairs;
11836 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11838 // Collect the different DWORD pairs.
11839 for (int DWord = 0; DWord != 4; ++DWord) {
11840 int M0 = Mask[2 * DWord + 0];
11841 int M1 = Mask[2 * DWord + 1];
11842 M0 = (M0 >= 0 ? M0 % 4 : M0);
11843 M1 = (M1 >= 0 ? M1 % 4 : M1);
11844 if (M0 < 0 && M1 < 0)
11847 bool Match = false;
11848 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11849 auto &DWordPair = DWordPairs[j];
11850 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11851 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11852 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11853 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11854 PSHUFDMask[DWord] = DOffset + j;
11855 Match = true;
11856 break;
11857 }
11858 }
11859 if (!Match) {
11860 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11861 DWordPairs.push_back(std::make_pair(M0, M1));
11862 }
11863 }
11865 if (DWordPairs.size() <= 2) {
11866 DWordPairs.resize(2, std::make_pair(-1, -1));
11867 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11868 DWordPairs[1].first, DWordPairs[1].second};
11869 if ((NumHToL + NumHToH) == 0)
11870 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11871 if ((NumLToL + NumLToH) == 0)
11872 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11876 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11877 // such inputs we can swap two of the dwords across the half mark and end up
11878 // with <=2 inputs to each half in each half. Once there, we can fall through
11879 // to the generic code below. For example:
11881 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11882 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11884 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11885 // and an existing 2-into-2 on the other half. In this case we may have to
11886 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11887 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11888 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11889 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11890 // half than the one we target for fixing) will be fixed when we re-enter this
11891 // path. We will also combine away any sequence of PSHUFD instructions that
11892 // result into a single instruction. Here is an example of the tricky case:
11894 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11895 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11897 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11899 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11900 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11902 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11903 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11905 // The result is fine to be handled by the generic logic.
11906 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11907 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11908 int AOffset, int BOffset) {
11909 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11910 "Must call this with A having 3 or 1 inputs from the A half.");
11911 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11912 "Must call this with B having 1 or 3 inputs from the B half.");
11913 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11914 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11916 bool ThreeAInputs = AToAInputs.size() == 3;
11918 // Compute the index of dword with only one word among the three inputs in
11919 // a half by taking the sum of the half with three inputs and subtracting
11920 // the sum of the actual three inputs. The difference is the remaining
11921 // word.
11922 int ADWord, BDWord;
11923 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11924 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11925 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11926 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11927 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11928 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11929 int TripleNonInputIdx =
11930 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11931 TripleDWord = TripleNonInputIdx / 2;
11933 // We use xor with one to compute the adjacent DWord to whichever one the
11934 // single input is in.
11935 OneInputDWord = (OneInput / 2) ^ 1;
11937 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11938 // and BToA inputs. If there is also such a problem with the BToB and AToB
11939 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11940 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11941 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11942 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11943 // Compute how many inputs will be flipped by swapping these DWords. We need
11945 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
11947 int NumFlippedAToBInputs =
11948 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11949 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11950 int NumFlippedBToBInputs =
11951 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11952 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11953 if ((NumFlippedAToBInputs == 1 &&
11954 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11955 (NumFlippedBToBInputs == 1 &&
11956 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11957 // We choose whether to fix the A half or B half based on whether that
11958 // half has zero flipped inputs. At zero, we may not be able to fix it
11959 // with that half. We also bias towards fixing the B half because that
11960 // will more commonly be the high half, and we have to bias one way.
11961 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11962 ArrayRef<int> Inputs) {
11963 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11964 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11965 // Determine whether the free index is in the flipped dword or the
11966 // unflipped dword based on where the pinned index is. We use this bit
11967 // in an xor to conditionally select the adjacent dword.
11968 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11969 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11970 if (IsFixIdxInput == IsFixFreeIdxInput)
11971 FixFreeIdx += 1;
11972 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11973 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11974 "We need to be changing the number of flipped inputs!");
11975 int PSHUFHalfMask[] = {0, 1, 2, 3};
11976 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11977 V = DAG.getNode(
11978 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11979 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11980 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11982 for (int &M : Mask)
11983 if (M >= 0 && M == FixIdx)
11984 M = FixFreeIdx;
11985 else if (M >= 0 && M == FixFreeIdx)
11986 M = FixIdx;
11987 };
11988 if (NumFlippedBToBInputs != 0) {
11989 int BPinnedIdx =
11990 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11991 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11993 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11994 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11995 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
12000 int PSHUFDMask[] = {0, 1, 2, 3};
12001 PSHUFDMask[ADWord] = BDWord;
12002 PSHUFDMask[BDWord] = ADWord;
12003 V = DAG.getBitcast(
12005 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12006 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12008 // Adjust the mask to match the new locations of A and B.
12009 for (int &M : Mask)
12010 if (M >= 0 && M/2 == ADWord)
12011 M = 2 * BDWord + M % 2;
12012 else if (M >= 0 && M/2 == BDWord)
12013 M = 2 * ADWord + M % 2;
12015 // Recurse back into this routine to re-compute state now that this isn't
12016 // a 3 and 1 problem.
12017 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
12020 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
12021 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
12022 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
12023 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
12025 // At this point there are at most two inputs to the low and high halves from
12026 // each half. That means the inputs can always be grouped into dwords and
12027 // those dwords can then be moved to the correct half with a dword shuffle.
12028 // We use at most one low and one high word shuffle to collect these paired
12029 // inputs into dwords, and finally a dword shuffle to place them.
12030 int PSHUFLMask[4] = {-1, -1, -1, -1};
12031 int PSHUFHMask[4] = {-1, -1, -1, -1};
12032 int PSHUFDMask[4] = {-1, -1, -1, -1};
12034 // First fix the masks for all the inputs that are staying in their
12035 // original halves. This will then dictate the targets of the cross-half
12036 // shuffles.
12037 auto fixInPlaceInputs =
12038 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
12039 MutableArrayRef<int> SourceHalfMask,
12040 MutableArrayRef<int> HalfMask, int HalfOffset) {
12041 if (InPlaceInputs.empty())
12043 if (InPlaceInputs.size() == 1) {
12044 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12045 InPlaceInputs[0] - HalfOffset;
12046 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
12049 if (IncomingInputs.empty()) {
12050 // Just fix all of the in place inputs.
12051 for (int Input : InPlaceInputs) {
12052 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
12053 PSHUFDMask[Input / 2] = Input / 2;
12058 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
12059 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12060 InPlaceInputs[0] - HalfOffset;
12061 // Put the second input next to the first so that they are packed into
12062 // a dword. We find the adjacent index by toggling the low bit.
12063 int AdjIndex = InPlaceInputs[0] ^ 1;
12064 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
12065 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
12066 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
12068 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
12069 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
12071 // Now gather the cross-half inputs and place them into a free dword of
12072 // their target half.
12073 // FIXME: This operation could almost certainly be simplified dramatically to
12074 // look more like the 3-1 fixing operation.
12075 auto moveInputsToRightHalf = [&PSHUFDMask](
12076 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
12077 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
12078 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
12080 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
12081 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
12083 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
12085 int LowWord = Word & ~1;
12086 int HighWord = Word | 1;
12087 return isWordClobbered(SourceHalfMask, LowWord) ||
12088 isWordClobbered(SourceHalfMask, HighWord);
12091 if (IncomingInputs.empty())
12094 if (ExistingInputs.empty()) {
12095 // Map any dwords with inputs from them into the right half.
12096 for (int Input : IncomingInputs) {
12097 // If the source half mask maps over the inputs, turn those into
12098 // swaps and use the swapped lane.
12099 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
12100 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
12101 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
12102 Input - SourceOffset;
12103 // We have to swap the uses in our half mask in one sweep.
12104 for (int &M : HalfMask)
12105 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
12106 M = Input;
12107 else if (M == Input)
12108 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12109 } else {
12110 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
12111 Input - SourceOffset &&
12112 "Previous placement doesn't match!");
12114 // Note that this correctly re-maps both when we do a swap and when
12115 // we observe the other side of the swap above. We rely on that to
12116 // avoid swapping the members of the input list directly.
12117 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12120 // Map the input's dword into the correct half.
12121 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
12122 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
12123 else
12124 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
12125 Input / 2 &&
12126 "Previous placement doesn't match!");
12129 // And just directly shift any other-half mask elements to be same-half
12130 // as we will have mirrored the dword containing the element into the
12131 // same position within that half.
12132 for (int &M : HalfMask)
12133 if (M >= SourceOffset && M < SourceOffset + 4) {
12134 M = M - SourceOffset + DestOffset;
12135 assert(M >= 0 && "This should never wrap below zero!");
12140 // Ensure we have the input in a viable dword of its current half. This
12141 // is particularly tricky because the original position may be clobbered
12142 // by inputs being moved and *staying* in that half.
12143 if (IncomingInputs.size() == 1) {
12144 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12145 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
12146 SourceOffset;
12147 SourceHalfMask[InputFixed - SourceOffset] =
12148 IncomingInputs[0] - SourceOffset;
12149 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12150 InputFixed);
12151 IncomingInputs[0] = InputFixed;
12153 } else if (IncomingInputs.size() == 2) {
12154 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12155 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12156 // We have two non-adjacent or clobbered inputs we need to extract from
12157 // the source half. To do this, we need to map them into some adjacent
12158 // dword slot in the source mask.
12159 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12160 IncomingInputs[1] - SourceOffset};
12162 // If there is a free slot in the source half mask adjacent to one of
12163 // the inputs, place the other input in it. We use (Index XOR 1) to
12164 // compute an adjacent index.
12165 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12166 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12167 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12168 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12169 InputsFixed[1] = InputsFixed[0] ^ 1;
12170 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12171 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12172 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12173 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12174 InputsFixed[0] = InputsFixed[1] ^ 1;
12175 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12176 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12177 // The two inputs are in the same DWord but it is clobbered and the
12178 // adjacent DWord isn't used at all. Move both inputs to the free
12179 // slot.
12180 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12181 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12182 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12183 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12185 // The only way we hit this point is if there is no clobbering
12186 // (because there are no off-half inputs to this half) and there is no
12187 // free slot adjacent to one of the inputs. In this case, we have to
12188 // swap an input with a non-input.
12189 for (int i = 0; i < 4; ++i)
12190 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
12191 "We can't handle any clobbers here!");
12192 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
12193 "Cannot have adjacent inputs here!");
12195 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12196 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
12198 // We also have to update the final source mask in this case because
12199 // it may need to undo the above swap.
12200 for (int &M : FinalSourceHalfMask)
12201 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
12202 M = InputsFixed[1] + SourceOffset;
12203 else if (M == InputsFixed[1] + SourceOffset)
12204 M = (InputsFixed[0] ^ 1) + SourceOffset;
12206 InputsFixed[1] = InputsFixed[0] ^ 1;
12209 // Point everything at the fixed inputs.
12210 for (int &M : HalfMask)
12211 if (M == IncomingInputs[0])
12212 M = InputsFixed[0] + SourceOffset;
12213 else if (M == IncomingInputs[1])
12214 M = InputsFixed[1] + SourceOffset;
12216 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
12217 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
12220 llvm_unreachable("Unhandled input size!");
12223 // Now hoist the DWord down to the right half.
12224 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
12225 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
12226 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
12227 for (int &M : HalfMask)
12228 for (int Input : IncomingInputs)
12230 M = FreeDWord * 2 + Input % 2;
12232 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
12233 /*SourceOffset*/ 4, /*DestOffset*/ 0);
12234 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
12235 /*SourceOffset*/ 0, /*DestOffset*/ 4);
12237 // Now enact all the shuffles we've computed to move the inputs into their
12238 // target halves.
12239 if (!isNoopShuffleMask(PSHUFLMask))
12240 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12241 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
12242 if (!isNoopShuffleMask(PSHUFHMask))
12243 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12244 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
12245 if (!isNoopShuffleMask(PSHUFDMask))
12246 V = DAG.getBitcast(
12247 VT,
12248 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12249 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12251 // At this point, each half should contain all its inputs, and we can then
12252 // just shuffle them into their final position.
12253 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
12254 "Failed to lift all the high half inputs to the low mask!");
12255 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
12256 "Failed to lift all the low half inputs to the high mask!");
12258 // Do a half shuffle for the low mask.
12259 if (!isNoopShuffleMask(LoMask))
12260 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12261 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12263 // Do a half shuffle with the high mask after shifting its values down.
12264 for (int &M : HiMask)
12265 if (M >= 0)
12266 M -= 4;
12267 if (!isNoopShuffleMask(HiMask))
12268 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12269 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12274 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12275 /// blend if only one input is used.
12276 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
12277 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12278 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
12279 bool &V2InUse) {
12280 SDValue V1Mask[16];
12281 SDValue V2Mask[16];
12283 V1InUse = false;
12284 V2InUse = false;
12285 int Size = Mask.size();
12286 int Scale = 16 / Size;
12287 for (int i = 0; i < 16; ++i) {
12288 if (Mask[i / Scale] < 0) {
12289 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
12291 const int ZeroMask = 0x80;
12292 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
12293 : ZeroMask;
12294 int V2Idx = Mask[i / Scale] < Size
12295 ? ZeroMask
12296 : (Mask[i / Scale] - Size) * Scale + i % Scale;
12297 if (Zeroable[i / Scale])
12298 V1Idx = V2Idx = ZeroMask;
12299 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
12300 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
12301 V1InUse |= (ZeroMask != V1Idx);
12302 V2InUse |= (ZeroMask != V2Idx);
12303 }
12304 }
12306 if (V1InUse)
12307 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12308 DAG.getBitcast(MVT::v16i8, V1),
12309 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
12310 if (V2InUse)
12311 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12312 DAG.getBitcast(MVT::v16i8, V2),
12313 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
12315 // If we need shuffled inputs from both, blend the two.
12316 SDValue V;
12317 if (V1InUse && V2InUse)
12318 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
12319 else
12320 V = V1InUse ? V1 : V2;
12322 // Cast the result back to the correct type.
12323 return DAG.getBitcast(VT, V);
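// Illustrative example (editor addition): for a v8i16 shuffle Scale is 2, so
// mask element 3 expands to the byte selectors 6 and 7 in V1's control vector
// while V2's control vector gets 0x80 in those lanes, which PSHUFB turns into
// zero bytes that the final OR leaves to V1.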
12326 /// Generic lowering of 8-lane i16 shuffles.
12328 /// This handles both single-input shuffles and combined shuffle/blends with
12329 /// two inputs. The single input shuffles are immediately delegated to
12330 /// a dedicated lowering routine.
12332 /// The blends are lowered in one of three fundamental ways. If there are few
12333 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
12334 /// of the input is significantly cheaper when lowered as an interleaving of
12335 /// the two inputs, try to interleave them. Otherwise, blend the low and high
12336 /// halves of the inputs separately (making them have relatively few inputs)
12337 /// and then concatenate them.
12338 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12339 const APInt &Zeroable,
12340 SDValue V1, SDValue V2,
12341 const X86Subtarget &Subtarget,
12342 SelectionDAG &DAG) {
12343 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12344 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12345 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12347 // Whenever we can lower this as a zext, that instruction is strictly faster
12348 // than any alternative.
12349 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12350 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12353 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
12355 if (NumV2Inputs == 0) {
12356 // Check for being able to broadcast a single element.
12357 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12358 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12361 // Try to use shift instructions.
12362 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
12363 Zeroable, Subtarget, DAG))
12366 // Use dedicated unpack instructions for masks that match their pattern.
12368 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12371 // Use dedicated pack instructions for masks that match their pattern.
12372 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
12376 // Try to use byte rotation instructions.
12377 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
12378 Mask, Subtarget, DAG))
12381 // Make a copy of the mask so it can be modified.
12382 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
12383 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
12384 MutableMask, Subtarget,
12388 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12389 "All single-input shuffles should be canonicalized to be V1-input "
12392 // Try to use shift instructions.
12393 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12394 Zeroable, Subtarget, DAG))
12397 // See if we can use SSE4A Extraction / Insertion.
12398 if (Subtarget.hasSSE4A())
12399 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12403 // There are special ways we can lower some single-element blends.
12404 if (NumV2Inputs == 1)
12405 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12406 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12409 // We have different paths for blend lowering, but they all must use the
12410 // *exact* same predicate.
12411 bool IsBlendSupported = Subtarget.hasSSE41();
12412 if (IsBlendSupported)
12413 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12414 Zeroable, Subtarget, DAG))
12417 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12421 // Use dedicated unpack instructions for masks that match their pattern.
12423 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12426 // Use dedicated pack instructions for masks that match their pattern.
12427 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12431 // Try to use byte rotation instructions.
12432 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12433 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12436 if (SDValue BitBlend =
12437 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12440 // Try to lower by permuting the inputs into an unpack instruction.
12441 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12445 // If we can't directly blend but can use PSHUFB, that will be better as it
12446 // can both shuffle and set up the inefficient blend.
12447 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12448 bool V1InUse, V2InUse;
12449 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12450 Zeroable, DAG, V1InUse, V2InUse);
12453 // We can always bit-blend if we have to so the fallback strategy is to
12454 // decompose into single-input permutes and blends.
12455 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12459 /// Check whether a compaction lowering can be done by dropping even
12460 /// elements and compute how many times even elements must be dropped.
12462 /// This handles shuffles which take every Nth element where N is a power of
12463 /// two. Example shuffle masks:
12465 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12466 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12467 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12468 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12469 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12470 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12472 /// Any of these lanes can of course be undef.
12474 /// This routine only supports N <= 3.
12475 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12478 /// \returns N above, or the number of times even elements must be dropped if
12479 /// there is such a number. Otherwise returns zero.
12480 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12481 bool IsSingleInput) {
12482 // The modulus for the shuffle vector entries is based on whether this is
12483 // a single input or not.
12484 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12485 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12486 "We should only be called with masks with a power-of-2 size!");
12488 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12490 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12491 // and 2^3 simultaneously. This is because we may have ambiguity with
12492 // partially undef inputs.
12493 bool ViableForN[3] = {true, true, true};
12495 for (int i = 0, e = Mask.size(); i < e; ++i) {
12496 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12501 bool IsAnyViable = false;
12502 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12503 if (ViableForN[j]) {
12504 uint64_t N = j + 1;
12506 // The shuffle mask must be equal to (i * 2^N) % M.
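          // For example (illustrative): with a single-input v16i8 mask the
          // modulus is 16, so for N = 1 element 5 must select (5 * 2) % 16 = 10
          // for this stride to stay viable.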
12507 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12508 IsAnyViable = true;
12510 ViableForN[j] = false;
12512 // Early exit if we exhaust the possible powers of two.
12517 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12521 // Return 0 as there is no viable power of two.
12525 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12526 ArrayRef<int> Mask, SDValue V1,
12527 SDValue V2, SelectionDAG &DAG) {
12528 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12529 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12531 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12533 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12535 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12538 /// Generic lowering of v16i8 shuffles.
12540 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12541 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12542 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
12545 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12546 const APInt &Zeroable,
12547 SDValue V1, SDValue V2,
12548 const X86Subtarget &Subtarget,
12549 SelectionDAG &DAG) {
12550 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12551 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12552 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12554 // Try to use shift instructions.
12555 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12556 Zeroable, Subtarget, DAG))
12559 // Try to use byte rotation instructions.
12560 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12561 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12564 // Use dedicated pack instructions for masks that match their pattern.
12565 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12569 // Try to use a zext lowering.
12570 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12571 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12574 // See if we can use SSE4A Extraction / Insertion.
12575 if (Subtarget.hasSSE4A())
12576 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12580 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12582 // For single-input shuffles, there are some nicer lowering tricks we can use.
12583 if (NumV2Elements == 0) {
12584 // Check for being able to broadcast a single element.
12585 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12586 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12589 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12590 // Notably, this handles splat and partial-splat shuffles more efficiently.
12591 // However, it only makes sense if the pre-duplication shuffle simplifies
12592 // things significantly. Currently, this means we need to be able to
12593 // express the pre-duplication shuffle as an i16 shuffle.
12595 // FIXME: We should check for other patterns which can be widened into an
12596 // i16 shuffle as well.
12597 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12598 for (int i = 0; i < 16; i += 2)
12599 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12604 auto tryToWidenViaDuplication = [&]() -> SDValue {
12605 if (!canWidenViaDuplication(Mask))
12607 SmallVector<int, 4> LoInputs;
12608 copy_if(Mask, std::back_inserter(LoInputs),
12609 [](int M) { return M >= 0 && M < 8; });
12610 array_pod_sort(LoInputs.begin(), LoInputs.end());
12611 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12613 SmallVector<int, 4> HiInputs;
12614 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12615 array_pod_sort(HiInputs.begin(), HiInputs.end());
12616 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12619 bool TargetLo = LoInputs.size() >= HiInputs.size();
12620 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12621 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12623 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12624 SmallDenseMap<int, int, 8> LaneMap;
12625 for (int I : InPlaceInputs) {
12626 PreDupI16Shuffle[I/2] = I/2;
12629 int j = TargetLo ? 0 : 4, je = j + 4;
12630 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12631 // Check if j is already a shuffle of this input. This happens when
12632 // there are two adjacent bytes after we move the low one.
12633 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12634 // If we haven't yet mapped the input, search for a slot into which
12636 while (j < je && PreDupI16Shuffle[j] >= 0)
              // We can't place the inputs into a single half with a simple
              // i16 shuffle, so bail.
12643 // Map this input with the i16 shuffle.
12644 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12647 // Update the lane map based on the mapping we ended up with.
12648 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12650 V1 = DAG.getBitcast(
12652 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12653 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12655 // Unpack the bytes to form the i16s that will be shuffled into place.
12656 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12657 MVT::v16i8, V1, V1);
12659 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12660 for (int i = 0; i < 16; ++i)
12661 if (Mask[i] >= 0) {
12662 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12663 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12664 if (PostDupI16Shuffle[i / 2] < 0)
12665 PostDupI16Shuffle[i / 2] = MappedMask;
12667 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12668 "Conflicting entries in the original shuffle!");
12670 return DAG.getBitcast(
12672 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12673 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12675 if (SDValue V = tryToWidenViaDuplication())
12679 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12683 // Use dedicated unpack instructions for masks that match their pattern.
12685 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12688 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12689 // with PSHUFB. It is important to do this before we attempt to generate any
12690 // blends but after all of the single-input lowerings. If the single input
12691 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12692 // want to preserve that and we can DAG combine any longer sequences into
12693 // a PSHUFB in the end. But once we start blending from multiple inputs,
12694 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12695 // and there are *very* few patterns that would actually be faster than the
12696 // PSHUFB approach because of its ability to zero lanes.
12698 // FIXME: The only exceptions to the above are blends which are exact
12699 // interleavings with direct instructions supporting them. We currently don't
12700 // handle those well here.
12701 if (Subtarget.hasSSSE3()) {
12702 bool V1InUse = false;
12703 bool V2InUse = false;
12705 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12706 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12708 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12709 // do so. This avoids using them to handle blends-with-zero which is
12710 // important as a single pshufb is significantly faster for that.
12711 if (V1InUse && V2InUse) {
12712 if (Subtarget.hasSSE41())
12713 if (SDValue Blend = lowerVectorShuffleAsBlend(
12714 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very slightly) more efficient, we
      // prefer this lowering because there are common cases where part of
12720 // the complexity of the shuffles goes away when we do the final blend as
12722 // FIXME: It might be worth trying to detect if the unpack-feeding
12723 // shuffles will both be pshufb, in which case we shouldn't bother with
12725 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12726 DL, MVT::v16i8, V1, V2, Mask, DAG))
12729 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12730 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12731 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12737 // There are special ways we can lower some single-element blends.
12738 if (NumV2Elements == 1)
12739 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12740 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12743 if (SDValue BitBlend =
12744 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12747 // Check whether a compaction lowering can be done. This handles shuffles
12748 // which take every Nth element for some even N. See the helper function for
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
12754 bool IsSingleInput = V2.isUndef();
12755 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12756 // NumEvenDrops is the power of two stride of the elements. Another way of
12757 // thinking about it is that we need to drop the even elements this many
12758 // times to get the original input.
12760 // First we need to zero all the dropped bytes.
12761 assert(NumEvenDrops <= 3 &&
12762 "No support for dropping even elements more than 3 times.");
12763 // We use the mask type to pick which bytes are preserved based on how many
12764 // elements are dropped.
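  // For example (illustrative): with NumEvenDrops == 1 the mask type is v8i16,
  // so the 0xFF constant clears every odd byte, and a single PACKUS then packs
  // the surviving even bytes of V1 and V2 back into one v16i8 result.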
12765 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12766 SDValue ByteClearMask = DAG.getBitcast(
12767 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12768 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12769 if (!IsSingleInput)
12770 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12772 // Now pack things back together.
12773 V1 = DAG.getBitcast(MVT::v8i16, V1);
12774 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12775 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12776 for (int i = 1; i < NumEvenDrops; ++i) {
12777 Result = DAG.getBitcast(MVT::v8i16, Result);
12778 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12784 // Handle multi-input cases by blending single-input shuffles.
12785 if (NumV2Elements > 0)
12786 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12789 // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
12794 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12795 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12796 for (int i = 0; i < 16; ++i)
12798 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12800 SDValue VLoHalf, VHiHalf;
12801 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12802 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12804 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12805 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12806 // Use a mask to drop the high bytes.
12807 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12808 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12809 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12811 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12812 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12814 // Squash the masks to point directly into VLoHalf.
12815 for (int &M : LoBlendMask)
12818 for (int &M : HiBlendMask)
12822 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12823 // VHiHalf so that we can blend them as i16s.
12824 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12826 VLoHalf = DAG.getBitcast(
12827 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12828 VHiHalf = DAG.getBitcast(
12829 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12832 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12833 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12835 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12838 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
12840 /// This routine breaks down the specific type of 128-bit shuffle and
12841 /// dispatches to the lowering routines accordingly.
12842 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12843 MVT VT, SDValue V1, SDValue V2,
12844 const APInt &Zeroable,
12845 const X86Subtarget &Subtarget,
12846 SelectionDAG &DAG) {
12847 switch (VT.SimpleTy) {
12849 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12851 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12853 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12855 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12857 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12859 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12862 llvm_unreachable("Unimplemented!");
12866 /// Generic routine to split vector shuffle into half-sized shuffles.
12868 /// This routine just extracts two subvectors, shuffles them independently, and
12869 /// then concatenates them back together. This should work effectively with all
12870 /// AVX vector shuffle types.
12871 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12872 SDValue V2, ArrayRef<int> Mask,
12873 SelectionDAG &DAG) {
12874 assert(VT.getSizeInBits() >= 256 &&
12875 "Only for 256-bit or wider vector shuffles!");
12876 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12877 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12879 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12880 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12882 int NumElements = VT.getVectorNumElements();
12883 int SplitNumElements = NumElements / 2;
12884 MVT ScalarVT = VT.getVectorElementType();
12885 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12887 // Rather than splitting build-vectors, just build two narrower build
12888 // vectors. This helps shuffling with splats and zeros.
12889 auto SplitVector = [&](SDValue V) {
12890 V = peekThroughBitcasts(V);
12892 MVT OrigVT = V.getSimpleValueType();
12893 int OrigNumElements = OrigVT.getVectorNumElements();
12894 int OrigSplitNumElements = OrigNumElements / 2;
12895 MVT OrigScalarVT = OrigVT.getVectorElementType();
12896 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12900 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12902 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12903 DAG.getIntPtrConstant(0, DL));
12904 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12905 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12908 SmallVector<SDValue, 16> LoOps, HiOps;
12909 for (int i = 0; i < OrigSplitNumElements; ++i) {
12910 LoOps.push_back(BV->getOperand(i));
12911 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12913 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12914 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12916 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12917 DAG.getBitcast(SplitVT, HiV));
12920 SDValue LoV1, HiV1, LoV2, HiV2;
12921 std::tie(LoV1, HiV1) = SplitVector(V1);
12922 std::tie(LoV2, HiV2) = SplitVector(V2);
12924 // Now create two 4-way blends of these half-width vectors.
12925 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12926 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12927 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12928 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12929 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12930 for (int i = 0; i < SplitNumElements; ++i) {
12931 int M = HalfMask[i];
12932 if (M >= NumElements) {
12933 if (M >= NumElements + SplitNumElements)
12937 V2BlendMask[i] = M - NumElements;
12938 BlendMask[i] = SplitNumElements + i;
12939 } else if (M >= 0) {
12940 if (M >= SplitNumElements)
12944 V1BlendMask[i] = M;
12949 // Because the lowering happens after all combining takes place, we need to
12950 // manually combine these blend masks as much as possible so that we create
12951 // a minimal number of high-level vector shuffle nodes.
12953 // First try just blending the halves of V1 or V2.
12954 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12955 return DAG.getUNDEF(SplitVT);
12956 if (!UseLoV2 && !UseHiV2)
12957 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12958 if (!UseLoV1 && !UseHiV1)
12959 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12961 SDValue V1Blend, V2Blend;
12962 if (UseLoV1 && UseHiV1) {
12964 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12966 // We only use half of V1 so map the usage down into the final blend mask.
12967 V1Blend = UseLoV1 ? LoV1 : HiV1;
12968 for (int i = 0; i < SplitNumElements; ++i)
12969 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12970 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12972 if (UseLoV2 && UseHiV2) {
12974 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12976 // We only use half of V2 so map the usage down into the final blend mask.
12977 V2Blend = UseLoV2 ? LoV2 : HiV2;
12978 for (int i = 0; i < SplitNumElements; ++i)
12979 if (BlendMask[i] >= SplitNumElements)
12980 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12982 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12984 SDValue Lo = HalfBlend(LoMask);
12985 SDValue Hi = HalfBlend(HiMask);
12986 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12989 /// Either split a vector in halves or decompose the shuffles and the
12992 /// This is provided as a good fallback for many lowerings of non-single-input
12993 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12994 /// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
12997 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12998 SDValue V1, SDValue V2,
12999 ArrayRef<int> Mask,
13000 SelectionDAG &DAG) {
13001 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13002 "shuffles as it could then recurse on itself.");
13003 int Size = Mask.size();
13005 // If this can be modeled as a broadcast of two elements followed by a blend,
13006 // prefer that lowering. This is especially important because broadcasts can
13007 // often fold with memory operands.
13008 auto DoBothBroadcast = [&] {
13009 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
13012 if (V2BroadcastIdx < 0)
13013 V2BroadcastIdx = M - Size;
13014 else if (M - Size != V2BroadcastIdx)
13016 } else if (M >= 0) {
13017 if (V1BroadcastIdx < 0)
13018 V1BroadcastIdx = M;
13019 else if (M != V1BroadcastIdx)
13024 if (DoBothBroadcast())
13025 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13028 // If the inputs all stem from a single 128-bit lane of each input, then we
13029 // split them rather than blending because the split will decompose to
13030 // unusually few instructions.
13031 int LaneCount = VT.getSizeInBits() / 128;
13032 int LaneSize = Size / LaneCount;
13033 SmallBitVector LaneInputs[2];
13034 LaneInputs[0].resize(LaneCount, false);
13035 LaneInputs[1].resize(LaneCount, false);
13036 for (int i = 0; i < Size; ++i)
13038 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13039 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13040 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13042 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13043 // that the decomposed single-input shuffles don't end up here.
13044 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
13047 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13048 /// a permutation and blend of those lanes.
13050 /// This essentially blends the out-of-lane inputs to each lane into the lane
13051 /// from a permuted copy of the vector. This lowering strategy results in four
13052 /// instructions in the worst case for a single-input cross lane shuffle which
13053 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13054 /// of. Special cases for each particular shuffle pattern should be handled
13055 /// prior to trying this lowering.
13056 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13057 SDValue V1, SDValue V2,
13058 ArrayRef<int> Mask,
13060 const X86Subtarget &Subtarget) {
13061 // FIXME: This should probably be generalized for 512-bit vectors as well.
13062 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13063 int Size = Mask.size();
13064 int LaneSize = Size / 2;
13066 // If there are only inputs from one 128-bit lane, splitting will in fact be
13067 // less expensive. The flags track whether the given lane contains an element
13068 // that crosses to another lane.
13069 if (!Subtarget.hasAVX2()) {
13070 bool LaneCrossing[2] = {false, false};
13071 for (int i = 0; i < Size; ++i)
13072 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13073 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13074 if (!LaneCrossing[0] || !LaneCrossing[1])
13075 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13077 bool LaneUsed[2] = {false, false};
13078 for (int i = 0; i < Size; ++i)
13080 LaneUsed[(Mask[i] / LaneSize)] = true;
13081 if (!LaneUsed[0] || !LaneUsed[1])
13082 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13085 assert(V2.isUndef() &&
13086 "This last part of this routine only works on single input shuffles");
13088 SmallVector<int, 32> FlippedBlendMask(Size);
13089 for (int i = 0; i < Size; ++i)
13090 FlippedBlendMask[i] =
13091 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
13093 : Mask[i] % LaneSize +
13094 (i / LaneSize) * LaneSize + Size);
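  // For example (illustrative): for a v4f64 single-input mask <2, 1, 0, 3> the
  // flipped blend mask becomes <4, 1, 6, 3>, i.e. the two lane-crossing
  // elements are taken from the lane-swapped copy (operand indices >= Size).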
13096 // Flip the vector, and blend the results which should now be in-lane.
13097 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13098 SDValue Flipped = DAG.getBitcast(PVT, V1);
13099 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
13101 Flipped = DAG.getBitcast(VT, Flipped);
13102 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
13105 /// Handle lowering 2-lane 128-bit shuffles.
13106 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13107 SDValue V2, ArrayRef<int> Mask,
13108 const APInt &Zeroable,
13109 const X86Subtarget &Subtarget,
13110 SelectionDAG &DAG) {
13111 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
13112 if (Subtarget.hasAVX2() && V2.isUndef())
13115 SmallVector<int, 4> WidenedMask;
13116 if (!canWidenShuffleElements(Mask, WidenedMask))
13119 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13120 bool IsHighZero = (Zeroable & 0xc) == 0xc;
13122 // Try to use an insert into a zero vector.
13123 if (WidenedMask[0] == 0 && IsHighZero) {
13124 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13125 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13126 DAG.getIntPtrConstant(0, DL));
13127 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13128 getZeroVector(VT, Subtarget, DAG, DL), LoV,
13129 DAG.getIntPtrConstant(0, DL));
  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
13134 // instruction bytes needed to explicitly generate the zero vector.
13136 // Blends are faster and handle all the non-lane-crossing cases.
13137 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
13138 Zeroable, Subtarget, DAG))
13141 // If either input operand is a zero vector, use VPERM2X128 because its mask
13142 // allows us to replace the zero input with an implicit zero.
13143 if (!IsLowZero && !IsHighZero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
13146 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13147 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13149 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13150 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13151 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13152 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13153 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13154 OnlyUsesV1 ? V1 : V2,
13155 DAG.getIntPtrConstant(0, DL));
13156 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13157 DAG.getIntPtrConstant(2, DL));
13161 // Try to use SHUF128 if possible.
13162 if (Subtarget.hasVLX()) {
13163 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13164 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13165 ((WidenedMask[1] % 2) << 1);
13166 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
13167 DAG.getConstant(PermMask, DL, MVT::i8));
13172 // Otherwise form a 128-bit permutation. After accounting for undefs,
13173 // convert the 64-bit shuffle mask selection values into 128-bit
13174 // selection bits by dividing the indexes by 2 and shifting into positions
13175 // defined by a vperm2*128 instruction's immediate control byte.
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination
13185 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
13187 unsigned PermMask = 0;
13188 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13189 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
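  // For example (illustrative): WidenedMask = {1, 2} selects the high 128 bits
  // of V1 for the low half and the low 128 bits of V2 for the high half,
  // giving PermMask = 0x21.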
13191 // Check the immediate mask and replace unused sources with undef.
13192 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13193 V1 = DAG.getUNDEF(VT);
13194 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13195 V2 = DAG.getUNDEF(VT);
13197 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
13198 DAG.getConstant(PermMask, DL, MVT::i8));
13201 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
13202 /// shuffling each lane.
13204 /// This will only succeed when the result of fixing the 128-bit lanes results
13205 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
13207 /// the lane crosses early and then use simpler shuffles within each lane.
13209 /// FIXME: It might be worthwhile at some point to support this without
13210 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13211 /// in x86 only floating point has interesting non-repeating shuffles, and even
13212 /// those are still *marginally* more expensive.
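///
/// For example (illustrative): the two-input v8f32 mask <6, 4, 7, 5, 10, 8, 11, 9>
/// is first resolved with the lane-fixing v4f64-style mask <2, 3, 4, 5> and then
/// finished with the repeating in-lane mask <2, 0, 3, 1> applied to each lane.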
13213 static SDValue lowerVectorShuffleByMerging128BitLanes(
13214 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13215 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13216 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13218 int Size = Mask.size();
13219 int LaneSize = 128 / VT.getScalarSizeInBits();
13220 int NumLanes = Size / LaneSize;
13221 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13223 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13224 // check whether the in-128-bit lane shuffles share a repeating pattern.
13225 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13226 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
13227 for (int i = 0; i < Size; ++i) {
13231 int j = i / LaneSize;
13233 if (Lanes[j] < 0) {
13234 // First entry we've seen for this lane.
13235 Lanes[j] = Mask[i] / LaneSize;
13236 } else if (Lanes[j] != Mask[i] / LaneSize) {
13237 // This doesn't match the lane selected previously!
13241 // Check that within each lane we have a consistent shuffle mask.
13242 int k = i % LaneSize;
13243 if (InLaneMask[k] < 0) {
13244 InLaneMask[k] = Mask[i] % LaneSize;
13245 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
13246 // This doesn't fit a repeating in-lane mask.
13251 // First shuffle the lanes into place.
13252 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13253 VT.getSizeInBits() / 64);
13254 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13255 for (int i = 0; i < NumLanes; ++i)
13256 if (Lanes[i] >= 0) {
13257 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
13258 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
13261 V1 = DAG.getBitcast(LaneVT, V1);
13262 V2 = DAG.getBitcast(LaneVT, V2);
13263 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13265 // Cast it back to the type we actually want.
13266 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13268 // Now do a simple shuffle that isn't lane crossing.
13269 SmallVector<int, 8> NewMask((unsigned)Size, -1);
13270 for (int i = 0; i < Size; ++i)
13272 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13273 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13274 "Must not introduce lane crosses at this point!");
13276 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
13279 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13280 /// This allows for fast cases such as subvector extraction/insertion
13281 /// or shuffling smaller vector types which can lower more efficiently.
13282 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13283 SDValue V1, SDValue V2,
13284 ArrayRef<int> Mask,
13285 const X86Subtarget &Subtarget,
13286 SelectionDAG &DAG) {
13287 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13288 "Expected 256-bit or 512-bit vector");
13290 unsigned NumElts = VT.getVectorNumElements();
13291 unsigned HalfNumElts = NumElts / 2;
13292 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13294 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13295 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
13296 if (!UndefLower && !UndefUpper)
13299 // Upper half is undef and lower half is whole upper subvector.
13300 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13302 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13303 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13304 DAG.getIntPtrConstant(HalfNumElts, DL));
13305 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13306 DAG.getIntPtrConstant(0, DL));
13309 // Lower half is undef and upper half is whole lower subvector.
13310 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13312 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13313 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13314 DAG.getIntPtrConstant(0, DL));
13315 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13316 DAG.getIntPtrConstant(HalfNumElts, DL));
13319 // If the shuffle only uses two of the four halves of the input operands,
13320 // then extract them and perform the 'half' shuffle at half width.
13321 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13322 int HalfIdx1 = -1, HalfIdx2 = -1;
13323 SmallVector<int, 8> HalfMask(HalfNumElts);
13324 unsigned Offset = UndefLower ? HalfNumElts : 0;
13325 for (unsigned i = 0; i != HalfNumElts; ++i) {
13326 int M = Mask[i + Offset];
13332 // Determine which of the 4 half vectors this element is from.
13333 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13334 int HalfIdx = M / HalfNumElts;
13336 // Determine the element index into its half vector source.
13337 int HalfElt = M % HalfNumElts;
13339 // We can shuffle with up to 2 half vectors, set the new 'half'
13340 // shuffle mask accordingly.
13341 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13342 HalfMask[i] = HalfElt;
13343 HalfIdx1 = HalfIdx;
13346 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
13347 HalfMask[i] = HalfElt + HalfNumElts;
13348 HalfIdx2 = HalfIdx;
13352 // Too many half vectors referenced.
13355 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13357 // Only shuffle the halves of the inputs when useful.
13358 int NumLowerHalves =
13359 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13360 int NumUpperHalves =
13361 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13363 // uuuuXXXX - don't extract uppers just to insert again.
13364 if (UndefLower && NumUpperHalves != 0)
13367 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
13368 if (UndefUpper && NumUpperHalves == 2)
13371 // AVX2 - XXXXuuuu - always extract lowers.
13372 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
13373 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
13374 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13376 // AVX2 supports variable 32-bit element cross-lane shuffles.
13377 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13378 // XXXXuuuu - don't extract lowers and uppers.
13379 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
13384 // AVX512 - XXXXuuuu - always extract lowers.
13385 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
13388 auto GetHalfVector = [&](int HalfIdx) {
13390 return DAG.getUNDEF(HalfVT);
13391 SDValue V = (HalfIdx < 2 ? V1 : V2);
13392 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13393 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13394 DAG.getIntPtrConstant(HalfIdx, DL));
13397 SDValue Half1 = GetHalfVector(HalfIdx1);
13398 SDValue Half2 = GetHalfVector(HalfIdx2);
13399 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13400 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13401 DAG.getIntPtrConstant(Offset, DL));
/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
13407 /// This returns true if the elements from a particular input are already in the
13408 /// slot required by the given mask and require no permutation.
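///
/// For example (illustrative): for the 4-element mask <0, 5, 2, 7> both inputs
/// are already in place, whereas <1, 5, 2, 7> permutes input 0.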
13409 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13410 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13411 int Size = Mask.size();
13412 for (int i = 0; i < Size; ++i)
13413 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13419 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13420 /// every lane can be represented as the same repeating mask - allowing us to
13421 /// shuffle the sources with the repeating shuffle and then permute the result
13422 /// to the destination lanes.
13423 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13424 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13425 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13426 int NumElts = VT.getVectorNumElements();
13427 int NumLanes = VT.getSizeInBits() / 128;
13428 int NumLaneElts = NumElts / NumLanes;
13430 // On AVX2 we may be able to just shuffle the lowest elements and then
13431 // broadcast the result.
13432 if (Subtarget.hasAVX2()) {
13433 for (unsigned BroadcastSize : {16, 32, 64}) {
13434 if (BroadcastSize <= VT.getScalarSizeInBits())
13436 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13438 // Attempt to match a repeating pattern every NumBroadcastElts,
13439 // accounting for UNDEFs but only references the lowest 128-bit
13440 // lane of the inputs.
13441 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13442 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13443 for (int j = 0; j != NumBroadcastElts; ++j) {
13444 int M = Mask[i + j];
13447 int &R = RepeatMask[j];
13448 if (0 != ((M % NumElts) / NumLaneElts))
13450 if (0 <= R && R != M)
13457 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13458 if (!FindRepeatingBroadcastMask(RepeatMask))
13461 // Shuffle the (lowest) repeated elements in place for broadcast.
13462 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13464 // Shuffle the actual broadcast.
13465 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13466 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13467 for (int j = 0; j != NumBroadcastElts; ++j)
13468 BroadcastMask[i + j] = j;
13469 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13474 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13475 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13478 // Bail if we already have a repeated lane shuffle mask.
13479 SmallVector<int, 8> RepeatedShuffleMask;
13480 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13483 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13484 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13485 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13486 int NumSubLanes = NumLanes * SubLaneScale;
13487 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13489 // Check that all the sources are coming from the same lane and see if we can
13490 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13491 // determine the source sub-lane for each destination sub-lane.
13492 int TopSrcSubLane = -1;
13493 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13494 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13495 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13496 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13498 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13499 // Extract the sub-lane mask, check that it all comes from the same lane
13500 // and normalize the mask entries to come from the first lane.
13502 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13503 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13504 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13507 int Lane = (M % NumElts) / NumLaneElts;
13508 if ((0 <= SrcLane) && (SrcLane != Lane))
13511 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13512 SubLaneMask[Elt] = LocalM;
13515 // Whole sub-lane is UNDEF.
13519 // Attempt to match against the candidate repeated sub-lane masks.
13520 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13521 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13522 for (int i = 0; i != NumSubLaneElts; ++i) {
13523 if (M1[i] < 0 || M2[i] < 0)
13525 if (M1[i] != M2[i])
13531 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13532 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13535 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13536 for (int i = 0; i != NumSubLaneElts; ++i) {
13537 int M = SubLaneMask[i];
13540 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13541 "Unexpected mask element");
13542 RepeatedSubLaneMask[i] = M;
13545 // Track the top most source sub-lane - by setting the remaining to UNDEF
13546 // we can greatly simplify shuffle matching.
13547 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13548 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13549 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13553 // Bail if we failed to find a matching repeated sub-lane mask.
13554 if (Dst2SrcSubLanes[DstSubLane] < 0)
13557 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13558 "Unexpected source lane");
13560 // Create a repeating shuffle mask for the entire vector.
13561 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13562 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13563 int Lane = SubLane / SubLaneScale;
13564 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13565 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13566 int M = RepeatedSubLaneMask[Elt];
13569 int Idx = (SubLane * NumSubLaneElts) + Elt;
13570 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13573 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13575 // Shuffle each source sub-lane to its destination.
13576 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13577 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13578 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13579 if (SrcSubLane < 0)
13581 for (int j = 0; j != NumSubLaneElts; ++j)
13582 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13585 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13589 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13590 unsigned &ShuffleImm,
13591 ArrayRef<int> Mask) {
13592 int NumElts = VT.getVectorNumElements();
13593 assert(VT.getScalarSizeInBits() == 64 &&
13594 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13595 "Unexpected data type for VSHUFPD");
13597 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
  // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
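  // For example (illustrative): the v4f64 mask <1, 5, 2, 7> matches this
  // pattern and encodes as ShuffleImm 0b1011.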
13600 bool ShufpdMask = true;
13601 bool CommutableMask = true;
13602 for (int i = 0; i < NumElts; ++i) {
13603 if (Mask[i] == SM_SentinelUndef)
13607 int Val = (i & 6) + NumElts * (i & 1);
13608 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13609 if (Mask[i] < Val || Mask[i] > Val + 1)
13610 ShufpdMask = false;
13611 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13612 CommutableMask = false;
13613 ShuffleImm |= (Mask[i] % 2) << i;
13618 if (CommutableMask) {
13626 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13627 ArrayRef<int> Mask, SDValue V1,
13628 SDValue V2, SelectionDAG &DAG) {
  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
13630 "Unexpected data type for VSHUFPD");
13632 unsigned Immediate = 0;
13633 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13636 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13637 DAG.getConstant(Immediate, DL, MVT::i8));
13640 /// Handle lowering of 4-lane 64-bit floating point shuffles.
13642 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13643 /// isn't available.
13644 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13645 const APInt &Zeroable,
13646 SDValue V1, SDValue V2,
13647 const X86Subtarget &Subtarget,
13648 SelectionDAG &DAG) {
13649 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13650 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13651 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13653 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13654 Zeroable, Subtarget, DAG))
13657 if (V2.isUndef()) {
13658 // Check for being able to broadcast a single element.
13659 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13660 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13663 // Use low duplicate instructions for masks that match their pattern.
13664 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13665 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13667 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13668 // Non-half-crossing single input shuffles can be lowered with an
13669 // interleaved permutation.
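      // For example (illustrative): the mask <1, 0, 3, 2> swaps the elements
      // within each 128-bit lane and encodes as the immediate 0b0101.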
13670 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13671 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13672 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13673 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13676 // With AVX2 we have direct support for this permutation.
13677 if (Subtarget.hasAVX2())
13678 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13679 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
    // Try to create an in-lane repeating shuffle mask and then shuffle the
    // results into the target lanes.
13683 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13684 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13687 // Otherwise, fall back.
13688 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13692 // Use dedicated unpack instructions for masks that match their pattern.
13694 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13697 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13698 Zeroable, Subtarget, DAG))
13701 // Check if the blend happens to exactly fit that of SHUFPD.
13703 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
13708 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13709 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13712 // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
13716 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13717 isShuffleMaskInputInPlace(1, Mask))))
13718 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13719 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13721 // If we have VLX support, we can use VEXPAND.
13722 if (Subtarget.hasVLX())
13723 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13724 V1, V2, DAG, Subtarget))
  // If we have AVX2 then we always want to lower with a blend because at v4 we
  // can fully permute the elements.
13729 if (Subtarget.hasAVX2())
13730 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13733 // Otherwise fall back on generic lowering.
13734 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13737 /// Handle lowering of 4-lane 64-bit integer shuffles.
13739 /// This routine is only called when we have AVX2 and thus a reasonable
/// instruction set for v4i64 shuffling.
13741 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13742 const APInt &Zeroable,
13743 SDValue V1, SDValue V2,
13744 const X86Subtarget &Subtarget,
13745 SelectionDAG &DAG) {
13746 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13747 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13748 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13749 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13751 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13752 Zeroable, Subtarget, DAG))
13755 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13756 Zeroable, Subtarget, DAG))
13759 // Check for being able to broadcast a single element.
13760 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13761 Mask, Subtarget, DAG))
13764 if (V2.isUndef()) {
    // When the shuffle is mirrored between the 128-bit lanes of the vector, we
    // can use lower latency instructions that will operate on both lanes.
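    // For example (illustrative): the v4i64 mask <1, 0, 3, 2> repeats as <1, 0>
    // per lane, which scales to the v8i32 PSHUFD mask <2, 3, 0, 1> (imm 0x4E)
    // applied to both lanes at once.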
13767 SmallVector<int, 2> RepeatedMask;
13768 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13769 SmallVector<int, 4> PSHUFDMask;
13770 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13771 return DAG.getBitcast(
13773 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13774 DAG.getBitcast(MVT::v8i32, V1),
13775 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
    // AVX2 provides a direct instruction for permuting a single input across
    // lanes.
13780 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13781 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13784 // Try to use shift instructions.
13785 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13786 Zeroable, Subtarget, DAG))
13789 // If we have VLX support, we can use VALIGN or VEXPAND.
13790 if (Subtarget.hasVLX()) {
13791 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13792 Mask, Subtarget, DAG))
13795 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13796 V1, V2, DAG, Subtarget))
13800 // Try to use PALIGNR.
13801 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13802 Mask, Subtarget, DAG))
13805 // Use dedicated unpack instructions for masks that match their pattern.
13807 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
  // Try to create an in-lane repeating shuffle mask and then shuffle the
  // results into the target lanes.
13812 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13813 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13816 // Try to simplify this by merging 128-bit lanes to enable a lane-based
  // shuffle. However, if we have AVX2 and either input is already in place,
  // we will be able to shuffle the other input even across lanes in a single
  // instruction, so skip this pattern.
13820 if (!isShuffleMaskInputInPlace(0, Mask) &&
13821 !isShuffleMaskInputInPlace(1, Mask))
13822 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13823 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13826 // Otherwise fall back on generic blend lowering.
13827 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13831 /// Handle lowering of 8-lane 32-bit floating point shuffles.
13833 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13834 /// isn't available.
13835 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13836 const APInt &Zeroable,
13837 SDValue V1, SDValue V2,
13838 const X86Subtarget &Subtarget,
13839 SelectionDAG &DAG) {
13840 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13841 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13842 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13844 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13845 Zeroable, Subtarget, DAG))
13848 // Check for being able to broadcast a single element.
13849 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13850 Mask, Subtarget, DAG))
13853 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13854 // options to efficiently lower the shuffle.
13855 SmallVector<int, 4> RepeatedMask;
13856 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13857 assert(RepeatedMask.size() == 4 &&
13858 "Repeated masks must be half the mask width!");
13860 // Use even/odd duplicate instructions for masks that match their pattern.
13861 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13862 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13863 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13864 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13867 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13868 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13870 // Use dedicated unpack instructions for masks that match their pattern.
13872 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13875 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13876 // have already handled any direct blends.
13877 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13880 // Try to create an in-lane repeating shuffle mask and then shuffle the
13881 // results into the target lanes.
13882 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13883 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13886 // If we have a single input shuffle with different shuffle patterns in the
13887 // two 128-bit lanes, use a variable mask with VPERMILPS.
13888 if (V2.isUndef()) {
13889 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13890 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13891 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13893 if (Subtarget.hasAVX2())
13894 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13896 // Otherwise, fall back.
13897 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13901 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
13903 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13904 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13906 // If we have VLX support, we can use VEXPAND.
13907 if (Subtarget.hasVLX())
13908 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13909 V1, V2, DAG, Subtarget))
13912 // For non-AVX512 targets, if the mask consists of 16-bit elements within each
13913 // lane, try to split the shuffle, since after splitting we get more efficient
13914 // code using vpunpcklwd and vpunpckhwd than with vblend.
13915 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13916 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13920 // If we have AVX2 then we always want to lower with a blend because at v8 we
13921 // can fully permute the elements.
13922 if (Subtarget.hasAVX2())
13923 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13926 // Otherwise fall back on generic lowering.
13927 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13930 /// Handle lowering of 8-lane 32-bit integer shuffles.
13932 /// This routine is only called when we have AVX2 and thus a reasonable
13933 /// instruction set for v8i32 shuffling.
13934 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13935 const APInt &Zeroable,
13936 SDValue V1, SDValue V2,
13937 const X86Subtarget &Subtarget,
13938 SelectionDAG &DAG) {
13939 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13940 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13941 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13942 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13944 // Whenever we can lower this as a zext, that instruction is strictly faster
13945 // than any alternative. It also allows us to fold memory operands into the
13946 // shuffle in many cases.
13947 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13948 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13951 // For non-AVX512 targets, if the mask consists of 16-bit elements within each
13952 // lane, try to split the shuffle, since after splitting we get more efficient
13953 // code than vblend by using vpunpcklwd and vpunpckhwd.
13954 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13955 !Subtarget.hasAVX512())
13957 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13960 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13961 Zeroable, Subtarget, DAG))
13964 // Check for being able to broadcast a single element.
13965 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13966 Mask, Subtarget, DAG))
13969 // If the shuffle mask is repeated in each 128-bit lane we can use more
13970 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
13972 SmallVector<int, 4> RepeatedMask;
13973 bool Is128BitLaneRepeatedShuffle =
13974 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13975 if (Is128BitLaneRepeatedShuffle) {
13976 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13978 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13979 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13981 // Use dedicated unpack instructions for masks that match their pattern.
13983 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13987 // Try to use shift instructions.
13988 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13989 Zeroable, Subtarget, DAG))
13992 // If we have VLX support, we can use VALIGN or EXPAND.
13993 if (Subtarget.hasVLX()) {
13994 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13995 Mask, Subtarget, DAG))
13998 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13999 V1, V2, DAG, Subtarget))
14003 // Try to use byte rotation instructions.
14004 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14005 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14008 // Try to create an in-lane repeating shuffle mask and then shuffle the
14009 // results into the target lanes.
14010 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14011 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14014 // If the shuffle patterns aren't repeated but it is a single input, directly
14015 // generate a cross-lane VPERMD instruction.
14016 if (V2.isUndef()) {
14017 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
14018 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
14021 // Assume that a single SHUFPS is faster than an alternative sequence of
14022 // multiple instructions (even if the CPU has a domain penalty).
14023 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14024 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14025 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
14026 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
14027 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
14028 CastV1, CastV2, DAG);
14029 return DAG.getBitcast(MVT::v8i32, ShufPS);
14032 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
14034 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14035 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14038 // Otherwise fall back on generic blend lowering.
14039 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
14043 /// Handle lowering of 16-lane 16-bit integer shuffles.
14045 /// This routine is only called when we have AVX2 and thus a reasonable
14046 /// instruction set for v16i16 shuffling.
14047 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14048 const APInt &Zeroable,
14049 SDValue V1, SDValue V2,
14050 const X86Subtarget &Subtarget,
14051 SelectionDAG &DAG) {
14052 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14053 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14054 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14055 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
14057 // Whenever we can lower this as a zext, that instruction is strictly faster
14058 // than any alternative. It also allows us to fold memory operands into the
14059 // shuffle in many cases.
14060 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14061 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14064 // Check for being able to broadcast a single element.
14065 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
14066 Mask, Subtarget, DAG))
14069 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
14070 Zeroable, Subtarget, DAG))
14073 // Use dedicated unpack instructions for masks that match their pattern.
14075 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
14078 // Use dedicated pack instructions for masks that match their pattern.
14079 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
14083 // Try to use shift instructions.
14084 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
14085 Zeroable, Subtarget, DAG))
14088 // Try to use byte rotation instructions.
14089 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14090 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14093 // Try to create an in-lane repeating shuffle mask and then shuffle the
14094 // results into the target lanes.
14095 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14096 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14099 if (V2.isUndef()) {
14100 // There are no generalized cross-lane shuffle operations available on i16 element types.
14102 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
14103 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
14104 Mask, DAG, Subtarget);
14106 SmallVector<int, 8> RepeatedMask;
14107 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
14108 // As this is a single-input shuffle, the repeated mask should be
14109 // a strictly valid v8i16 mask that we can pass through to the v8i16
14110 // lowering to handle even the v16 case.
14111 return lowerV8I16GeneralSingleInputVectorShuffle(
14112 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
14116 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14117 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14120 // AVX512BWVL can lower to VPERMW.
14121 if (Subtarget.hasBWI() && Subtarget.hasVLX())
14122 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
14124 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
14126 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14127 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14130 // Otherwise fall back on generic lowering.
14131 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
14134 /// Handle lowering of 32-lane 8-bit integer shuffles.
14136 /// This routine is only called when we have AVX2 and thus a reasonable
14137 /// instruction set for v32i8 shuffling.
14138 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14139 const APInt &Zeroable,
14140 SDValue V1, SDValue V2,
14141 const X86Subtarget &Subtarget,
14142 SelectionDAG &DAG) {
14143 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14144 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14145 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14146 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
14148 // Whenever we can lower this as a zext, that instruction is strictly faster
14149 // than any alternative. It also allows us to fold memory operands into the
14150 // shuffle in many cases.
14151 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14152 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14155 // Check for being able to broadcast a single element.
14156 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
14157 Mask, Subtarget, DAG))
14160 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
14161 Zeroable, Subtarget, DAG))
14164 // Use dedicated unpack instructions for masks that match their pattern.
14166 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
14169 // Use dedicated pack instructions for masks that match their pattern.
14170 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
14174 // Try to use shift instructions.
14175 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
14176 Zeroable, Subtarget, DAG))
14179 // Try to use byte rotation instructions.
14180 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14181 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14184 // Try to create an in-lane repeating shuffle mask and then shuffle the
14185 // results into the target lanes.
14186 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14187 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14190 // There are no generalized cross-lane shuffle operations available on i8 element types.
14192 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
14193 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
14196 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14197 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14200 // AVX512VBMIVL can lower to VPERMB.
14201 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14202 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
14204 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
14206 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14207 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14210 // Otherwise fall back on generic lowering.
14211 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
14214 /// High-level routine to lower various 256-bit x86 vector shuffles.
14216 /// This routine either breaks down the specific type of a 256-bit x86 vector
14217 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
14218 /// together based on the available instructions.
14219 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14220 MVT VT, SDValue V1, SDValue V2,
14221 const APInt &Zeroable,
14222 const X86Subtarget &Subtarget,
14223 SelectionDAG &DAG) {
14224 // If we have a single input to the zero element, insert that into V1 if we
14225 // can do so cheaply.
14226 int NumElts = VT.getVectorNumElements();
14227 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14229 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14230 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14231 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14234 // Handle special cases where the lower or upper half is UNDEF.
14236 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14239 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
14240 // can check for those subtargets here and avoid much of the subtarget
14241 // querying in the per-vector-type lowering routines. With AVX1 we have
14242 // essentially *zero* ability to manipulate a 256-bit vector with integer
14243 // types. Since we'll use floating point types there eventually, just
14244 // immediately cast everything to a float and operate entirely in that domain.
14245 if (VT.isInteger() && !Subtarget.hasAVX2()) {
14246 int ElementBits = VT.getScalarSizeInBits();
14247 if (ElementBits < 32) {
14248 // No floating-point type is available; if we can't use bit operations
14249 // for masking/blending, then decompose into 128-bit vectors.
14251 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
14253 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
14255 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
14258 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
14259 VT.getVectorNumElements());
14260 V1 = DAG.getBitcast(FpVT, V1);
14261 V2 = DAG.getBitcast(FpVT, V2);
14262 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
14265 switch (VT.SimpleTy) {
14267 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14269 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14271 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14273 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14275 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14277 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14280 llvm_unreachable("Not a valid 256-bit x86 vector type!");
14284 /// Try to lower a vector shuffle as a sequence of 128-bit shuffles.
14285 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
14286 ArrayRef<int> Mask,
14287 const APInt &Zeroable,
14288 SDValue V1, SDValue V2,
14289 const X86Subtarget &Subtarget,
14290 SelectionDAG &DAG) {
14291 assert(VT.getScalarSizeInBits() == 64 &&
14292 "Unexpected element type size for 128bit shuffle.");
14294 // Handling a 256-bit vector requires VLX, and lowerV2X128VectorShuffle() is
14295 // most probably the better solution for that case.
14296 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
14298 SmallVector<int, 4> WidenedMask;
14299 if (!canWidenShuffleElements(Mask, WidenedMask))
14302 // Try to use an insert into a zero vector.
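// Note: Zeroable has one bit per 64-bit element here, so (Zeroable & 0xf0)
// covers elements 4-7 (the upper 256 bits) and (Zeroable & 0x0c) covers
// elements 2 and 3.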
14303 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
14304 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
14305 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
14306 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
14307 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14308 DAG.getIntPtrConstant(0, DL));
14309 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14310 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14311 DAG.getIntPtrConstant(0, DL));
14314 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
14316 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
14317 {0, 1, 2, 3, 0, 1, 2, 3});
14318 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
14319 {0, 1, 2, 3, 8, 9, 10, 11})) {
14320 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
14321 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14322 OnlyUsesV1 ? V1 : V2,
14323 DAG.getIntPtrConstant(0, DL));
14324 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14325 DAG.getIntPtrConstant(4, DL));
14328 assert(WidenedMask.size() == 4);
14330 // See if this is an insertion of the lower 128 bits of V2 into V1.
14331 bool IsInsert = true;
14333 for (int i = 0; i < 4; ++i) {
14334 assert(WidenedMask[i] >= -1);
14335 if (WidenedMask[i] < 0)
14338 // Make sure all V1 subvectors are in place.
14339 if (WidenedMask[i] < 4) {
14340 if (WidenedMask[i] != i) {
14345 // Make sure we only have a single V2 index and it's the lowest 128 bits.
14346 if (V2Index >= 0 || WidenedMask[i] != 4) {
14353 if (IsInsert && V2Index >= 0) {
14354 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14355 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
14356 DAG.getIntPtrConstant(0, DL));
14357 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
14360 // Try to lower to vshuf64x2/vshuf32x4.
14361 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
14362 unsigned PermMask = 0;
14363 // Ensure elements come from the same Op.
14364 for (int i = 0; i < 4; ++i) {
14365 assert(WidenedMask[i] >= -1);
14366 if (WidenedMask[i] < 0)
14369 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
14370 unsigned OpIndex = i / 2;
14371 if (Ops[OpIndex].isUndef())
14373 else if (Ops[OpIndex] != Op)
14376 // Convert the 128-bit shuffle mask selection values into 128-bit selection
14377 // bits defined by a vshuf64x2 instruction's immediate control byte.
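// For example, a widened mask of {0, 1, 4, 5} takes chunks 0 and 1 of V1 for
// the low half and chunks 0 and 1 of V2 for the high half, giving the
// immediate 0b01000100 (0x44).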
14378 PermMask |= (WidenedMask[i] % 4) << (i * 2);
14381 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
14382 DAG.getConstant(PermMask, DL, MVT::i8));
14385 /// Handle lowering of 8-lane 64-bit floating point shuffles.
14386 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14387 const APInt &Zeroable,
14388 SDValue V1, SDValue V2,
14389 const X86Subtarget &Subtarget,
14390 SelectionDAG &DAG) {
14391 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14392 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14393 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14395 if (V2.isUndef()) {
14396 // Use low duplicate instructions for masks that match their pattern.
14397 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
14398 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
14400 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
14401 // Non-half-crossing single input shuffles can be lowered with an
14402 // interleaved permutation.
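// Each immediate bit chooses the low (0) or high (1) 64-bit element of the
// 128-bit lane covering that result position. For example, the within-pair
// swap mask {1, 0, 3, 2, 5, 4, 7, 6} encodes as immediate 0x55.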
14403 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14404 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
14405 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
14406 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
14407 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
14408 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14411 SmallVector<int, 4> RepeatedMask;
14412 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
14413 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
14414 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14417 if (SDValue Shuf128 =
14418 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
14422 if (SDValue Unpck =
14423 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
14426 // Check if the blend happens to exactly match a SHUFPD pattern.
14428 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
14431 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
14432 V2, DAG, Subtarget))
14435 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
14436 Zeroable, Subtarget, DAG))
14439 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
14442 /// Handle lowering of 16-lane 32-bit floating point shuffles.
14443 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14444 const APInt &Zeroable,
14445 SDValue V1, SDValue V2,
14446 const X86Subtarget &Subtarget,
14447 SelectionDAG &DAG) {
14448 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14449 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14450 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14452 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14453 // options to efficiently lower the shuffle.
14454 SmallVector<int, 4> RepeatedMask;
14455 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
14456 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14458 // Use even/odd duplicate instructions for masks that match their pattern.
14459 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14460 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
14461 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14462 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
14465 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
14466 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14468 // Use dedicated unpack instructions for masks that match their pattern.
14469 if (SDValue Unpck =
14470 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
14473 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
14474 Zeroable, Subtarget, DAG))
14477 // Otherwise, fall back to a SHUFPS sequence.
14478 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
14481 // If we have a single input shuffle with different shuffle patterns in the
14482 // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
14483 if (V2.isUndef() &&
14484 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
14485 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
14486 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
14489 // If we have AVX512F support, we can use VEXPAND.
14490 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
14491 V1, V2, DAG, Subtarget))
14494 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
14497 /// Handle lowering of 8-lane 64-bit integer shuffles.
14498 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14499 const APInt &Zeroable,
14500 SDValue V1, SDValue V2,
14501 const X86Subtarget &Subtarget,
14502 SelectionDAG &DAG) {
14503 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14504 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14505 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14507 if (V2.isUndef()) {
14508 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14509 // can use lower latency instructions that will operate on all four 128-bit lanes.
14511 SmallVector<int, 2> Repeated128Mask;
14512 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
14513 SmallVector<int, 4> PSHUFDMask;
14514 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
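// For example, the repeated 128-bit mask {1, 0} scales to the PSHUFD dword
// mask {2, 3, 0, 1}, swapping the two 64-bit halves within every 128-bit lane.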
14515 return DAG.getBitcast(
14517 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14518 DAG.getBitcast(MVT::v16i32, V1),
14519 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14522 SmallVector<int, 4> Repeated256Mask;
14523 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14524 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14525 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14528 if (SDValue Shuf128 =
14529 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
14530 V1, V2, Subtarget, DAG))
14533 // Try to use shift instructions.
14534 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14535 Zeroable, Subtarget, DAG))
14538 // Try to use VALIGN.
14539 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14540 Mask, Subtarget, DAG))
14543 // Try to use PALIGNR.
14544 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14545 Mask, Subtarget, DAG))
14548 if (SDValue Unpck =
14549 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14551 // If we have AVX512F support, we can use VEXPAND.
14552 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14553 V2, DAG, Subtarget))
14556 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14557 Zeroable, Subtarget, DAG))
14560 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14563 /// Handle lowering of 16-lane 32-bit integer shuffles.
14564 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14565 const APInt &Zeroable,
14566 SDValue V1, SDValue V2,
14567 const X86Subtarget &Subtarget,
14568 SelectionDAG &DAG) {
14569 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14570 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14571 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14573 // Whenever we can lower this as a zext, that instruction is strictly faster
14574 // than any alternative. It also allows us to fold memory operands into the
14575 // shuffle in many cases.
14576 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14577 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14580 // If the shuffle mask is repeated in each 128-bit lane we can use more
14581 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
14583 SmallVector<int, 4> RepeatedMask;
14584 bool Is128BitLaneRepeatedShuffle =
14585 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14586 if (Is128BitLaneRepeatedShuffle) {
14587 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14589 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14590 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14592 // Use dedicated unpack instructions for masks that match their pattern.
14594 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14598 // Try to use shift instructions.
14599 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14600 Zeroable, Subtarget, DAG))
14603 // Try to use VALIGN.
14604 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14605 Mask, Subtarget, DAG))
14608 // Try to use byte rotation instructions.
14609 if (Subtarget.hasBWI())
14610 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14611 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14614 // Assume that a single SHUFPS is faster than using a permv shuffle.
14615 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14616 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14617 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14618 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14619 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14620 CastV1, CastV2, DAG);
14621 return DAG.getBitcast(MVT::v16i32, ShufPS);
14623 // If we have AVX512F support, we can use VEXPAND.
14624 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14625 V1, V2, DAG, Subtarget))
14628 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14629 Zeroable, Subtarget, DAG))
14631 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14634 /// Handle lowering of 32-lane 16-bit integer shuffles.
14635 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14636 const APInt &Zeroable,
14637 SDValue V1, SDValue V2,
14638 const X86Subtarget &Subtarget,
14639 SelectionDAG &DAG) {
14640 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14641 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14642 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14643 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14645 // Whenever we can lower this as a zext, that instruction is strictly faster
14646 // than any alternative. It also allows us to fold memory operands into the
14647 // shuffle in many cases.
14648 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14649 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14652 // Use dedicated unpack instructions for masks that match their pattern.
14654 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14657 // Try to use shift instructions.
14658 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14659 Zeroable, Subtarget, DAG))
14662 // Try to use byte rotation instructions.
14663 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14664 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14667 if (V2.isUndef()) {
14668 SmallVector<int, 8> RepeatedMask;
14669 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14670 // As this is a single-input shuffle, the repeated mask should be
14671 // a strictly valid v8i16 mask that we can pass through to the v8i16
14672 // lowering to handle even the v32 case.
14673 return lowerV8I16GeneralSingleInputVectorShuffle(
14674 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14678 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14679 Zeroable, Subtarget, DAG))
14682 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14683 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14686 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14689 /// Handle lowering of 64-lane 8-bit integer shuffles.
14690 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14691 const APInt &Zeroable,
14692 SDValue V1, SDValue V2,
14693 const X86Subtarget &Subtarget,
14694 SelectionDAG &DAG) {
14695 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14696 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14697 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14698 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14700 // Whenever we can lower this as a zext, that instruction is strictly faster
14701 // than any alternative. It also allows us to fold memory operands into the
14702 // shuffle in many cases.
14703 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14704 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14707 // Use dedicated unpack instructions for masks that match their pattern.
14709 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14712 // Try to use shift instructions.
14713 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14714 Zeroable, Subtarget, DAG))
14717 // Try to use byte rotation instructions.
14718 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14719 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14722 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14723 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14726 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14727 if (Subtarget.hasVBMI())
14728 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14730 // Try to create an in-lane repeating shuffle mask and then shuffle the
14731 // results into the target lanes.
14732 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14733 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14736 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14737 Zeroable, Subtarget, DAG))
14740 // FIXME: Implement direct support for this type!
14741 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14744 /// High-level routine to lower various 512-bit x86 vector shuffles.
14746 /// This routine either breaks down the specific type of a 512-bit x86 vector
14747 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14748 /// together based on the available instructions.
14749 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14750 MVT VT, SDValue V1, SDValue V2,
14751 const APInt &Zeroable,
14752 const X86Subtarget &Subtarget,
14753 SelectionDAG &DAG) {
14754 assert(Subtarget.hasAVX512() &&
14755 "Cannot lower 512-bit vectors w/ basic ISA!");
14757 // If we have a single input to the zero element, insert that into V1 if we
14758 // can do so cheaply.
14759 int NumElts = Mask.size();
14760 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14762 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14763 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14764 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14767 // Handle special cases where the lower or upper half is UNDEF.
14769 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14772 // Check for being able to broadcast a single element.
14773 if (SDValue Broadcast =
14774 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14777 // Dispatch to each element type for lowering. If we don't have support for
14778 // specific element type shuffles at 512 bits, immediately split them and
14779 // lower them. Each lowering routine of a given type is allowed to assume that
14780 // the requisite ISA extensions for that element type are available.
14781 switch (VT.SimpleTy) {
14783 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14785 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14787 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14789 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14791 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14793 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14796 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14800 // Lower vXi1 vector shuffles.
14801 // There is no dedicated instruction on AVX-512 that shuffles the mask registers.
14802 // The only way to shuffle the bits is to sign-extend the mask vector to a SIMD
14803 // vector, shuffle it, and then truncate it back.
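// For example, a v8i1 shuffle is widened to v8i64 (or v8i32 with VLX),
// shuffled as a normal vector, and then converted back to a mask register.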
14804 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14805 MVT VT, SDValue V1, SDValue V2,
14806 const APInt &Zeroable,
14807 const X86Subtarget &Subtarget,
14808 SelectionDAG &DAG) {
14809 unsigned NumElts = Mask.size();
14811 // Try to recognize shuffles that are just padding a subvector with zeros.
14812 unsigned SubvecElts = 0;
14813 for (int i = 0; i != (int)NumElts; ++i) {
14814 if (Mask[i] >= 0 && Mask[i] != i)
14819 assert(SubvecElts != NumElts && "Identity shuffle?");
14821 // Clip to a power of 2.
14822 SubvecElts = PowerOf2Floor(SubvecElts);
14824 // Make sure the number of zeroable bits in the top at least covers the bits
14825 // not covered by the subvector.
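// For example, with a v16i1 whose mask copies elements 0-3 from V1 and whose
// upper 12 elements are zeroable, we extract a v4i1 and insert it into a zero
// vector.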
14826 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
14827 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
14828 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
14829 V1, DAG.getIntPtrConstant(0, DL));
14830 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14831 getZeroVector(VT, Subtarget, DAG, DL),
14832 Extract, DAG.getIntPtrConstant(0, DL));
14836 assert(Subtarget.hasAVX512() &&
14837 "Cannot lower 512-bit vectors w/o basic ISA!");
14839 switch (VT.SimpleTy) {
14841 llvm_unreachable("Expected a vector of i1 elements");
14843 ExtVT = MVT::v2i64;
14846 ExtVT = MVT::v4i32;
14849 // Take 512-bit type, more shuffles on KNL. If we have VLX use a 256-bit type.
14851 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14854 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14855 // 256-bit operation available.
14856 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
14859 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14860 // 256-bit operation available.
14861 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
14862 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
14865 ExtVT = MVT::v64i8;
14869 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14870 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14872 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14873 // Since i1 was sign-extended, a signed compare against zero recovers the mask.
14874 int NumElems = VT.getVectorNumElements();
14875 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14876 (Subtarget.hasDQI() && (NumElems < 32)))
14877 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
14878 Shuffle, ISD::SETGT);
14880 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14883 /// Helper function that returns true if the shuffle mask should be
14884 /// commuted to improve canonicalization.
14885 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14886 int NumElements = Mask.size();
14888 int NumV1Elements = 0, NumV2Elements = 0;
14892 else if (M < NumElements)
14897 // Commute the shuffle as needed such that more elements come from V1 than
14898 // V2. This allows us to match the shuffle pattern strictly on how many
14899 // elements come from V1 without handling the symmetric cases.
14900 if (NumV2Elements > NumV1Elements)
14903 assert(NumV1Elements > 0 && "No V1 indices");
14905 if (NumV2Elements == 0)
14908 // When the number of V1 and V2 elements is the same, try to minimize the
14909 // number of uses of V2 in the low half of the vector. When that is tied,
14910 // ensure that the sum of indices for V1 is equal to or lower than the sum of
14911 // indices for V2. When those are equal, try to ensure that the number of odd
14912 // indices for V1 is lower than the number of odd indices for V2.
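// For example, the v4 mask <4, 5, 0, 1> has equal element counts but V2
// dominates the low half, so it is commuted to <0, 1, 4, 5>.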
14913 if (NumV1Elements == NumV2Elements) {
14914 int LowV1Elements = 0, LowV2Elements = 0;
14915 for (int M : Mask.slice(0, NumElements / 2))
14916 if (M >= NumElements)
14920 if (LowV2Elements > LowV1Elements)
14922 if (LowV2Elements == LowV1Elements) {
14923 int SumV1Indices = 0, SumV2Indices = 0;
14924 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14925 if (Mask[i] >= NumElements)
14927 else if (Mask[i] >= 0)
14929 if (SumV2Indices < SumV1Indices)
14931 if (SumV2Indices == SumV1Indices) {
14932 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14933 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14934 if (Mask[i] >= NumElements)
14935 NumV2OddIndices += i % 2;
14936 else if (Mask[i] >= 0)
14937 NumV1OddIndices += i % 2;
14938 if (NumV2OddIndices < NumV1OddIndices)
14947 /// Top-level lowering for x86 vector shuffles.
14949 /// This handles decomposition, canonicalization, and lowering of all x86
14950 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14951 /// above in helper routines. The canonicalization attempts to widen shuffles
14952 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14953 /// s.t. only one of the two inputs needs to be tested, etc.
14954 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14955 SelectionDAG &DAG) {
14956 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14957 ArrayRef<int> Mask = SVOp->getMask();
14958 SDValue V1 = Op.getOperand(0);
14959 SDValue V2 = Op.getOperand(1);
14960 MVT VT = Op.getSimpleValueType();
14961 int NumElements = VT.getVectorNumElements();
14963 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14965 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14966 "Can't lower MMX shuffles");
14968 bool V1IsUndef = V1.isUndef();
14969 bool V2IsUndef = V2.isUndef();
14970 if (V1IsUndef && V2IsUndef)
14971 return DAG.getUNDEF(VT);
14973 // When we create a shuffle node we put the UNDEF node as the second operand,
14974 // but in some cases the first operand may be transformed to UNDEF.
14975 // In that case we should just commute the node.
14977 return DAG.getCommutedVectorShuffle(*SVOp);
14979 // Check for non-undef masks pointing at an undef vector and make the masks
14980 // undef as well. This makes it easier to match the shuffle based solely on the mask.
14984 if (M >= NumElements) {
14985 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14986 for (int &M : NewMask)
14987 if (M >= NumElements)
14989 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14992 // Check for illegal shuffle mask element index values.
14993 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14994 assert(llvm::all_of(Mask,
14995 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14996 "Out of bounds shuffle index");
14998 // We actually see shuffles that are entirely re-arrangements of a set of
14999 // zero inputs. This mostly happens while decomposing complex shuffles into
15000 // simple ones. Directly lower these as a buildvector of zeros.
15001 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
15002 if (Zeroable.isAllOnesValue())
15003 return getZeroVector(VT, Subtarget, DAG, DL);
15005 // Try to collapse shuffles into using a vector type with fewer elements but
15006 // wider element types. We cap this to not form integers or floating point
15007 // elements wider than 64 bits, but it might be interesting to form i128
15008 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
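// For example, a v8i32 shuffle with mask <0, 1, 6, 7, 4, 5, 2, 3> is widened
// to a v4i64 shuffle with mask <0, 3, 2, 1>.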
15009 SmallVector<int, 16> WidenedMask;
15010 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
15011 canWidenShuffleElements(Mask, WidenedMask)) {
15012 MVT NewEltVT = VT.isFloatingPoint()
15013 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
15014 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
15015 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
15016 // Make sure that the new vector type is legal. For example, v2f64 isn't legal on SSE1.
15018 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
15019 V1 = DAG.getBitcast(NewVT, V1);
15020 V2 = DAG.getBitcast(NewVT, V2);
15021 return DAG.getBitcast(
15022 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
15026 // Commute the shuffle if it will improve canonicalization.
15027 if (canonicalizeShuffleMaskWithCommute(Mask))
15028 return DAG.getCommutedVectorShuffle(*SVOp);
15031 lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
15034 // For each vector width, delegate to a specialized lowering routine.
15035 if (VT.is128BitVector())
15036 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15039 if (VT.is256BitVector())
15040 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15043 if (VT.is512BitVector())
15044 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15048 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15051 llvm_unreachable("Unimplemented!");
15054 /// Try to lower a VSELECT instruction to a vector shuffle.
15055 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
15056 const X86Subtarget &Subtarget,
15057 SelectionDAG &DAG) {
15058 SDValue Cond = Op.getOperand(0);
15059 SDValue LHS = Op.getOperand(1);
15060 SDValue RHS = Op.getOperand(2);
15062 MVT VT = Op.getSimpleValueType();
15064 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
15066 auto *CondBV = cast<BuildVectorSDNode>(Cond);
15068 // Only non-legal VSELECTs reach this lowering; convert those into generic
15069 // shuffles and re-use the shuffle lowering path for blends.
15070 SmallVector<int, 32> Mask;
15071 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
15072 SDValue CondElt = CondBV->getOperand(i);
15074 // We can't map undef to undef here. They have different meanings. Treat
15075 // it the same as zero.
15076 if (CondElt.isUndef() || isNullConstant(CondElt))
15080 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
15083 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
15084 // A vselect where all conditions and data are constants can be optimized into
15085 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
15086 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
15087 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
15088 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
15091 // Try to lower this to a blend-style vector shuffle. This can handle all
15092 // constant condition cases.
15093 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
15096 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
15097 // with patterns on the mask registers on AVX-512.
15098 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
15101 // Variable blends are only legal from SSE4.1 onward.
15102 if (!Subtarget.hasSSE41())
15106 MVT VT = Op.getSimpleValueType();
15108 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
15109 // into an i1 condition so that we can use the mask-based 512-bit blend instructions.
15111 if (VT.getSizeInBits() == 512) {
15112 SDValue Cond = Op.getOperand(0);
15113 // The vNi1 condition case should be handled above as it can be trivially lowered.
15115 assert(Cond.getValueType().getScalarSizeInBits() ==
15116 VT.getScalarSizeInBits() &&
15117 "Should have a size-matched integer condition!");
15118 // Build a mask by testing the condition against zero.
15119 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
15120 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
15121 getZeroVector(VT, Subtarget, DAG, dl),
15123 // Now return a new VSELECT using the mask.
15124 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
15127 // Only some types will be legal on some subtargets. If we can emit a legal
15128 // VSELECT-matching blend, return Op, but if we need to expand, return a vector shuffle or splat.
15130 switch (VT.SimpleTy) {
15132 // Most of the vector types have blends past SSE4.1.
15136 // The byte blends for AVX vectors were introduced only in AVX2.
15137 if (Subtarget.hasAVX2())
15143 case MVT::v16i16: {
15144 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
15145 MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
15146 SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
15147 SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
15148 SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
15149 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
15150 return DAG.getBitcast(VT, Select);
15155 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
15156 MVT VT = Op.getSimpleValueType();
15159 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
15162 if (VT.getSizeInBits() == 8) {
15163 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
15164 Op.getOperand(0), Op.getOperand(1));
15165 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15168 if (VT == MVT::f32) {
15169 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
15170 // the result back to FR32 register. It's only worth matching if the
15171 // result has a single use which is a store or a bitcast to i32. And in
15172 // the case of a store, it's not worth it if the index is a constant 0,
15173 // because a MOVSSmr can be used instead, which is smaller and faster.
15174 if (!Op.hasOneUse())
15176 SDNode *User = *Op.getNode()->use_begin();
15177 if ((User->getOpcode() != ISD::STORE ||
15178 isNullConstant(Op.getOperand(1))) &&
15179 (User->getOpcode() != ISD::BITCAST ||
15180 User->getValueType(0) != MVT::i32))
15182 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15183 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
15185 return DAG.getBitcast(MVT::f32, Extract);
15188 if (VT == MVT::i32 || VT == MVT::i64) {
15189 // ExtractPS/pextrq work with a constant index.
15190 if (isa<ConstantSDNode>(Op.getOperand(1)))
15197 /// Extract one bit from a mask vector, like v16i1 or v8i1.
15198 /// AVX-512 feature.
15199 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
15200 const X86Subtarget &Subtarget) {
15201 SDValue Vec = Op.getOperand(0);
15203 MVT VecVT = Vec.getSimpleValueType();
15204 SDValue Idx = Op.getOperand(1);
15205 MVT EltVT = Op.getSimpleValueType();
15207 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
15208 "Unexpected vector type in ExtractBitFromMaskVector");
15210 // A variable index can't be handled in mask registers;
15211 // extend the vector to VR512/VR128.
15212 if (!isa<ConstantSDNode>(Idx)) {
15213 unsigned NumElts = VecVT.getVectorNumElements();
15214 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
15215 // than extending to 128/256 bits.
15216 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15217 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15218 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
15219 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
15220 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
15223 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15225 // If the kshift instructions of the correct width aren't natively supported
15226 // then we need to promote the vector to the native size to get the correct
15227 // zeroing behavior.
15228 if (VecVT.getVectorNumElements() < 16) {
15229 VecVT = MVT::v16i1;
15230 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
15231 DAG.getUNDEF(VecVT), Vec,
15232 DAG.getIntPtrConstant(0, dl));
15235 // Extracts from element 0 are always allowed.
15237 // Use kshiftr instruction to move to the lower element.
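// For example, extracting bit 5 shifts the mask right by 5 so the requested
// bit lands in element 0, which is then read out below.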
15238 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
15239 DAG.getConstant(IdxVal, dl, MVT::i8));
15242 // Shrink to v16i1 since that's always legal.
15243 if (VecVT.getVectorNumElements() > 16) {
15244 VecVT = MVT::v16i1;
15245 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
15246 DAG.getIntPtrConstant(0, dl));
15249 // Convert to a bitcast+aext/trunc.
15250 MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
15251 return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
15255 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15256 SelectionDAG &DAG) const {
15258 SDValue Vec = Op.getOperand(0);
15259 MVT VecVT = Vec.getSimpleValueType();
15260 SDValue Idx = Op.getOperand(1);
15262 if (VecVT.getVectorElementType() == MVT::i1)
15263 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
15265 if (!isa<ConstantSDNode>(Idx)) {
15266 // It's more profitable to go through memory (1 cycle throughput)
15267 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
15268 // The IACA tool was used to get the performance estimate
15269 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
15271 // example : extractelement <16 x i8> %a, i32 %i
15273 // Block Throughput: 3.00 Cycles
15274 // Throughput Bottleneck: Port5
15276 // | Num Of | Ports pressure in cycles | |
15277 // | Uops | 0 - DV | 5 | 6 | 7 | |
15278 // ---------------------------------------------
15279 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
15280 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
15281 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
15282 // Total Num Of Uops: 4
15285 // Block Throughput: 1.00 Cycles
15286 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
15288 // | | Ports pressure in cycles | |
15289 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
15290 // ---------------------------------------------------------
15291 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
15292 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
15293 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
15294 // Total Num Of Uops: 4
15299 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15301 // If this is a 256-bit vector result, first extract the 128-bit vector and
15302 // then extract the element from the 128-bit vector.
15303 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15304 // Get the 128-bit vector.
15305 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15306 MVT EltVT = VecVT.getVectorElementType();
15308 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15309 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15311 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15312 // this can be done with a mask.
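// For example, extracting element 6 of a v8i32: the upper 128-bit half was
// extracted above, and 6 & (4 - 1) == 2 is the element's index within it.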
15313 IdxVal &= ElemsPerChunk - 1;
15314 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15315 DAG.getConstant(IdxVal, dl, MVT::i32));
15318 assert(VecVT.is128BitVector() && "Unexpected vector length");
15320 MVT VT = Op.getSimpleValueType();
15322 if (VT.getSizeInBits() == 16) {
15323 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15324 // we're going to zero extend the register or fold the store (SSE41 only).
15325 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15326 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15327 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15328 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15329 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15331 // Transform it so it matches pextrw, which produces a 32-bit result.
15332 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15333 Op.getOperand(0), Op.getOperand(1));
15334 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15337 if (Subtarget.hasSSE41())
15338 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15341 // TODO: We only extract a single element from v16i8, we can probably afford
15342 // to be more aggressive here before using the default approach of spilling to the stack.
15344 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15345 // Extract either the lowest i32 or any i16, and extract the sub-byte.
15346 int DWordIdx = IdxVal / 4;
15347 if (DWordIdx == 0) {
15348 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15349 DAG.getBitcast(MVT::v4i32, Vec),
15350 DAG.getIntPtrConstant(DWordIdx, dl));
15351 int ShiftVal = (IdxVal % 4) * 8;
15353 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15354 DAG.getConstant(ShiftVal, dl, MVT::i32));
15355 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15358 int WordIdx = IdxVal / 2;
15359 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15360 DAG.getBitcast(MVT::v8i16, Vec),
15361 DAG.getIntPtrConstant(WordIdx, dl));
15362 int ShiftVal = (IdxVal % 2) * 8;
15364 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15365 DAG.getConstant(ShiftVal, dl, MVT::i16));
15366 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15369 if (VT.getSizeInBits() == 32) {
15373 // SHUFPS the element to the lowest double word, then movss.
15374 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15375 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15376 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15377 DAG.getIntPtrConstant(0, dl));
15380 if (VT.getSizeInBits() == 64) {
15381 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15382 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15383 // to match extract_elt for f64.
15387 // UNPCKHPD the element to the lowest double word, then movsd.
15388 // Note that if the lower 64 bits of the result of the UNPCKHPD are then stored
15389 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
15390 int Mask[2] = { 1, -1 };
15391 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15392 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15393 DAG.getIntPtrConstant(0, dl));
15399 /// Insert one bit into a mask vector, like v16i1 or v8i1.
15400 /// AVX-512 feature.
15401 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15402 const X86Subtarget &Subtarget) {
15404 SDValue Vec = Op.getOperand(0);
15405 SDValue Elt = Op.getOperand(1);
15406 SDValue Idx = Op.getOperand(2);
15407 MVT VecVT = Vec.getSimpleValueType();
15409 if (!isa<ConstantSDNode>(Idx)) {
15410 // Non-constant index. Extend the source and destination,
15411 // insert the element, and then truncate the result.
15412 unsigned NumElts = VecVT.getVectorNumElements();
15413 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15414 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15415 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15416 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15417 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15418 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15421 // Copy into a k-register, extract to v1i1 and insert_subvector.
15422 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15424 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15428 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15429 SelectionDAG &DAG) const {
15430 MVT VT = Op.getSimpleValueType();
15431 MVT EltVT = VT.getVectorElementType();
15432 unsigned NumElts = VT.getVectorNumElements();
15434 if (EltVT == MVT::i1)
15435 return InsertBitToMaskVector(Op, DAG, Subtarget);
15438 SDValue N0 = Op.getOperand(0);
15439 SDValue N1 = Op.getOperand(1);
15440 SDValue N2 = Op.getOperand(2);
15441 if (!isa<ConstantSDNode>(N2))
15443 auto *N2C = cast<ConstantSDNode>(N2);
15444 unsigned IdxVal = N2C->getZExtValue();
15446 bool IsZeroElt = X86::isZeroNode(N1);
15447 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15449 // If we are inserting an element, see if we can do this more efficiently with
15450 // a blend shuffle with a rematerializable vector than with a costly integer
15451 // insertion.
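// E.g. inserting zero into element 2 of a v4i32 builds the shuffle mask
// <0,1,6,3>, taking lane 2 from the zero vector (lanes 4-7 refer to the
// second shuffle operand) and all other lanes from the original vector.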
15452 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15453 16 <= EltVT.getSizeInBits()) {
15454 SmallVector<int, 8> BlendMask;
15455 for (unsigned i = 0; i != NumElts; ++i)
15456 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15457 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15458 : getOnesVector(VT, DAG, dl);
15459 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15462 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15463 // into that, and then insert the subvector back into the result.
15464 if (VT.is256BitVector() || VT.is512BitVector()) {
15465 // With a 256-bit vector, we can insert into the zero element efficiently
15466 // using a blend if we have AVX or AVX2 and the right data type.
15467 if (VT.is256BitVector() && IdxVal == 0) {
15468 // TODO: It is worthwhile to cast integer to floating point and back
15469 // and incur a domain crossing penalty if that's what we'll end up
15470 // doing anyway after extracting to a 128-bit vector.
15471 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15472 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15473 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15474 N2 = DAG.getIntPtrConstant(1, dl);
15475 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15479 // Get the desired 128-bit vector chunk.
15480 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15482 // Insert the element into the desired chunk.
15483 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15484 assert(isPowerOf2_32(NumEltsIn128));
15485 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
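// E.g. inserting into element 9 of a v16i16: NumEltsIn128 is 8, the upper
// 128-bit chunk was extracted above, and 9 & 7 == 1 is the position within
// that chunk.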
15486 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15488 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15489 DAG.getConstant(IdxIn128, dl, MVT::i32));
15491 // Insert the changed part back into the bigger vector
15492 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15494 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15496 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
15497 // argument. SSE41 required for pinsrb.
15498 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15500 if (VT == MVT::v8i16) {
15501 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15502 Opc = X86ISD::PINSRW;
15504 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15505 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15506 Opc = X86ISD::PINSRB;
15509 if (N1.getValueType() != MVT::i32)
15510 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15511 if (N2.getValueType() != MVT::i32)
15512 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15513 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15516 if (Subtarget.hasSSE41()) {
15517 if (EltVT == MVT::f32) {
15518 // Bits [7:6] of the constant are the source select. This will always be
15519 // zero here. The DAG Combiner may combine an extract_elt index into
15520 // these bits. For example (insert (extract, 3), 2) could be matched by
15521 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15522 // Bits [5:4] of the constant are the destination select. This is the
15523 // value of the incoming immediate.
15524 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15525 // combine either bitwise AND or insert of float 0.0 to set these bits.
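// E.g. for IdxVal == 2 the INSERTPS immediate built below is 2 << 4 == 0x20:
// take element 0 of the scalar_to_vector source, write it to element 2 of
// the destination, and zero nothing.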
15527 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15528 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15529 // If this is an insertion of 32-bits into the low 32-bits of
15530 // a vector, we prefer to generate a blend with immediate rather
15531 // than an insertps. Blends are simpler operations in hardware and so
15532 // will always have equal or better performance than insertps.
15533 // But if optimizing for size and there's a load folding opportunity,
15534 // generate insertps because blendps does not have a 32-bit memory
15535 // operand form.
15536 N2 = DAG.getIntPtrConstant(1, dl);
15537 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15538 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15540 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15541 // Create this as a scalar-to-vector operation.
15542 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15543 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15546 // PINSR* works with constant index.
15547 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15554 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15555 SelectionDAG &DAG) {
15557 MVT OpVT = Op.getSimpleValueType();
15559 // It's always cheaper to replace an xor+movd with xorps, and it simplifies
15560 // further combines.
15561 if (X86::isZeroNode(Op.getOperand(0)))
15562 return getZeroVector(OpVT, Subtarget, DAG, dl);
15564 // If this is a 256-bit vector result, first insert into a 128-bit
15565 // vector and then insert into the 256-bit vector.
15566 if (!OpVT.is128BitVector()) {
15567 // Insert into a 128-bit vector.
15568 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15569 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15570 OpVT.getVectorNumElements() / SizeFactor);
15572 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15574 // Insert the 128-bit vector.
15575 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15577 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15579 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15580 if (OpVT == MVT::v4i32)
15583 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15584 return DAG.getBitcast(
15585 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15588 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15589 // simple superregister reference or explicit instructions to insert
15590 // the upper bits of a vector.
15591 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15592 SelectionDAG &DAG) {
15593 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15595 return insert1BitVector(Op, DAG, Subtarget);
15598 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15599 SelectionDAG &DAG) {
15600 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15601 "Only vXi1 extract_subvectors need custom lowering");
15604 SDValue Vec = Op.getOperand(0);
15605 SDValue Idx = Op.getOperand(1);
15607 if (!isa<ConstantSDNode>(Idx))
15610 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15611 if (IdxVal == 0) // the operation is legal
15614 MVT VecVT = Vec.getSimpleValueType();
15615 unsigned NumElems = VecVT.getVectorNumElements();
15617 // Extend to natively supported kshift.
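// E.g. extracting a v2i1 at index 2 from a v4i1 without DQI: the v4i1 is
// placed in the low bits of a v16i1, KSHIFTR moves bit 2 down to bit 0,
// and the low v2i1 is extracted below.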
15618 MVT WideVecVT = VecVT;
15619 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15620 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15621 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15622 DAG.getUNDEF(WideVecVT), Vec,
15623 DAG.getIntPtrConstant(0, dl));
15626 // Shift to the LSB.
15627 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15628 DAG.getConstant(IdxVal, dl, MVT::i8));
15630 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15631 DAG.getIntPtrConstant(0, dl));
15634 // Returns the appropriate wrapper opcode for a global reference.
15635 unsigned X86TargetLowering::getGlobalWrapperKind(
15636 const GlobalValue *GV, const unsigned char OpFlags) const {
15637 // References to absolute symbols are never PC-relative.
15638 if (GV && GV->isAbsoluteSymbolRef())
15639 return X86ISD::Wrapper;
15641 CodeModel::Model M = getTargetMachine().getCodeModel();
15642 if (Subtarget.isPICStyleRIPRel() &&
15643 (M == CodeModel::Small || M == CodeModel::Kernel))
15644 return X86ISD::WrapperRIP;
15646 // GOTPCREL references must always use RIP.
15647 if (OpFlags == X86II::MO_GOTPCREL)
15648 return X86ISD::WrapperRIP;
15650 return X86ISD::Wrapper;
15653 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15654 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15655 // one of the above mentioned nodes. It has to be wrapped because otherwise
15656 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15657 // be used to form an addressing mode. These wrapped nodes will be selected
15660 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15661 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15663 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15664 // global base reg.
15665 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15667 auto PtrVT = getPointerTy(DAG.getDataLayout());
15668 SDValue Result = DAG.getTargetConstantPool(
15669 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15671 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15672 // With PIC, the address is actually $g + Offset.
15675 DAG.getNode(ISD::ADD, DL, PtrVT,
15676 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15682 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15683 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15685 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15686 // global base reg.
15687 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15689 auto PtrVT = getPointerTy(DAG.getDataLayout());
15690 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15692 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15694 // With PIC, the address is actually $g + Offset.
15697 DAG.getNode(ISD::ADD, DL, PtrVT,
15698 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15704 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15705 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15707 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15708 // global base reg.
15709 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15710 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15712 auto PtrVT = getPointerTy(DAG.getDataLayout());
15713 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15716 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15718 // With PIC, the address is actually $g + Offset.
15719 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15721 DAG.getNode(ISD::ADD, DL, PtrVT,
15722 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15725 // For symbols that require a load from a stub to get the address, emit the
15726 // load.
15727 if (isGlobalStubReference(OpFlag))
15728 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15729 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15735 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15736 // Create the TargetBlockAddressAddress node.
15737 unsigned char OpFlags =
15738 Subtarget.classifyBlockAddressReference();
15739 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15740 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15742 auto PtrVT = getPointerTy(DAG.getDataLayout());
15743 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15744 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15746 // With PIC, the address is actually $g + Offset.
15747 if (isGlobalRelativeToPICBase(OpFlags)) {
15748 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15749 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15755 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15756 const SDLoc &dl, int64_t Offset,
15757 SelectionDAG &DAG) const {
15758 // Create the TargetGlobalAddress node, folding in the constant
15759 // offset if it is legal.
15760 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15761 CodeModel::Model M = DAG.getTarget().getCodeModel();
15762 auto PtrVT = getPointerTy(DAG.getDataLayout());
15764 if (OpFlags == X86II::MO_NO_FLAG &&
15765 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15766 // A direct static reference to a global.
15767 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15770 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15773 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
15775 // With PIC, the address is actually $g + Offset.
15776 if (isGlobalRelativeToPICBase(OpFlags)) {
15777 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15778 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15781 // For globals that require a load from a stub to get the address, emit the
15782 // load.
15783 if (isGlobalStubReference(OpFlags))
15784 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15785 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15787 // If there was a non-zero offset that we didn't fold, create an explicit
15788 // addition for it.
15790 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15791 DAG.getConstant(Offset, dl, PtrVT));
15797 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15798 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15799 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15800 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15804 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15805 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15806 unsigned char OperandFlags, bool LocalDynamic = false) {
15807 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15808 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15810 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15811 GA->getValueType(0),
15815 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15819 SDValue Ops[] = { Chain, TGA, *InFlag };
15820 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15822 SDValue Ops[] = { Chain, TGA };
15823 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15826 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
15827 MFI.setAdjustsStack(true);
15828 MFI.setHasCalls(true);
15830 SDValue Flag = Chain.getValue(1);
15831 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15834 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15836 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15839 SDLoc dl(GA); // ? function entry point might be better
15840 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15841 DAG.getNode(X86ISD::GlobalBaseReg,
15842 SDLoc(), PtrVT), InFlag);
15843 InFlag = Chain.getValue(1);
15845 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15848 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15850 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15852 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15853 X86::RAX, X86II::MO_TLSGD);
15856 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15862 // Get the start address of the TLS block for this module.
15863 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15864 .getInfo<X86MachineFunctionInfo>();
15865 MFI->incNumLocalDynamicTLSAccesses();
15869 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15870 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15873 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15874 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15875 InFlag = Chain.getValue(1);
15876 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15877 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15880 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15884 unsigned char OperandFlags = X86II::MO_DTPOFF;
15885 unsigned WrapperKind = X86ISD::Wrapper;
15886 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15887 GA->getValueType(0),
15888 GA->getOffset(), OperandFlags);
15889 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15891 // Add x@dtpoff with the base.
15892 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15895 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15896 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15897 const EVT PtrVT, TLSModel::Model model,
15898 bool is64Bit, bool isPIC) {
15901 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15902 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15903 is64Bit ? 257 : 256));
15905 SDValue ThreadPointer =
15906 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15907 MachinePointerInfo(Ptr));
15909 unsigned char OperandFlags = 0;
15910 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15911 // the initial exec model.
15912 unsigned WrapperKind = X86ISD::Wrapper;
15913 if (model == TLSModel::LocalExec) {
15914 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15915 } else if (model == TLSModel::InitialExec) {
15917 OperandFlags = X86II::MO_GOTTPOFF;
15918 WrapperKind = X86ISD::WrapperRIP;
15920 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15923 llvm_unreachable("Unexpected model");
15926 // emit "addl x@ntpoff,%eax" (local exec)
15927 // or "addl x@indntpoff,%eax" (initial exec)
15928 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15930 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15931 GA->getOffset(), OperandFlags);
15932 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15934 if (model == TLSModel::InitialExec) {
15935 if (isPIC && !is64Bit) {
15936 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15937 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15941 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15942 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15945 // The address of the thread local variable is the add of the thread
15946 // pointer with the offset of the variable.
15947 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15951 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15953 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15955 if (DAG.getTarget().useEmulatedTLS())
15956 return LowerToTLSEmulatedModel(GA, DAG);
15958 const GlobalValue *GV = GA->getGlobal();
15959 auto PtrVT = getPointerTy(DAG.getDataLayout());
15960 bool PositionIndependent = isPositionIndependent();
15962 if (Subtarget.isTargetELF()) {
15963 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15965 case TLSModel::GeneralDynamic:
15966 if (Subtarget.is64Bit())
15967 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15968 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15969 case TLSModel::LocalDynamic:
15970 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15971 Subtarget.is64Bit());
15972 case TLSModel::InitialExec:
15973 case TLSModel::LocalExec:
15974 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15975 PositionIndependent);
15977 llvm_unreachable("Unknown TLS model.");
15980 if (Subtarget.isTargetDarwin()) {
15981 // Darwin only has one model of TLS. Lower to that.
15982 unsigned char OpFlag = 0;
15983 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15984 X86ISD::WrapperRIP : X86ISD::Wrapper;
15986 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15987 // global base reg.
15988 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15990 OpFlag = X86II::MO_TLVP_PIC_BASE;
15992 OpFlag = X86II::MO_TLVP;
15994 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15995 GA->getValueType(0),
15996 GA->getOffset(), OpFlag);
15997 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15999 // With PIC32, the address is actually $g + Offset.
16001 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
16002 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16005 // Lowering the machine isd will make sure everything is in the right
16007 SDValue Chain = DAG.getEntryNode();
16008 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16009 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16010 SDValue Args[] = { Chain, Offset };
16011 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
16012 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
16013 DAG.getIntPtrConstant(0, DL, true),
16014 Chain.getValue(1), DL);
16016 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
16017 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16018 MFI.setAdjustsStack(true);
16020 // And our return value (tls address) is in the standard call return value
16021 // register.
16022 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
16023 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
16026 if (Subtarget.isTargetKnownWindowsMSVC() ||
16027 Subtarget.isTargetWindowsItanium() ||
16028 Subtarget.isTargetWindowsGNU()) {
16029 // Just use the implicit TLS architecture
16030 // Need to generate something similar to:
16031 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
16033 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
16034 // mov rcx, qword [rdx+rcx*8]
16035 // mov eax, .tls$:tlsvar
16036 // [rax+rcx] contains the address
16037 // Windows 64bit: gs:0x58
16038 // Windows 32bit: fs:__tls_array
16041 SDValue Chain = DAG.getEntryNode();
16043 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
16044 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
16045 // use its literal value of 0x2C.
16046 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
16047 ? Type::getInt8PtrTy(*DAG.getContext(),
16049 : Type::getInt32PtrTy(*DAG.getContext(),
16052 SDValue TlsArray = Subtarget.is64Bit()
16053 ? DAG.getIntPtrConstant(0x58, dl)
16054 : (Subtarget.isTargetWindowsGNU()
16055 ? DAG.getIntPtrConstant(0x2C, dl)
16056 : DAG.getExternalSymbol("_tls_array", PtrVT));
16058 SDValue ThreadPointer =
16059 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
16062 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
16063 res = ThreadPointer;
16065 // Load the _tls_index variable
16066 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
16067 if (Subtarget.is64Bit())
16068 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
16069 MachinePointerInfo(), MVT::i32);
16071 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
16073 auto &DL = DAG.getDataLayout();
16075 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
16076 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
16078 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
16081 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
16083 // Get the offset of the start of the .tls section
16084 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16085 GA->getValueType(0),
16086 GA->getOffset(), X86II::MO_SECREL);
16087 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
16089 // The address of the thread local variable is the add of the thread
16090 // pointer with the offset of the variable.
16091 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
16094 llvm_unreachable("TLS not implemented for this target.");
16097 /// Lower SRA_PARTS and friends, which return two i32 values
16098 /// and take a 2 x i32 value to shift plus a shift amount.
16099 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
16100 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
16101 MVT VT = Op.getSimpleValueType();
16102 unsigned VTBits = VT.getSizeInBits();
16104 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
16105 SDValue ShOpLo = Op.getOperand(0);
16106 SDValue ShOpHi = Op.getOperand(1);
16107 SDValue ShAmt = Op.getOperand(2);
16108 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
16109 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away
16110 // anyway.
16111 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16112 DAG.getConstant(VTBits - 1, dl, MVT::i8));
16113 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
16114 DAG.getConstant(VTBits - 1, dl, MVT::i8))
16115 : DAG.getConstant(0, dl, VT);
16117 SDValue Tmp2, Tmp3;
16118 if (Op.getOpcode() == ISD::SHL_PARTS) {
16119 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
16120 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
16122 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
16123 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
16126 // If the shift amount is larger or equal than the width of a part we can't
16127 // rely on the results of shld/shrd. Insert a test and select the appropriate
16128 // values for large shift amounts.
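// E.g. for a 64-bit SHL by 40 lowered as SHL_PARTS on a 32-bit target:
// SafeShAmt is 8, Tmp3 is ShOpLo << 8, and because bit 5 of the shift amount
// is set the CMOVs below pick Hi = ShOpLo << 8 and Lo = 0.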
16129 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16130 DAG.getConstant(VTBits, dl, MVT::i8));
16131 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
16132 AndNode, DAG.getConstant(0, dl, MVT::i8));
16135 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
16136 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
16137 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
16139 if (Op.getOpcode() == ISD::SHL_PARTS) {
16140 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
16141 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
16143 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
16144 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
16147 SDValue Ops[2] = { Lo, Hi };
16148 return DAG.getMergeValues(Ops, dl);
16151 // Try to use a packed vector operation to handle i64 on 32-bit targets when
16152 // AVX512DQ is enabled.
16153 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
16154 const X86Subtarget &Subtarget) {
16155 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
16156 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
16157 SDValue Src = Op.getOperand(0);
16158 MVT SrcVT = Src.getSimpleValueType();
16159 MVT VT = Op.getSimpleValueType();
16161 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
16162 (VT != MVT::f32 && VT != MVT::f64))
16165 // Pack the i64 into a vector, do the operation and extract.
16167 // Use a 256-bit vector to ensure the result is 128 bits for the f32 case.
16168 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
16169 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
16170 MVT VecVT = MVT::getVectorVT(VT, NumElts);
16173 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
16174 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
16175 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
16176 DAG.getIntPtrConstant(0, dl));
16179 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
16180 SelectionDAG &DAG) const {
16181 SDValue Src = Op.getOperand(0);
16182 MVT SrcVT = Src.getSimpleValueType();
16183 MVT VT = Op.getSimpleValueType();
16186 if (SrcVT.isVector()) {
16187 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
16188 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16189 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16190 DAG.getUNDEF(SrcVT)));
16195 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16196 "Unknown SINT_TO_FP to lower!");
16198 // These are really Legal; return the operand so the caller accepts it as
16199 // Legal.
16200 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
16202 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
16206 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16209 SDValue ValueToStore = Op.getOperand(0);
16210 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
16211 !Subtarget.is64Bit())
16212 // Bitcasting to f64 here allows us to do a single 64-bit store from
16213 // an SSE register, avoiding the store forwarding penalty that would come
16214 // with two 32-bit stores.
16215 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16217 unsigned Size = SrcVT.getSizeInBits()/8;
16218 MachineFunction &MF = DAG.getMachineFunction();
16219 auto PtrVT = getPointerTy(MF.getDataLayout());
16220 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16221 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16222 SDValue Chain = DAG.getStore(
16223 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16224 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16225 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
16228 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16230 SelectionDAG &DAG) const {
16234 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
16236 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16238 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16240 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16242 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16243 MachineMemOperand *MMO;
16245 int SSFI = FI->getIndex();
16246 MMO = DAG.getMachineFunction().getMachineMemOperand(
16247 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16248 MachineMemOperand::MOLoad, ByteSize, ByteSize);
16250 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16251 StackSlot = StackSlot.getOperand(1);
16253 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16254 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16256 Tys, Ops, SrcVT, MMO);
16259 Chain = Result.getValue(1);
16260 SDValue InFlag = Result.getValue(2);
16262 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16263 // shouldn't be necessary except that RFP cannot be live across
16264 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16265 MachineFunction &MF = DAG.getMachineFunction();
16266 unsigned SSFISize = Op.getValueSizeInBits()/8;
16267 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16268 auto PtrVT = getPointerTy(MF.getDataLayout());
16269 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16270 Tys = DAG.getVTList(MVT::Other);
16272 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16274 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16275 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16276 MachineMemOperand::MOStore, SSFISize, SSFISize);
16278 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16279 Ops, Op.getValueType(), MMO);
16280 Result = DAG.getLoad(
16281 Op.getValueType(), DL, Chain, StackSlot,
16282 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16288 /// 64-bit unsigned integer to double expansion.
16289 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16290 const X86Subtarget &Subtarget) {
16291 // This algorithm is not obvious. Here is what we're trying to output:
16294 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16295 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16297 haddpd %xmm0, %xmm0
16299 pshufd $0x4e, %xmm0, %xmm1
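// In essence: the punpckldq forms two doubles whose bit patterns are
// 2^52 + lo32 and 2^84 + hi32 * 2^32 (the 32-bit halves land in the
// mantissas), the subpd removes the 2^52 and 2^84 biases, and the final
// horizontal add yields lo32 + hi32 * 2^32, i.e. the original value.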
16305 LLVMContext *Context = DAG.getContext();
16307 // Build some magic constants.
16308 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16309 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16310 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16311 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16313 SmallVector<Constant*,2> CV1;
16315 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16316 APInt(64, 0x4330000000000000ULL))));
16318 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16319 APInt(64, 0x4530000000000000ULL))));
16320 Constant *C1 = ConstantVector::get(CV1);
16321 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16323 // Load the 64-bit value into an XMM register.
16324 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16327 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16328 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16329 /* Alignment = */ 16);
16331 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16334 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16335 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16336 /* Alignment = */ 16);
16337 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16338 // TODO: Are there any fast-math-flags to propagate here?
16339 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
16342 if (Subtarget.hasSSE3()) {
16343 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16344 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16346 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16347 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16348 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16349 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16352 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16353 DAG.getIntPtrConstant(0, dl));
16356 /// 32-bit unsigned integer to float expansion.
16357 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16358 const X86Subtarget &Subtarget) {
16360 // FP constant to bias correct the final result.
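// 0x4330000000000000 is the bit pattern of the double 2^52; OR'ing the
// 32-bit value into its low mantissa bits yields exactly 2^52 + x, and
// subtracting the bias below recovers x as an exact double.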
16361 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16364 // Load the 32-bit value into an XMM register.
16365 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16368 // Zero out the upper parts of the register.
16369 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16371 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16372 DAG.getBitcast(MVT::v2f64, Load),
16373 DAG.getIntPtrConstant(0, dl));
16375 // Or the load with the bias.
16376 SDValue Or = DAG.getNode(
16377 ISD::OR, dl, MVT::v2i64,
16378 DAG.getBitcast(MVT::v2i64,
16379 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16380 DAG.getBitcast(MVT::v2i64,
16381 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16383 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16384 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16386 // Subtract the bias.
16387 // TODO: Are there any fast-math-flags to propagate here?
16388 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
16390 // Handle final rounding.
16391 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
16394 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16395 const X86Subtarget &Subtarget,
16397 if (Op.getSimpleValueType() != MVT::v2f64)
16400 SDValue N0 = Op.getOperand(0);
16401 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16403 // Legalize to v4i32 type.
16404 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16405 DAG.getUNDEF(MVT::v2i32));
16407 if (Subtarget.hasAVX512())
16408 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16410 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16411 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
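// E.g. for the lane value 0x89ABCDEF: HI is 0x89AB and LO is 0xCDEF, and the
// result below is 0x89AB * 65536.0 + 0xCDEF, with both halves converted
// exactly by the signed v2i32 -> v2f64 conversion.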
16412 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16413 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16415 // Two to the power of half-word-size.
16416 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16418 // Clear upper part of LO, lower HI.
16419 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16420 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
16422 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16423 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16424 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16426 // Add the two halves.
16427 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
16430 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16431 const X86Subtarget &Subtarget) {
16432 // The algorithm is the following:
16433 // #ifdef __SSE4_1__
16434 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16435 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16436 // (uint4) 0x53000000, 0xaa);
16438 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16439 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16441 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16442 // return (float4) lo + fhi;
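// Why this works: 0x4b000000 is 2^23 as a float, so lo's bit pattern is the
// float 2^23 + (v & 0xffff); likewise 0x53000000 is 2^39, so hi is
// 2^39 + (v >> 16) * 2^16. Subtracting (2^39 + 2^23) from hi and adding lo
// leaves (v >> 16) * 2^16 + (v & 0xffff), i.e. v, with one rounding step.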
16444 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16445 // reassociate the two FADDs, and if we do that, the algorithm fails
16446 // spectacularly (PR24512).
16447 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16448 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16449 // there's also the MachineCombiner reassociations happening on Machine IR.
16450 if (DAG.getTarget().Options.UnsafeFPMath)
16454 SDValue V = Op->getOperand(0);
16455 MVT VecIntVT = V.getSimpleValueType();
16456 bool Is128 = VecIntVT == MVT::v4i32;
16457 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16458 // If we convert to something other than the supported type, e.g., to v4f64,
16460 if (VecFloatVT != Op->getSimpleValueType(0))
16463 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16464 "Unsupported custom type");
16466 // In the #ifdef/#else code, we have in common:
16467 // - The vector of constants:
16473 // Create the splat vector for 0x4b000000.
16474 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16475 // Create the splat vector for 0x53000000.
16476 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16478 // Create the right shift.
16479 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16480 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16483 if (Subtarget.hasSSE41()) {
16484 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16485 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16486 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16487 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16488 // Low will be bitcasted right away, so do not bother bitcasting back to its
16489 // original type.
16490 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16491 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16492 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16493 // (uint4) 0x53000000, 0xaa);
16494 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16495 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16496 // High will be bitcasted right away, so do not bother bitcasting back to
16497 // its original type.
16498 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16499 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16501 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16502 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16503 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16504 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16506 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16507 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16510 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
16511 SDValue VecCstFAdd = DAG.getConstantFP(
16512 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16514 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16515 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16516 // TODO: Are there any fast-math-flags to propagate here?
16518 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16519 // return (float4) lo + fhi;
16520 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16521 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16524 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16525 const X86Subtarget &Subtarget) {
16526 SDValue N0 = Op.getOperand(0);
16527 MVT SrcVT = N0.getSimpleValueType();
16530 switch (SrcVT.SimpleTy) {
16532 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16534 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16537 assert(!Subtarget.hasAVX512());
16538 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16542 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16543 SelectionDAG &DAG) const {
16544 SDValue N0 = Op.getOperand(0);
16546 auto PtrVT = getPointerTy(DAG.getDataLayout());
16548 if (Op.getSimpleValueType().isVector())
16549 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16551 MVT SrcVT = N0.getSimpleValueType();
16552 MVT DstVT = Op.getSimpleValueType();
16554 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16555 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16556 // Conversions from unsigned i32 to f32/f64 are legal,
16557 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16561 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16564 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16565 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16566 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16567 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16568 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16571 // Make a 64-bit buffer, and use it to build an FILD.
16572 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16573 if (SrcVT == MVT::i32) {
16574 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16575 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16576 StackSlot, MachinePointerInfo());
16577 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16578 OffsetSlot, MachinePointerInfo());
16579 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16583 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16584 SDValue ValueToStore = Op.getOperand(0);
16585 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16586 // Bitcasting to f64 here allows us to do a single 64-bit store from
16587 // an SSE register, avoiding the store forwarding penalty that would come
16588 // with two 32-bit stores.
16589 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16590 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16591 MachinePointerInfo());
16592 // For i64 source, we need to add the appropriate power of 2 if the input
16593 // was negative. This is the same as the optimization in
16594 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16595 // we must be careful to do the computation in x87 extended precision, not
16596 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16597 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16598 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16599 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16600 MachineMemOperand::MOLoad, 8, 8);
16602 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16603 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16604 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16607 APInt FF(32, 0x5F800000ULL);
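// 0x5F800000 is 2^64 as an IEEE-754 single. FILD interprets the stored 64-bit
// value as signed, so when the sign bit of the unsigned input is set the
// loaded result is 2^64 too small; the fudge factor selected below adds
// either 2^64 or 0.0 back in.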
16609 // Check whether the sign bit is set.
16610 SDValue SignSet = DAG.getSetCC(
16611 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16612 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16614 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16615 SDValue FudgePtr = DAG.getConstantPool(
16616 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16618 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16619 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16620 SDValue Four = DAG.getIntPtrConstant(4, dl);
16621 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16622 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16624 // Load the value out, extending it from f32 to f80.
16625 // FIXME: Avoid the extend by constructing the right constant pool?
16626 SDValue Fudge = DAG.getExtLoad(
16627 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16628 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16629 /* Alignment = */ 4);
16630 // Extend everything to 80 bits to force it to be done on x87.
16631 // TODO: Are there any fast-math-flags to propagate here?
16632 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16633 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16634 DAG.getIntPtrConstant(0, dl));
16637 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16638 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16639 // just return an <SDValue(), SDValue()> pair.
16640 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16641 // to i16, i32 or i64, and we lower it to a legal sequence.
16642 // If lowered to the final integer result we return a <result, SDValue()> pair.
16643 // Otherwise we lower it to a sequence ending with a FIST, return a
16644 // <FIST, StackSlot> pair, and the caller is responsible for loading
16645 // the final integer result from StackSlot.
16646 std::pair<SDValue,SDValue>
16647 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16648 bool IsSigned, bool IsReplace) const {
16651 EVT DstTy = Op.getValueType();
16652 EVT TheVT = Op.getOperand(0).getValueType();
16653 auto PtrVT = getPointerTy(DAG.getDataLayout());
16655 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16656 // f16 must be promoted before using the lowering in this routine.
16657 // fp128 does not use this lowering.
16658 return std::make_pair(SDValue(), SDValue());
16661 // If using FIST to compute an unsigned i64, we'll need some fixup
16662 // to handle values above the maximum signed i64. A FIST is always
16663 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16664 bool UnsignedFixup = !IsSigned &&
16665 DstTy == MVT::i64 &&
16666 (!Subtarget.is64Bit() ||
16667 !isScalarFPTypeInSSEReg(TheVT));
16669 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16670 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16671 // The low 32 bits of the fist result will have the correct uint32 result.
16672 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16676 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16677 DstTy.getSimpleVT() >= MVT::i16 &&
16678 "Unknown FP_TO_INT to lower!");
16680 // These are really Legal.
16681 if (DstTy == MVT::i32 &&
16682 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16683 return std::make_pair(SDValue(), SDValue());
16684 if (Subtarget.is64Bit() &&
16685 DstTy == MVT::i64 &&
16686 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16687 return std::make_pair(SDValue(), SDValue());
16689 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16690 // stack slot.
16691 MachineFunction &MF = DAG.getMachineFunction();
16692 unsigned MemSize = DstTy.getSizeInBits()/8;
16693 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16694 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16697 switch (DstTy.getSimpleVT().SimpleTy) {
16698 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16699 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16700 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16701 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16704 SDValue Chain = DAG.getEntryNode();
16705 SDValue Value = Op.getOperand(0);
16706 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16708 if (UnsignedFixup) {
16710 // Conversion to unsigned i64 is implemented with a select,
16711 // depending on whether the source value fits in the range
16712 // of a signed i64. Let Thresh be the FP equivalent of
16713 // 0x8000000000000000ULL.
16715 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16716 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16717 // Fist-to-mem64 FistSrc
16718 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16719 // to XOR'ing the high 32 bits with Adjust.
16721 // Being a power of 2, Thresh is exactly representable in all FP formats.
16722 // For X87 we'd like to use the smallest FP type for this constant, but
16723 // for DAG type consistency we have to match the FP operand type.
16725 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
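// 0x5f000000 encodes 2^63 (i.e. 0x8000000000000000) in single precision.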
16726 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16727 bool LosesInfo = false;
16728 if (TheVT == MVT::f64)
16729 // The rounding mode is irrelevant as the conversion should be exact.
16730 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16732 else if (TheVT == MVT::f80)
16733 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16734 APFloat::rmNearestTiesToEven, &LosesInfo);
16736 assert(Status == APFloat::opOK && !LosesInfo &&
16737 "FP conversion should have been exact");
16739 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16741 SDValue Cmp = DAG.getSetCC(DL,
16742 getSetCCResultType(DAG.getDataLayout(),
16743 *DAG.getContext(), TheVT),
16744 Value, ThreshVal, ISD::SETLT);
16745 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16746 DAG.getConstant(0, DL, MVT::i32),
16747 DAG.getConstant(0x80000000, DL, MVT::i32));
16748 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16749 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16750 *DAG.getContext(), TheVT),
16751 Value, ThreshVal, ISD::SETLT);
16752 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16755 // FIXME: This causes a redundant load/store if the SSE-class value is already
16756 // in memory, such as when it is on the call stack.
16757 if (isScalarFPTypeInSSEReg(TheVT)) {
16758 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16759 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16760 MachinePointerInfo::getFixedStack(MF, SSFI));
16761 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16763 Chain, StackSlot, DAG.getValueType(TheVT)
16766 MachineMemOperand *MMO =
16767 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16768 MachineMemOperand::MOLoad, MemSize, MemSize);
16769 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16770 Chain = Value.getValue(1);
16771 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16772 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16775 MachineMemOperand *MMO =
16776 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16777 MachineMemOperand::MOStore, MemSize, MemSize);
16779 if (UnsignedFixup) {
16781 // Insert the FIST, load its result as two i32's,
16782 // and XOR the high i32 with Adjust.
16784 SDValue FistOps[] = { Chain, Value, StackSlot };
16785 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16786 FistOps, DstTy, MMO);
16789 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16790 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16793 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16794 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16796 if (Subtarget.is64Bit()) {
16797 // Join High32 and Low32 into a 64-bit result.
16798 // (High32 << 32) | Low32
16799 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16800 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16801 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16802 DAG.getConstant(32, DL, MVT::i8));
16803 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16804 return std::make_pair(Result, SDValue());
16807 SDValue ResultOps[] = { Low32, High32 };
16809 SDValue pair = IsReplace
16810 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16811 : DAG.getMergeValues(ResultOps, DL);
16812 return std::make_pair(pair, SDValue());
16814 // Build the FP_TO_INT*_IN_MEM
16815 SDValue Ops[] = { Chain, Value, StackSlot };
16816 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16818 return std::make_pair(FIST, StackSlot);
16822 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16823 const X86Subtarget &Subtarget) {
16824 MVT VT = Op->getSimpleValueType(0);
16825 SDValue In = Op->getOperand(0);
16826 MVT InVT = In.getSimpleValueType();
16829 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
16830 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16831 "Expected same number of elements");
16832 assert((VT.getVectorElementType() == MVT::i16 ||
16833 VT.getVectorElementType() == MVT::i32 ||
16834 VT.getVectorElementType() == MVT::i64) &&
16835 "Unexpected element type");
16836 assert((InVT.getVectorElementType() == MVT::i8 ||
16837 InVT.getVectorElementType() == MVT::i16 ||
16838 InVT.getVectorElementType() == MVT::i32) &&
16839 "Unexpected element type");
16841 if (Subtarget.hasInt256())
16842 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16844 // Optimize vectors in AVX mode:
16847 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16848 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16849 // Concat upper and lower parts.
16852 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16853 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16854 // Concat upper and lower parts.
16857 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16858 SDValue Undef = DAG.getUNDEF(InVT);
16859 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16860 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16861 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16863 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16864 VT.getVectorNumElements()/2);
16866 OpLo = DAG.getBitcast(HVT, OpLo);
16867 OpHi = DAG.getBitcast(HVT, OpHi);
16869 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16872 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
16873 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16874 const SDLoc &dl, SelectionDAG &DAG) {
16875 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16876 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16877 DAG.getIntPtrConstant(0, dl));
16878 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16879 DAG.getIntPtrConstant(8, dl));
16880 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16881 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16882 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16883 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16886 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16887 const X86Subtarget &Subtarget,
16888 SelectionDAG &DAG) {
16889 MVT VT = Op->getSimpleValueType(0);
16890 SDValue In = Op->getOperand(0);
16891 MVT InVT = In.getSimpleValueType();
16892 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16894 unsigned NumElts = VT.getVectorNumElements();
16896 // For all vectors except vXi8 we can just emit a sign_extend and a shift. This
16897 // avoids a constant pool load.
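// For example, zero-extending v8i1 -> v8i32: the sign_extend yields 0 or
// 0xFFFFFFFF per element, and a logical shift right by 31 turns that into
// 0 or 1 without materializing a vector of ones from the constant pool.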
16898 if (VT.getVectorElementType() != MVT::i8) {
16899 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
16900 return DAG.getNode(ISD::SRL, DL, VT, Extend,
16901 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
16904 // Extend VT if BWI is not supported.
16906 if (!Subtarget.hasBWI()) {
16907 // If v16i32 is to be avoided, we'll need to split and concatenate.
16908 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16909 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16911 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16914 // Widen to 512-bits if VLX is not supported.
16915 MVT WideVT = ExtVT;
16916 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16917 NumElts *= 512 / ExtVT.getSizeInBits();
16918 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16919 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16920 In, DAG.getIntPtrConstant(0, DL));
16921 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16925 SDValue One = DAG.getConstant(1, DL, WideVT);
16926 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16928 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16930 // Truncate if we had to extend above.
16932 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
16933 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16936 // Extract back to 128/256-bit if we widened.
16938 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16939 DAG.getIntPtrConstant(0, DL));
16941 return SelectedVal;
16944 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16945 SelectionDAG &DAG) {
16946 SDValue In = Op.getOperand(0);
16947 MVT SVT = In.getSimpleValueType();
16949 if (SVT.getVectorElementType() == MVT::i1)
16950 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16952 assert(Subtarget.hasAVX() && "Expected AVX support");
16953 return LowerAVXExtend(Op, DAG, Subtarget);
16956 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16957 /// It makes use of the fact that vectors with enough leading sign/zero bits
16958 /// prevent the PACKSS/PACKUS from saturating the results.
16959 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16960 /// within each 128-bit lane.
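/// For example (illustrative): truncating v8i32 -> v8i16 where each element
/// has 16 known leading zero bits is a single PACKUSDW of the two 128-bit
/// halves; wider sources recurse, halving the element width at each step.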
16961 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16962 const SDLoc &DL, SelectionDAG &DAG,
16963 const X86Subtarget &Subtarget) {
16964 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16965 "Unexpected PACK opcode");
16967 // Requires SSE2 but AVX512 has fast vector truncate.
16968 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
16971 EVT SrcVT = In.getValueType();
16973 // No truncation required, we might get here due to recursive calls.
16974 if (SrcVT == DstVT)
16977 // We only support vector truncation to 64bits or greater from a
16978 // 128bits or greater source.
16979 unsigned DstSizeInBits = DstVT.getSizeInBits();
16980 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16981 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
16984 unsigned NumElems = SrcVT.getVectorNumElements();
16985 if (!isPowerOf2_32(NumElems))
16988 LLVMContext &Ctx = *DAG.getContext();
16989 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16990 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16992 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16994 // Pack to the largest type possible:
16995 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16996 EVT InVT = MVT::i16, OutVT = MVT::i8;
16997 if (SrcVT.getScalarSizeInBits() > 16 &&
16998 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
17003 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
17004 if (SrcVT.is128BitVector()) {
17005 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
17006 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
17007 In = DAG.getBitcast(InVT, In);
17008 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
17009 Res = extractSubVector(Res, 0, DAG, DL, 64);
17010 return DAG.getBitcast(DstVT, Res);
17013 // Extract lower/upper subvectors.
17014 unsigned NumSubElts = NumElems / 2;
17015 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17016 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17018 unsigned SubSizeInBits = SrcSizeInBits / 2;
17019 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
17020 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
17022 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
17023 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
17024 Lo = DAG.getBitcast(InVT, Lo);
17025 Hi = DAG.getBitcast(InVT, Hi);
17026 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17027 return DAG.getBitcast(DstVT, Res);
17030 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
17031 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
17032 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
17033 Lo = DAG.getBitcast(InVT, Lo);
17034 Hi = DAG.getBitcast(InVT, Hi);
17035 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17037 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
17038 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
17039 Res = DAG.getBitcast(MVT::v4i64, Res);
17040 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
17042 if (DstVT.is256BitVector())
17043 return DAG.getBitcast(DstVT, Res);
17045 // If 512bit -> 128bit truncate another stage.
17046 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17047 Res = DAG.getBitcast(PackedVT, Res);
17048 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17051 // Recursively pack lower/upper subvectors, concat result and pack again.
17052 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
17053 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
17054 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
17055 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
17057 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17058 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
17059 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17062 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
17063 const X86Subtarget &Subtarget) {
17066 MVT VT = Op.getSimpleValueType();
17067 SDValue In = Op.getOperand(0);
17068 MVT InVT = In.getSimpleValueType();
17070 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
17072 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
17073 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
17074 if (InVT.getScalarSizeInBits() <= 16) {
17075 if (Subtarget.hasBWI()) {
17076 // Legal: this will be selected to VPMOVB2M/VPMOVW2M.
17077 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17078 // We need to shift to get the lsb into sign position.
17079 // Shifting packed bytes is not supported natively, so bitcast to words.
17080 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
17081 In = DAG.getNode(ISD::SHL, DL, ExtVT,
17082 DAG.getBitcast(ExtVT, In),
17083 DAG.getConstant(ShiftInx, DL, ExtVT));
17084 In = DAG.getBitcast(InVT, In);
17086 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17089 // Use TESTD/Q, extended vector to packed dword/qword.
17090 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
17091 "Unexpected vector type.");
17092 unsigned NumElts = InVT.getVectorNumElements();
17093 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
17094 // We need to change to a wider element type that we have support for.
17095 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
17096 // For 16 element vectors we extend to v16i32 unless we are explicitly
17097 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
17098 // we need to split into two 8 element vectors which we can extend to v8i32,
17099 // truncate and concat the results. There's an additional complication if
17100 // the original type is v16i8. In that case we can't split the v16i8 so
17101 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
17102 // to v8i32, truncate that to v8i1 and concat the two halves.
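// Illustrative flow for v16i8 -> v16i1 without 512-bit vectors:
//   v16i8 --sext--> v16i16 --split--> 2 x v8i16 --trunc--> 2 x v8i1
//   --concat--> v16i1
// Each v8i16 truncate re-enters this lowering and takes the 8-element path.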
17103 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
17104 if (InVT == MVT::v16i8) {
17105 // First we need to sign extend up to 256-bits so we can split that.
17106 InVT = MVT::v16i16;
17107 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
17109 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
17110 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
17111 // We're split now, just emit two truncates and a concat. The two
17112 // truncates will trigger legalization to come back to this function.
17113 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
17114 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
17115 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17117 // We either have 8 elements or we're allowed to use 512-bit vectors.
17118 // If we have VLX, we want to use the narrowest vector that can get the
17119 // job done so we use vXi32.
17120 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
17121 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
17122 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
17124 ShiftInx = InVT.getScalarSizeInBits() - 1;
17127 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17128 // We need to shift to get the lsb into sign position.
17129 In = DAG.getNode(ISD::SHL, DL, InVT, In,
17130 DAG.getConstant(ShiftInx, DL, InVT));
17132 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
17133 if (Subtarget.hasDQI())
17134 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17136 return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
17140 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
17142 MVT VT = Op.getSimpleValueType();
17143 SDValue In = Op.getOperand(0);
17144 MVT InVT = In.getSimpleValueType();
17145 unsigned InNumEltBits = InVT.getScalarSizeInBits();
17147 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
17148 "Invalid TRUNCATE operation");
17150 if (VT.getVectorElementType() == MVT::i1)
17151 return LowerTruncateVecI1(Op, DAG, Subtarget);
17153 // vpmovqb/w/d, vpmovdb/w, vpmovwb
17154 if (Subtarget.hasAVX512()) {
17155 // word to byte only under BWI
17156 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
17157 // Make sure we're allowed to promote 512-bits.
17158 if (Subtarget.canExtendTo512DQ())
17159 return DAG.getNode(ISD::TRUNCATE, DL, VT,
17160 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
17166 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
17167 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
17169 // Truncate with PACKUS if we are truncating a vector with leading zero bits
17170 // that extend all the way to the packed/truncated value.
17171 // Pre-SSE41 we can only use PACKUSWB.
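// For example (illustrative), v8i32 -> v8i16 can use PACKUSDW when every i32
// element has at least 16 known leading zero bits: the unsigned saturation
// never triggers, so the low 16 bits pass through unchanged.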
17173 DAG.computeKnownBits(In, Known);
17174 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
17176 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
17179 // Truncate with PACKSS if we are truncating a vector with sign-bits that
17180 // extend all the way to the packed/truncated value.
17181 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
17183 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
17186 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
17187 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
17188 if (Subtarget.hasInt256()) {
17189 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
17190 In = DAG.getBitcast(MVT::v8i32, In);
17191 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
17192 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
17193 DAG.getIntPtrConstant(0, DL));
17196 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17197 DAG.getIntPtrConstant(0, DL));
17198 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17199 DAG.getIntPtrConstant(2, DL));
17200 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17201 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17202 static const int ShufMask[] = {0, 2, 4, 6};
17203 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
17206 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
17207 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17208 if (Subtarget.hasInt256()) {
17209 In = DAG.getBitcast(MVT::v32i8, In);
17211 // The PSHUFB mask:
17212 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
17213 -1, -1, -1, -1, -1, -1, -1, -1,
17214 16, 17, 20, 21, 24, 25, 28, 29,
17215 -1, -1, -1, -1, -1, -1, -1, -1 };
17216 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
17217 In = DAG.getBitcast(MVT::v4i64, In);
17219 static const int ShufMask2[] = {0, 2, -1, -1};
17220 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
17221 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17222 DAG.getIntPtrConstant(0, DL));
17223 return DAG.getBitcast(VT, In);
17226 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17227 DAG.getIntPtrConstant(0, DL));
17229 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17230 DAG.getIntPtrConstant(4, DL));
17232 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
17233 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
17235 // The PSHUFB mask:
17236 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
17237 -1, -1, -1, -1, -1, -1, -1, -1};
17239 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
17240 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
17242 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17243 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17245 // The MOVLHPS Mask:
17246 static const int ShufMask2[] = {0, 1, 4, 5};
17247 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
17248 return DAG.getBitcast(MVT::v8i16, res);
17251 // Handle truncation of V256 to V128 using shuffles.
17252 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
17254 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
17256 unsigned NumElems = VT.getVectorNumElements();
17257 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
17259 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
17260 // Prepare truncation shuffle mask
17261 for (unsigned i = 0; i != NumElems; ++i)
17262 MaskVec[i] = i * 2;
17263 In = DAG.getBitcast(NVT, In);
17264 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
17265 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
17266 DAG.getIntPtrConstant(0, DL));
17269 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
17270 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
17271 MVT VT = Op.getSimpleValueType();
17273 if (VT.isVector()) {
17274 SDValue Src = Op.getOperand(0);
17277 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
17278 MVT ResVT = MVT::v4i32;
17279 MVT TruncVT = MVT::v4i1;
17280 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
17281 if (!IsSigned && !Subtarget.hasVLX()) {
17282 // Widen to 512-bits.
17283 ResVT = MVT::v8i32;
17284 TruncVT = MVT::v8i1;
17285 Opc = ISD::FP_TO_UINT;
17286 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
17287 DAG.getUNDEF(MVT::v8f64),
17288 Src, DAG.getIntPtrConstant(0, dl));
17290 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
17291 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
17292 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
17293 DAG.getIntPtrConstant(0, dl));
17296 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
17297 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
17298 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
17299 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
17300 DAG.getUNDEF(MVT::v2f32)));
17306 assert(!VT.isVector());
17308 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
17309 IsSigned, /*IsReplace=*/ false);
17310 SDValue FIST = Vals.first, StackSlot = Vals.second;
17311 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
17312 if (!FIST.getNode())
17315 if (StackSlot.getNode())
17316 // Load the result.
17317 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
17319 // The node is the result.
17323 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
17325 MVT VT = Op.getSimpleValueType();
17326 SDValue In = Op.getOperand(0);
17327 MVT SVT = In.getSimpleValueType();
17329 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
17331 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
17332 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
17333 In, DAG.getUNDEF(SVT)));
17336 /// The only differences between FABS and FNEG are the mask and the logic op.
17337 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
17338 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
17339 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
17340 "Wrong opcode for lowering FABS or FNEG.");
17342 bool IsFABS = (Op.getOpcode() == ISD::FABS);
17344 // If this is a FABS and it has an FNEG user, bail out to fold the combination
17345 // into an FNABS. We'll lower the FABS after that if it is still in use.
17347 for (SDNode *User : Op->uses())
17348 if (User->getOpcode() == ISD::FNEG)
17352 MVT VT = Op.getSimpleValueType();
17354 bool IsF128 = (VT == MVT::f128);
17356 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
17357 // decide if we should generate a 16-byte constant mask when we only need 4 or
17358 // 8 bytes for the scalar case.
17363 if (VT.isVector()) {
17365 EltVT = VT.getVectorElementType();
17366 } else if (IsF128) {
17367 // SSE instructions are used for optimized f128 logical operations.
17368 LogicVT = MVT::f128;
17371 // There are no scalar bitwise logical SSE/AVX instructions, so we
17372 // generate a 16-byte vector constant and logic op even for the scalar case.
17373 // Using a 16-byte mask allows folding the load of the mask with
17374 // the logic op, so it can save (~4 bytes) on code size.
17375 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17379 unsigned EltBits = EltVT.getSizeInBits();
17380 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
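// e.g. for f32: FABS is an AND with 0x7FFFFFFF (clear the sign bit), FNEG is
// an XOR with 0x80000000 (flip it), and FNABS ORs the sign bit in.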
17382 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
17383 const fltSemantics &Sem =
17384 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
17385 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17386 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
17388 SDValue Op0 = Op.getOperand(0);
17389 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
17391 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
17392 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
17394 if (VT.isVector() || IsF128)
17395 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17397 // For the scalar case extend to a 128-bit vector, perform the logic op,
17398 // and extract the scalar result back out.
17399 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
17400 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17401 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
17402 DAG.getIntPtrConstant(0, dl));
17405 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
17406 SDValue Mag = Op.getOperand(0);
17407 SDValue Sign = Op.getOperand(1);
17410 // If the sign operand is smaller, extend it first.
17411 MVT VT = Op.getSimpleValueType();
17412 if (Sign.getSimpleValueType().bitsLT(VT))
17413 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
17415 // And if it is bigger, shrink it first.
17416 if (Sign.getSimpleValueType().bitsGT(VT))
17417 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
17419 // At this point the operands and the result should have the same
17420 // type, and that won't be f80 since that is not custom lowered.
17421 bool IsF128 = (VT == MVT::f128);
17422 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
17423 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
17424 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
17425 "Unexpected type in LowerFCOPYSIGN");
17427 MVT EltVT = VT.getScalarType();
17428 const fltSemantics &Sem =
17429 EltVT == MVT::f64 ? APFloat::IEEEdouble()
17430 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17432 // Perform all scalar logic operations as 16-byte vectors because there are no
17433 // scalar FP logic instructions in SSE.
17434 // TODO: This isn't necessary. If we used scalar types, we might avoid some
17435 // unnecessary splats, but we might miss load folding opportunities. Should
17436 // this decision be based on OptimizeForSize?
17437 bool IsFakeVector = !VT.isVector() && !IsF128;
17440 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17442 // The mask constants are automatically splatted for vector types.
17443 unsigned EltSizeInBits = VT.getScalarSizeInBits();
17444 SDValue SignMask = DAG.getConstantFP(
17445 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17446 SDValue MagMask = DAG.getConstantFP(
17447 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17449 // First, clear all bits but the sign bit from the second operand (sign).
17451 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17452 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17454 // Next, clear the sign bit from the first operand (magnitude).
17455 // TODO: If we had general constant folding for FP logic ops, this check
17456 // wouldn't be necessary.
17458 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
17459 APFloat APF = Op0CN->getValueAPF();
17461 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
17463 // If the magnitude operand wasn't a constant, we need to AND out the sign.
17465 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
17466 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
17469 // OR the magnitude value with the sign bit.
17470 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
17471 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
17472 DAG.getIntPtrConstant(0, dl));
17475 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
17476 SDValue N0 = Op.getOperand(0);
17478 MVT VT = Op.getSimpleValueType();
17480 MVT OpVT = N0.getSimpleValueType();
17481 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17482 "Unexpected type for FGETSIGN");
17484 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17485 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17486 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17487 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
17488 Res = DAG.getZExtOrTrunc(Res, dl, VT);
17489 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
17493 /// Helper for creating a X86ISD::SETCC node.
17494 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17495 SelectionDAG &DAG) {
17496 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17497 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17500 // Check whether an OR'd tree is PTEST-able.
17501 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17502 const X86Subtarget &Subtarget,
17503 SelectionDAG &DAG) {
17504 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
17506 if (!Subtarget.hasSSE41())
17509 if (!Op->hasOneUse())
17512 SDNode *N = Op.getNode();
17515 SmallVector<SDValue, 8> Opnds;
17516 DenseMap<SDValue, unsigned> VecInMap;
17517 SmallVector<SDValue, 8> VecIns;
17518 EVT VT = MVT::Other;
17520 // Recognize a special case where a vector is cast into a wide integer to test all 0s.
17522 Opnds.push_back(N->getOperand(0));
17523 Opnds.push_back(N->getOperand(1));
17525 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17526 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17527 // BFS traverse all OR'd operands.
17528 if (I->getOpcode() == ISD::OR) {
17529 Opnds.push_back(I->getOperand(0));
17530 Opnds.push_back(I->getOperand(1));
17531 // Re-evaluate the number of nodes to be traversed.
17532 e += 2; // 2 more nodes (LHS and RHS) are pushed.
17536 // Quit if we find a non-EXTRACT_VECTOR_ELT operand.
17537 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17540 // Quit if the index is not a constant.
17541 SDValue Idx = I->getOperand(1);
17542 if (!isa<ConstantSDNode>(Idx))
17545 SDValue ExtractedFromVec = I->getOperand(0);
17546 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17547 if (M == VecInMap.end()) {
17548 VT = ExtractedFromVec.getValueType();
17549 // Quit if not 128/256-bit vector.
17550 if (!VT.is128BitVector() && !VT.is256BitVector())
17552 // Quit if not the same type.
17553 if (VecInMap.begin() != VecInMap.end() &&
17554 VT != VecInMap.begin()->first.getValueType())
17556 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17557 VecIns.push_back(ExtractedFromVec);
17559 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17562 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17563 "Not extracted from 128-/256-bit vector.");
17565 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17567 for (DenseMap<SDValue, unsigned>::const_iterator
17568 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17569 // Quit if not all elements are used.
17570 if (I->second != FullMask)
17574 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17576 // Cast all vectors into TestVT for PTEST.
17577 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17578 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17580 // If more than one full vector is evaluated, OR them first before PTEST.
17581 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17582 // Each iteration will OR 2 nodes and append the result until there is only
17583 // 1 node left, i.e. the final OR'd value of all vectors.
17584 SDValue LHS = VecIns[Slot];
17585 SDValue RHS = VecIns[Slot + 1];
17586 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
17589 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17590 VecIns.back(), VecIns.back());
17591 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17594 /// Return true if \c Op has a use that doesn't just read flags.
17595 static bool hasNonFlagsUse(SDValue Op) {
17596 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17598 SDNode *User = *UI;
17599 unsigned UOpNo = UI.getOperandNo();
17600 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17601 // Look past the truncate.
17602 UOpNo = User->use_begin().getOperandNo();
17603 User = *User->use_begin();
17606 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17607 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17613 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
17615 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17616 SelectionDAG &DAG) const {
17617 // CF and OF aren't always set the way we want. Determine which
17618 // of these we need.
17619 bool NeedCF = false;
17620 bool NeedOF = false;
17623 case X86::COND_A: case X86::COND_AE:
17624 case X86::COND_B: case X86::COND_BE:
17627 case X86::COND_G: case X86::COND_GE:
17628 case X86::COND_L: case X86::COND_LE:
17629 case X86::COND_O: case X86::COND_NO: {
17630 // Check if we really need to set the
17631 // Overflow flag. If NoSignedWrap is present
17632 // that is not actually needed.
17633 switch (Op->getOpcode()) {
17638 if (Op.getNode()->getFlags().hasNoSignedWrap())
17648 // See if we can use the EFLAGS value from the operand instead of
17649 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17650 // we prove that the arithmetic won't overflow, we can't use OF or CF.
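// For example, (seteq (add x, y), 0) only needs ZF and can reuse the flags
// produced by an EFLAGS-setting ADD, while a condition that reads CF or OF
// (e.g. COND_B) falls through to the explicit CMP-with-zero below.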
17651 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17652 // Emit a CMP with 0, which is the TEST pattern.
17653 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17654 DAG.getConstant(0, dl, Op.getValueType()));
17656 unsigned Opcode = 0;
17657 unsigned NumOperands = 0;
17659 // Truncate operations may prevent the merge of the SETCC instruction
17660 // and the arithmetic instruction before it. Attempt to truncate the operands
17661 // of the arithmetic instruction and use a reduced bit-width instruction.
17662 bool NeedTruncation = false;
17663 SDValue ArithOp = Op;
17664 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17665 SDValue Arith = Op->getOperand(0);
17666 // Both the trunc and the arithmetic op need to have one user each.
17667 if (Arith->hasOneUse())
17668 switch (Arith.getOpcode()) {
17675 NeedTruncation = true;
17681 // Sometimes flags can be set either with an AND or with an SRL/SHL
17682 // instruction. The SRL/SHL variant should be preferred for masks longer than this number of bits.
17684 const int ShiftToAndMaxMaskWidth = 32;
17685 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17687 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
17688 // which may be the result of a CAST. We use the variable 'Op', which is the
17689 // non-casted variable when we check for possible users.
17690 switch (ArithOp.getOpcode()) {
17692 // We only want to rewrite this as a target-specific node with attached
17693 // flags if there is a reasonable chance of either using that to do custom
17694 // instructions selection that can fold some of the memory operands, or if
17695 // only the flags are used. If there are other uses, leave the node alone
17696 // and emit a test instruction.
17697 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17698 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17699 if (UI->getOpcode() != ISD::CopyToReg &&
17700 UI->getOpcode() != ISD::SETCC &&
17701 UI->getOpcode() != ISD::STORE)
17704 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17705 // An add of one will be selected as an INC.
17707 (!Subtarget.slowIncDec() ||
17708 DAG.getMachineFunction().getFunction().optForSize())) {
17709 Opcode = X86ISD::INC;
17714 // An add of negative one (subtract of one) will be selected as a DEC.
17715 if (C->isAllOnesValue() &&
17716 (!Subtarget.slowIncDec() ||
17717 DAG.getMachineFunction().getFunction().optForSize())) {
17718 Opcode = X86ISD::DEC;
17724 // Otherwise use a regular EFLAGS-setting add.
17725 Opcode = X86ISD::ADD;
17730 // If we have a constant logical shift that's only used in a comparison
17731 // against zero turn it into an equivalent AND. This allows turning it into
17732 // a TEST instruction later.
17733 if (ZeroCheck && Op->hasOneUse() &&
17734 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17735 EVT VT = Op.getValueType();
17736 unsigned BitWidth = VT.getSizeInBits();
17737 unsigned ShAmt = Op->getConstantOperandVal(1);
17738 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17740 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17741 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17742 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17743 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17745 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17746 DAG.getConstant(Mask, dl, VT));
17751 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17752 // because a TEST instruction will be better. However, AND should be
17753 // preferred if the instruction can be combined into ANDN.
17754 if (!hasNonFlagsUse(Op)) {
17755 SDValue Op0 = ArithOp->getOperand(0);
17756 SDValue Op1 = ArithOp->getOperand(1);
17757 EVT VT = ArithOp.getValueType();
17758 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17759 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17760 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17762 // If we cannot select an ANDN instruction, check if we can replace
17763 // AND+IMM64 with a shift before giving up. This is possible for masks
17764 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
17765 if (!isProperAndn) {
17769 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17770 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17774 const APInt &Mask = CN->getAPIntValue();
17775 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17776 break; // Prefer TEST instruction.
17778 unsigned BitWidth = Mask.getBitWidth();
17779 unsigned LeadingOnes = Mask.countLeadingOnes();
17780 unsigned TrailingZeros = Mask.countTrailingZeros();
17782 if (LeadingOnes + TrailingZeros == BitWidth) {
17783 assert(TrailingZeros < VT.getSizeInBits() &&
17784 "Shift amount should be less than the type width");
17785 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17786 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17787 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17791 unsigned LeadingZeros = Mask.countLeadingZeros();
17792 unsigned TrailingOnes = Mask.countTrailingOnes();
17794 if (LeadingZeros + TrailingOnes == BitWidth) {
17795 assert(LeadingZeros < VT.getSizeInBits() &&
17796 "Shift amount should be less than the type width");
17797 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17798 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17799 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17810 // Similar to ISD::ADD above, check if the uses will preclude useful
17811 // lowering of the target-specific node.
17812 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17813 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17814 if (UI->getOpcode() != ISD::CopyToReg &&
17815 UI->getOpcode() != ISD::SETCC &&
17816 UI->getOpcode() != ISD::STORE)
17819 // Otherwise use a regular EFLAGS-setting instruction.
17820 switch (ArithOp.getOpcode()) {
17821 default: llvm_unreachable("unexpected operator!");
17822 case ISD::SUB: Opcode = X86ISD::SUB; break;
17823 case ISD::XOR: Opcode = X86ISD::XOR; break;
17824 case ISD::AND: Opcode = X86ISD::AND; break;
17825 case ISD::OR: Opcode = X86ISD::OR; break;
17837 return SDValue(Op.getNode(), 1);
17843 // If we found that truncation is beneficial, perform the truncation and update Op.
17845 if (NeedTruncation) {
17846 EVT VT = Op.getValueType();
17847 SDValue WideVal = Op->getOperand(0);
17848 EVT WideVT = WideVal.getValueType();
17849 unsigned ConvertedOp = 0;
17850 // Use a target machine opcode to prevent further DAGCombine
17851 // optimizations that may separate the arithmetic operations
17852 // from the setcc node.
17853 switch (WideVal.getOpcode()) {
17855 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17856 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17857 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17858 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17859 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17863 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17864 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17865 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17866 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17867 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17868 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17874 // Emit a CMP with 0, which is the TEST pattern.
17875 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17876 DAG.getConstant(0, dl, Op.getValueType()));
17878 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17879 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17881 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17882 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17883 return SDValue(New.getNode(), 1);
17886 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
17888 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17889 const SDLoc &dl, SelectionDAG &DAG) const {
17890 if (isNullConstant(Op1))
17891 return EmitTest(Op0, X86CC, dl, DAG);
17893 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17894 "Unexpected comparison operation for MVT::i1 operands");
17896 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17897 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17898 // Only promote the compare up to I32 if it is a 16 bit operation
17899 // with an immediate. 16 bit immediates are to be avoided.
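// (A 16-bit immediate requires the 0x66 operand-size prefix, and the
// resulting length-changing prefix can stall the decoders on some cores.)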
17900 if ((Op0.getValueType() == MVT::i16 &&
17901 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17902 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17903 !Subtarget.isAtom()) {
17904 unsigned ExtendOp =
17905 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17906 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17907 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17909 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17910 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17911 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17912 return SDValue(Sub.getNode(), 1);
17914 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17917 /// Convert a comparison if required by the subtarget.
17918 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17919 SelectionDAG &DAG) const {
17920 // If the subtarget does not support the FUCOMI instruction, floating-point
17921 // comparisons have to be converted.
17922 if (Subtarget.hasCMov() ||
17923 Cmp.getOpcode() != X86ISD::CMP ||
17924 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17925 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17928 // The instruction selector will select an FUCOM instruction instead of
17929 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17930 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17931 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17933 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17934 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17935 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17936 DAG.getConstant(8, dl, MVT::i8));
17937 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17939 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17940 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17941 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17944 /// Check if replacement of SQRT with RSQRT should be disabled.
17945 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17946 EVT VT = Op.getValueType();
17948 // We never want to use both SQRT and RSQRT instructions for the same input.
17949 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17953 return Subtarget.hasFastVectorFSQRT();
17954 return Subtarget.hasFastScalarFSQRT();
17957 /// The minimum architected relative accuracy is 2^-12. We need one
17958 /// Newton-Raphson step to have a good float result (24 bits of precision).
17959 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17960 SelectionDAG &DAG, int Enabled,
17961 int &RefinementSteps,
17962 bool &UseOneConstNR,
17963 bool Reciprocal) const {
17964 EVT VT = Op.getValueType();
17966 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17967 // It is likely not profitable to do this for f64 because a double-precision
17968 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17969 // instructions: convert to single, rsqrtss, convert back to double, refine
17970 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17971 // along with FMA, this could be a throughput win.
17972 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17973 // after legalize types.
17974 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17975 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17976 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17977 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
17978 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
17979 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17980 RefinementSteps = 1;
17982 UseOneConstNR = false;
17983 // There is no FRSQRT for 512-bit vectors, but there is RSQRT14.
17984 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
17985 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
17990 /// The minimum architected relative accuracy is 2^-12. We need one
17991 /// Newton-Raphson step to have a good float result (24 bits of precision).
17992 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17994 int &RefinementSteps) const {
17995 EVT VT = Op.getValueType();
17997 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17998 // It is likely not profitable to do this for f64 because a double-precision
17999 // reciprocal estimate with refinement on x86 prior to FMA requires
18000 // 15 instructions: convert to single, rcpss, convert back to double, refine
18001 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
18002 // along with FMA, this could be a throughput win.
18004 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18005 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
18006 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18007 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18008 // Enable estimate codegen with 1 refinement step for vector division.
18009 // Scalar division estimates are disabled because they break too much
18010 // real-world code. These defaults are intended to match GCC behavior.
18011 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
18014 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18015 RefinementSteps = 1;
18017 // There is no FRCP for 512-bit vectors, but there is RCP14.
18018 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
18019 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18024 /// If we have at least two divisions that use the same divisor, convert to
18025 /// multiplication by a reciprocal. This may need to be adjusted for a given
18026 /// CPU if a division's cost is not at least twice the cost of a multiplication.
18027 /// This is because we still need one division to calculate the reciprocal and
18028 /// then we need two multiplies by that reciprocal as replacements for the
18029 /// original divisions.
18030 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
18034 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
18035 /// according to equal/not-equal condition code \p CC.
18036 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
18037 const SDLoc &dl, SelectionDAG &DAG) {
18038 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
18039 // instruction. Since the shift amount is in-range-or-undefined, we know
18040 // that doing a bittest on the i32 value is ok. We extend to i32 because
18041 // the encoding for the i16 version is larger than the i32 version.
18042 // Also promote i16 to i32 for performance / code size reason.
18043 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
18044 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
18046 // See if we can use the 32-bit instruction instead of the 64-bit one for a
18047 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
18048 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
18049 // known to be zero.
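// For example, (bt i64 %x, 7) can be emitted as a 32-bit BT of the truncated
// source, avoiding a REX.W prefix, since 7 mod 32 equals 7 mod 64.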
18050 if (Src.getValueType() == MVT::i64 &&
18051 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
18052 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
18054 // If the operand types disagree, extend the shift amount to match. Since
18055 // BT ignores high bits (like shifts) we can use anyextend.
18056 if (Src.getValueType() != BitNo.getValueType())
18057 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
18059 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
18060 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
18061 return getSETCC(Cond, BT, dl , DAG);
18064 /// Result of 'and' is compared against zero. Change to a BT node if possible.
18065 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
18066 const SDLoc &dl, SelectionDAG &DAG) {
18067 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
18068 SDValue Op0 = And.getOperand(0);
18069 SDValue Op1 = And.getOperand(1);
18070 if (Op0.getOpcode() == ISD::TRUNCATE)
18071 Op0 = Op0.getOperand(0);
18072 if (Op1.getOpcode() == ISD::TRUNCATE)
18073 Op1 = Op1.getOperand(0);
18076 if (Op1.getOpcode() == ISD::SHL)
18077 std::swap(Op0, Op1);
18078 if (Op0.getOpcode() == ISD::SHL) {
18079 if (isOneConstant(Op0.getOperand(0))) {
18080 // If we looked past a truncate, check that it's only truncating away sign bits.
18082 unsigned BitWidth = Op0.getValueSizeInBits();
18083 unsigned AndBitWidth = And.getValueSizeInBits();
18084 if (BitWidth > AndBitWidth) {
18086 DAG.computeKnownBits(Op0, Known);
18087 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
18091 RHS = Op0.getOperand(1);
18093 } else if (Op1.getOpcode() == ISD::Constant) {
18094 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
18095 uint64_t AndRHSVal = AndRHS->getZExtValue();
18096 SDValue AndLHS = Op0;
18098 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
18099 LHS = AndLHS.getOperand(0);
18100 RHS = AndLHS.getOperand(1);
18102 // Use BT if the immediate can't be encoded in a TEST instruction or we
18103 // are optimizing for size and the immediate won't fit in a byte.
18104 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
18105 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
18106 isPowerOf2_64(AndRHSVal)) {
18108 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
18114 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
18119 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
18121 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
18126 // SSE Condition code mapping:
18135 switch (SetCCOpcode) {
18136 default: llvm_unreachable("Unexpected SETCC condition");
18138 case ISD::SETEQ: SSECC = 0; break;
18140 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
18142 case ISD::SETOLT: SSECC = 1; break;
18144 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
18146 case ISD::SETOLE: SSECC = 2; break;
18147 case ISD::SETUO: SSECC = 3; break;
18149 case ISD::SETNE: SSECC = 4; break;
18150 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
18151 case ISD::SETUGE: SSECC = 5; break;
18152 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
18153 case ISD::SETUGT: SSECC = 6; break;
18154 case ISD::SETO: SSECC = 7; break;
18155 case ISD::SETUEQ: SSECC = 8; break;
18156 case ISD::SETONE: SSECC = 12; break;
18159 std::swap(Op0, Op1);
18164 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
18165 /// concatenate the result back.
18166 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
18167 MVT VT = Op.getSimpleValueType();
18169 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
18170 "Unsupported value type for operation");
18172 unsigned NumElems = VT.getVectorNumElements();
18174 SDValue CC = Op.getOperand(2);
18176 // Extract the LHS vectors
18177 SDValue LHS = Op.getOperand(0);
18178 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
18179 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
18181 // Extract the RHS vectors
18182 SDValue RHS = Op.getOperand(1);
18183 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
18184 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
18186 // Issue the operation on the smaller types and concatenate the result back
18187 MVT EltVT = VT.getVectorElementType();
18188 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18189 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18190 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
18191 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
18194 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
18196 SDValue Op0 = Op.getOperand(0);
18197 SDValue Op1 = Op.getOperand(1);
18198 SDValue CC = Op.getOperand(2);
18199 MVT VT = Op.getSimpleValueType();
18202 assert(VT.getVectorElementType() == MVT::i1 &&
18203 "Cannot set masked compare for this operation");
18205 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
18207 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
18208 // This helps with vptestm matching.
18209 // TODO: Should we just canonicalize the setcc during DAG combine?
18210 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
18211 ISD::isBuildVectorAllZeros(Op0.getNode()))
18212 std::swap(Op0, Op1);
18214 // Prefer SETGT over SETLT.
18215 if (SetCCOpcode == ISD::SETLT) {
18216 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
18217 std::swap(Op0, Op1);
18220 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
18223 /// Try to turn a VSETULT into a VSETULE by modifying its second
18224 /// operand \p Op1. If non-trivial (for example because it's not constant)
18225 /// return an empty value.
18226 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
18227 SelectionDAG &DAG) {
18228 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
18232 MVT VT = Op1.getSimpleValueType();
18233 MVT EVT = VT.getVectorElementType();
18234 unsigned n = VT.getVectorNumElements();
18235 SmallVector<SDValue, 8> ULTOp1;
18237 for (unsigned i = 0; i < n; ++i) {
18238 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
18239 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
18242 // Avoid underflow.
18243 APInt Val = Elt->getAPIntValue();
18247 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
18250 return DAG.getBuildVector(VT, dl, ULTOp1);
18253 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for Op0 u<= Op1:
18255 /// t = psubus Op0, Op1
18256 /// pcmpeq t, <0..0>
18257 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
18258 ISD::CondCode Cond, const SDLoc &dl,
18259 const X86Subtarget &Subtarget,
18260 SelectionDAG &DAG) {
18261 if (!Subtarget.hasSSE2())
18264 MVT VET = VT.getVectorElementType();
18265 if (VET != MVT::i8 && VET != MVT::i16)
18271 case ISD::SETULT: {
18272 // If the comparison is against a constant we can turn this into a
18273 // setule. With psubus, setule does not require a swap. This is
18274 // beneficial because the constant in the register is no longer
18275 // clobbered as the destination, so it can be hoisted out of a loop.
18276 // Only do this pre-AVX since vpcmp* is no longer destructive.
18277 if (Subtarget.hasAVX())
18279 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
18285 // Psubus is better than flip-sign because it requires no inversion.
18287 std::swap(Op0, Op1);
18293 SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
18294 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18295 getZeroVector(VT, Subtarget, DAG, dl));
18298 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18299 SelectionDAG &DAG) {
18300 SDValue Op0 = Op.getOperand(0);
18301 SDValue Op1 = Op.getOperand(1);
18302 SDValue CC = Op.getOperand(2);
18303 MVT VT = Op.getSimpleValueType();
18304 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18305 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
18310 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18311 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
18315 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18316 assert(VT.getVectorNumElements() <= 16);
18317 Opc = X86ISD::CMPM;
18319 Opc = X86ISD::CMPP;
18320 // The SSE/AVX packed FP comparison nodes are defined with a
18321 // floating-point vector result that matches the operand type. This allows
18322 // them to work with an SSE1 target (integer vector types are not legal).
18323 VT = Op0.getSimpleValueType();
18326 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18327 // emit two comparisons and a logic op to tie them together.
18329 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18330 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18331 // LLVM predicate is SETUEQ or SETONE.
18333 unsigned CombineOpc;
18334 if (Cond == ISD::SETUEQ) {
18337 CombineOpc = X86ISD::FOR;
18339 assert(Cond == ISD::SETONE);
18342 CombineOpc = X86ISD::FAND;
18345 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18346 DAG.getConstant(CC0, dl, MVT::i8));
18347 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18348 DAG.getConstant(CC1, dl, MVT::i8));
18349 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18351 // Handle all other FP comparisons here.
18352 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18353 DAG.getConstant(SSECC, dl, MVT::i8));
18356 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18357 // result type of SETCC. The bitcast is expected to be optimized away
18358 // during combining/isel.
18359 if (Opc == X86ISD::CMPP)
18360 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
18365 MVT VTOp0 = Op0.getSimpleValueType();
18366 assert(VTOp0 == Op1.getSimpleValueType() &&
18367 "Expected operands with same type!");
18368 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18369 "Invalid number of packed elements for source and destination!");
18371 // This is being called by type legalization because v2i32 is marked custom
18372 // for result type legalization for v2f32.
18373 if (VTOp0 == MVT::v2i32)
18376 // The non-AVX512 code below works under the assumption that source and
18377 // destination types are the same.
18378 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18379 "Value types for source and destination must be the same!");
18381 // Break 256-bit integer vector compare into smaller ones.
18382 if (VT.is256BitVector() && !Subtarget.hasInt256())
18383 return Lower256IntVSETCC(Op, DAG);
18385 // The result is boolean, but operands are int/float
18386 if (VT.getVectorElementType() == MVT::i1) {
18387 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
18388 // but there is no compare instruction for i8 and i16 elements on KNL.
18389 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
18390 "Unexpected operand type");
18391 return LowerIntVSETCC_AVX512(Op, DAG);
18394 // Lower using XOP integer comparisons.
18395 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18396 // Translate compare code to XOP PCOM compare mode.
18397 unsigned CmpMode = 0;
18399 default: llvm_unreachable("Unexpected SETCC condition");
18401 case ISD::SETLT: CmpMode = 0x00; break;
18403 case ISD::SETLE: CmpMode = 0x01; break;
18405 case ISD::SETGT: CmpMode = 0x02; break;
18407 case ISD::SETGE: CmpMode = 0x03; break;
18408 case ISD::SETEQ: CmpMode = 0x04; break;
18409 case ISD::SETNE: CmpMode = 0x05; break;
18412 // Are we comparing unsigned or signed integers?
18414 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
18416 return DAG.getNode(Opc, dl, VT, Op0, Op1,
18417 DAG.getConstant(CmpMode, dl, MVT::i8));
18420 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18421 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
18422 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18423 SDValue BC0 = peekThroughBitcasts(Op0);
18424 if (BC0.getOpcode() == ISD::AND) {
18426 SmallVector<APInt, 64> EltBits;
18427 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18428 VT.getScalarSizeInBits(), UndefElts,
18429 EltBits, false, false)) {
18430 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
18432 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
18438 // If this is a SETNE against the signed minimum value, change it to SETGT.
18439 // If this is a SETNE against the signed maximum value, change it to SETLT,
18440 // which will be swapped to SETGT.
18441 // Otherwise we use PCMPEQ+invert.
18443 if (Cond == ISD::SETNE &&
18444 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
18445 if (ConstValue.isMinSignedValue())
18447 else if (ConstValue.isMaxSignedValue())
18451 // If both operands are known non-negative, then an unsigned compare is the
18452 // same as a signed compare and there's no need to flip signbits.
18453 // TODO: We could check for more general simplifications here since we're
18454 // computing known bits.
18455 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18456 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18458 // Special case: Use min/max operations for unsigned compares. We only want
18459 // to do this for unsigned compares if we need to flip signs or if it allows
18460 // us to avoid an invert.
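// For example, "x <=u y" holds exactly when umin(x, y) == x, so SETULE can be
// lowered to PCMPEQ(x, UMIN(x, y)); SETUGT is the same sequence followed by an
// invert, and SETUGE/SETULT use UMAX analogously.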
18461 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18462 if (ISD::isUnsignedIntSetCC(Cond) &&
18463 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18464 TLI.isOperationLegal(ISD::UMIN, VT)) {
18465 bool Invert = false;
18468 default: llvm_unreachable("Unexpected condition code");
18469 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
18470 case ISD::SETULE: Opc = ISD::UMIN; break;
18471 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
18472 case ISD::SETUGE: Opc = ISD::UMAX; break;
18475 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18476 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
18478 // If the logical-not of the result is required, perform that now.
18480 Result = DAG.getNOT(dl, Result, VT);
18485 // Try to use SUBUS and PCMPEQ.
18486 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
18489 // We are handling one of the integer comparisons here. Since SSE only has
18490 // GT and EQ comparisons for integers, swapping operands and multiple
18491 // operations may be required for some comparisons.
18492 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
18494 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18495 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18496 bool Invert = Cond == ISD::SETNE ||
18497 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
18500 std::swap(Op0, Op1);
18502 // Check that the operation in question is available (most are plain SSE2,
18503 // but PCMPGTQ and PCMPEQQ have different requirements).
18504 if (VT == MVT::v2i64) {
18505 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18506 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18508 // First cast everything to the right type.
18509 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18510 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18512 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18513 // bits of the inputs before performing those operations. The lower
18514 // compare is always unsigned.
18517 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
18519 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
18520 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
18521 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
18523 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18524 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18526 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
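// The XORs above flip sign bits so that PCMPGTD on the high dwords gives the
// desired high-part compare, while PCMPGTD on the low dwords acts as an
// unsigned compare. The MaskHi/MaskLo shuffles below then broadcast each dword
// result across its 64-bit lane before combining.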
18527 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18528 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18530 // Create masks for only the low parts/high parts of the 64-bit integers.
18531 static const int MaskHi[] = { 1, 1, 3, 3 };
18532 static const int MaskLo[] = { 0, 0, 2, 2 };
18533 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18534 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18535 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18537 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18538 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18541 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18543 return DAG.getBitcast(VT, Result);
18546 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18547 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
18548 // pcmpeqd + pshufd + pand.
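// PCMPEQD compares each 32-bit half independently; the PSHUFD with mask
// {1, 0, 3, 2} swaps the two halves of every 64-bit lane, so the final PAND
// leaves a lane all-ones only if both of its halves compared equal.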
18549 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18551 // First cast everything to the right type.
18552 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18553 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18556 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18558 // Make sure the lower and upper halves are both all-ones.
18559 static const int Mask[] = { 1, 0, 3, 2 };
18560 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18561 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18564 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18566 return DAG.getBitcast(VT, Result);
18570 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18571 // bits of the inputs before performing those operations.
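// Flipping the sign bit of both operands turns an unsigned comparison into a
// signed one: e.g. "x <u y" holds iff "(x ^ 0x80..0) <s (y ^ 0x80..0)".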
18573 MVT EltVT = VT.getVectorElementType();
18574 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18576 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18577 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18580 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18582 // If the logical-not of the result is required, perform that now.
18584 Result = DAG.getNOT(dl, Result, VT);
18589 // Try to select this as a KTEST+SETCC if possible.
18590 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18591 const SDLoc &dl, SelectionDAG &DAG,
18592 const X86Subtarget &Subtarget) {
18593 // Only support equality comparisons.
18594 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18597 // Must be a bitcast from vXi1.
18598 if (Op0.getOpcode() != ISD::BITCAST)
18601 Op0 = Op0.getOperand(0);
18602 MVT VT = Op0.getSimpleValueType();
18603 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18604 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
18605 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18608 X86::CondCode X86CC;
18609 if (isNullConstant(Op1)) {
18610 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18611 } else if (isAllOnesConstant(Op1)) {
18612 // C flag is set for all ones.
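// KORTEST sets ZF when the OR of the two masks is zero and CF when the OR is
// all ones, so equality with 0 maps to E/NE and equality with all ones to B/AE.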
18613 X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
18617 // If the input is an OR, we can combine its operands into the KORTEST.
18620 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18621 LHS = Op0.getOperand(0);
18622 RHS = Op0.getOperand(1);
18625 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18626 return getSETCC(X86CC, KORTEST, dl, DAG);
18629 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18631 MVT VT = Op.getSimpleValueType();
18633 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18635 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18636 SDValue Op0 = Op.getOperand(0);
18637 SDValue Op1 = Op.getOperand(1);
18639 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18641 // Optimize to BT if possible.
18642 // Lower (X & (1 << N)) == 0 to BT(X, N).
18643 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18644 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
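// BT copies the selected bit into CF, so the result is read back with a
// SETC/SETNC (B/AE) condition rather than the usual ZF test.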
18645 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18646 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18647 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18651 // Try to use PTEST for a tree of ORs equality-compared with 0.
18652 // TODO: We could do AND tree with all 1s as well by using the C flag.
18653 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18654 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18655 if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
18659 // Try to lower using KTEST.
18660 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18663 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
18665 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18666 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18668 // If the input is a setcc, then reuse the input setcc or use a new one with
18669 // the inverted condition.
18670 if (Op0.getOpcode() == X86ISD::SETCC) {
18671 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
18672 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18676 CCode = X86::GetOppositeBranchCondition(CCode);
18677 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
18681 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18682 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18683 if (X86CC == X86::COND_INVALID)
18686 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18687 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18688 return getSETCC(X86CC, EFLAGS, dl, DAG);
18691 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18692 SDValue LHS = Op.getOperand(0);
18693 SDValue RHS = Op.getOperand(1);
18694 SDValue Carry = Op.getOperand(2);
18695 SDValue Cond = Op.getOperand(3);
18698 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18699 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18701 // Recreate the carry if needed.
18702 EVT CarryVT = Carry.getValueType();
18703 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18704 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18705 Carry, DAG.getConstant(NegOne, DL, CarryVT));
18707 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18708 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18709 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
18712 /// Return true if opcode is a X86 logical comparison.
18713 static bool isX86LogicalCmp(SDValue Op) {
18714 unsigned Opc = Op.getOpcode();
18715 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18716 Opc == X86ISD::SAHF)
18718 if (Op.getResNo() == 1 &&
18719 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18720 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18721 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18722 Opc == X86ISD::XOR || Opc == X86ISD::AND))
18725 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
18731 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18732 if (V.getOpcode() != ISD::TRUNCATE)
18735 SDValue VOp0 = V.getOperand(0);
18736 unsigned InBits = VOp0.getValueSizeInBits();
18737 unsigned Bits = V.getValueSizeInBits();
18738 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18741 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18742 bool AddTest = true;
18743 SDValue Cond = Op.getOperand(0);
18744 SDValue Op1 = Op.getOperand(1);
18745 SDValue Op2 = Op.getOperand(2);
18747 MVT VT = Op1.getSimpleValueType();
18750 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18751 // are available or VBLENDV if AVX is available.
18752 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
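// A scalar FP compare (CMPSS/CMPSD) produces an all-ones or all-zeros mask in
// its lane, so "(a < b) ? x : y" becomes roughly
//   mask = cmpltss(a, b);  result = (mask & x) | (~mask & y)
// which is the FAND/FANDN/FOR sequence emitted below.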
18753 if (Cond.getOpcode() == ISD::SETCC &&
18754 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18755 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18756 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18757 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18758 unsigned SSECC = translateX86FSETCC(
18759 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18761 if (Subtarget.hasAVX512()) {
18762 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18763 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18764 assert(!VT.isVector() && "Not a scalar type?");
18765 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18768 if (SSECC < 8 || Subtarget.hasAVX()) {
18769 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18770 DAG.getConstant(SSECC, DL, MVT::i8));
18772 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18773 // of 3 logic instructions for size savings and potentially speed.
18774 // Unfortunately, there is no scalar form of VBLENDV.
18776 // If either operand is a constant, don't try this. We can expect to
18777 // optimize away at least one of the logic instructions later in that
18778 // case, so that sequence would be faster than a variable blend.
18780 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18781 // uses XMM0 as the selection register. That may need just as many
18782 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
18785 if (Subtarget.hasAVX() &&
18786 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18788 // Convert to vectors, do a VSELECT, and convert back to scalar.
18789 // All of the conversions should be optimized away.
18791 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18792 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18793 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18794 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18796 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18797 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18799 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18801 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18802 VSel, DAG.getIntPtrConstant(0, DL));
18804 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18805 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18806 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18810 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18811 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18812 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18813 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18816 // For v64i1 without 64-bit support we need to split and rejoin.
18817 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18818 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18819 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18820 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18821 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18822 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18823 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18824 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18825 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18828 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18830 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18831 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18832 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18833 Op1Scalar = Op1.getOperand(0);
18835 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18836 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18837 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18838 Op2Scalar = Op2.getOperand(0);
18839 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18840 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18841 Op1Scalar, Op2Scalar);
18842 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18843 return DAG.getBitcast(VT, newSelect);
18844 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18845 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18846 DAG.getIntPtrConstant(0, DL));
18850 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18851 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18852 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18853 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18854 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18855 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18856 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18857 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18860 if (Cond.getOpcode() == ISD::SETCC) {
18861 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18863 // If the condition was updated, it's possible that the operands of the
18864 // select were also updated (for example, EmitTest has a RAUW). Refresh
18865 // the local references to the select operands in case they got stale.
18866 Op1 = Op.getOperand(1);
18867 Op2 = Op.getOperand(2);
18871 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18872 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18873 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18874 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18875 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18876 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
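// For example, for (select (x == 0), -1, y): "cmp x, 1" sets the carry flag
// exactly when x == 0, SETCC_CARRY (lowered to SBB reg, reg) broadcasts that
// carry into 0 or all-ones, and an OR with y then yields -1 when x == 0 and y
// otherwise. The other forms differ only by a final NOT.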
18877 if (Cond.getOpcode() == X86ISD::SETCC &&
18878 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18879 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18880 SDValue Cmp = Cond.getOperand(1);
18881 unsigned CondCode =
18882 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18884 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18885 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18886 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18887 SDValue CmpOp0 = Cmp.getOperand(0);
18889 // Apply further optimizations for special cases
18890 // (select (x != 0), -1, 0) -> neg & sbb
18891 // (select (x == 0), 0, -1) -> neg & sbb
18892 if (isNullConstant(Y) &&
18893 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18894 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18895 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18896 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18897 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18898 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18899 SDValue(Neg.getNode(), 1));
18903 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18904 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18905 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18907 SDValue Res = // Res = 0 or -1.
18908 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18909 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18911 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18912 Res = DAG.getNOT(DL, Res, Res.getValueType());
18914 if (!isNullConstant(Op2))
18915 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18917 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18918 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18919 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18920 SDValue CmpOp0 = Cmp.getOperand(0);
18921 SDValue Src1, Src2;
18922 // True if Op2 is an XOR or OR operator and one of its operands equals Op1:
18924 // ( a , a op b) || ( b , a op b)
18925 auto isOrXorPattern = [&]() {
18926 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18927 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18929 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18936 if (isOrXorPattern()) {
18938 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18939 // we need a mask of all zeros or all ones with the same size as the other operand.
18941 if (CmpSz > VT.getSizeInBits())
18942 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18943 else if (CmpSz < VT.getSizeInBits())
18944 Neg = DAG.getNode(ISD::AND, DL, VT,
18945 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18946 DAG.getConstant(1, DL, VT));
18949 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18950 Neg); // -(and (x, 0x1))
18951 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18952 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18957 // Look past (and (setcc_carry (cmp ...)), 1).
18958 if (Cond.getOpcode() == ISD::AND &&
18959 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18960 isOneConstant(Cond.getOperand(1)))
18961 Cond = Cond.getOperand(0);
18963 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18964 // setting operand in place of the X86ISD::SETCC.
18965 unsigned CondOpcode = Cond.getOpcode();
18966 if (CondOpcode == X86ISD::SETCC ||
18967 CondOpcode == X86ISD::SETCC_CARRY) {
18968 CC = Cond.getOperand(0);
18970 SDValue Cmp = Cond.getOperand(1);
18971 unsigned Opc = Cmp.getOpcode();
18972 MVT VT = Op.getSimpleValueType();
18974 bool IllegalFPCMov = false;
18975 if (VT.isFloatingPoint() && !VT.isVector() &&
18976 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18977 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18979 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18980 Opc == X86ISD::BT) { // FIXME
18984 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18985 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18986 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18987 Cond.getOperand(0).getValueType() != MVT::i8)) {
18988 SDValue LHS = Cond.getOperand(0);
18989 SDValue RHS = Cond.getOperand(1);
18990 unsigned X86Opcode;
18993 switch (CondOpcode) {
18994 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18995 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18996 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18997 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18998 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18999 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19000 default: llvm_unreachable("unexpected overflowing operator");
19002 if (CondOpcode == ISD::UMULO)
19003 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19006 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19008 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
19010 if (CondOpcode == ISD::UMULO)
19011 Cond = X86Op.getValue(2);
19013 Cond = X86Op.getValue(1);
19015 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
19020 // Look past the truncate if the high bits are known zero.
19021 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19022 Cond = Cond.getOperand(0);
19024 // We know the result of AND is compared against zero. Try to match it to BT.
19026 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19027 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
19028 CC = NewSetCC.getOperand(0);
19029 Cond = NewSetCC.getOperand(1);
19036 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
19037 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
19040 // a < b ? -1 : 0 -> RES = ~setcc_carry
19041 // a < b ? 0 : -1 -> RES = setcc_carry
19042 // a >= b ? -1 : 0 -> RES = setcc_carry
19043 // a >= b ? 0 : -1 -> RES = ~setcc_carry
19044 if (Cond.getOpcode() == X86ISD::SUB) {
19045 Cond = ConvertCmpIfNecessary(Cond, DAG);
19046 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
19048 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
19049 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
19050 (isNullConstant(Op1) || isNullConstant(Op2))) {
19051 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
19052 DAG.getConstant(X86::COND_B, DL, MVT::i8),
19054 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
19055 return DAG.getNOT(DL, Res, Res.getValueType());
19060 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
19061 // widen the cmov and push the truncate through. This avoids introducing a new
19062 // branch during isel and doesn't add any extensions.
19063 if (Op.getValueType() == MVT::i8 &&
19064 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
19065 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
19066 if (T1.getValueType() == T2.getValueType() &&
19067 // Blacklist CopyFromReg to avoid partial register stalls.
19068 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
19069 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
19071 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19075 // Promote i16 cmovs if it won't prevent folding a load.
19076 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
19077 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
19078 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
19079 SDValue Ops[] = { Op2, Op1, CC, Cond };
19080 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
19081 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19084 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
19085 // the condition is true.
19086 SDValue Ops[] = { Op2, Op1, CC, Cond };
19087 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
19090 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
19091 const X86Subtarget &Subtarget,
19092 SelectionDAG &DAG) {
19093 MVT VT = Op->getSimpleValueType(0);
19094 SDValue In = Op->getOperand(0);
19095 MVT InVT = In.getSimpleValueType();
19096 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19097 MVT VTElt = VT.getVectorElementType();
19100 unsigned NumElts = VT.getVectorNumElements();
19102 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
19104 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
19105 // If v16i32 is to be avoided, we'll need to split and concatenate.
19106 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19107 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
19109 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19112 // Widen to 512-bits if VLX is not supported.
19113 MVT WideVT = ExtVT;
19114 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19115 NumElts *= 512 / ExtVT.getSizeInBits();
19116 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19117 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
19118 In, DAG.getIntPtrConstant(0, dl));
19119 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
19123 MVT WideEltVT = WideVT.getVectorElementType();
19124 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
19125 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
19126 V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
19128 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
19129 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
19130 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
19133 // Truncate if we had to extend i16/i8 above.
19135 WideVT = MVT::getVectorVT(VTElt, NumElts);
19136 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
19139 // Extract back to 128/256-bit if we widened.
19141 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
19142 DAG.getIntPtrConstant(0, dl));
19147 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19148 SelectionDAG &DAG) {
19149 SDValue In = Op->getOperand(0);
19150 MVT InVT = In.getSimpleValueType();
19152 if (InVT.getVectorElementType() == MVT::i1)
19153 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19155 assert(Subtarget.hasAVX() && "Expected AVX support");
19156 return LowerAVXExtend(Op, DAG, Subtarget);
19159 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
19160 // For sign extend this needs to handle all vector sizes and SSE4.1 and
19161 // non-SSE4.1 targets. For zero extend this should only handle inputs of
19162 // MVT::v64i8 when BWI is not supported, but AVX512 is.
19163 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
19164 const X86Subtarget &Subtarget,
19165 SelectionDAG &DAG) {
19166 SDValue In = Op->getOperand(0);
19167 MVT VT = Op->getSimpleValueType(0);
19168 MVT InVT = In.getSimpleValueType();
19169 assert(VT.getSizeInBits() == InVT.getSizeInBits());
19171 MVT SVT = VT.getVectorElementType();
19172 MVT InSVT = InVT.getVectorElementType();
19173 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
19175 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
19177 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
19179 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
19180 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
19181 !(VT.is512BitVector() && Subtarget.hasAVX512()))
19186 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19187 // For 512-bit vectors, we need 128-bits or 256-bits.
19188 if (VT.getSizeInBits() > 128) {
19189 // Input needs to be at least the same number of elements as output, and
19190 // at least 128-bits.
19191 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
19192 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
19195 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19196 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
19198 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
19199 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
19200 // need to be handled here for 256/512-bit results.
19201 if (Subtarget.hasInt256()) {
19202 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19203 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19204 X86ISD::VSEXT : X86ISD::VZEXT;
19205 return DAG.getNode(ExtOpc, dl, VT, In);
19208 // We should only get here for sign extend.
19209 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19210 "Unexpected opcode!");
19212 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
19216 // As SRAI is only available on i16/i32 types, we expand only up to i32
19217 // and handle i64 separately.
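// For example, extending i8 elements to i32 unpacks twice so that each source
// byte ends up in the top byte of an i32 lane, and the arithmetic shift right
// by 24 below then moves it back down while replicating the sign bit.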
19218 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19219 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19220 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19221 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
19222 Curr = DAG.getBitcast(CurrVT, Curr);
19225 SDValue SignExt = Curr;
19226 if (CurrVT != InVT) {
19227 unsigned SignExtShift =
19228 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19229 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19230 DAG.getConstant(SignExtShift, dl, MVT::i8));
19236 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19237 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19238 DAG.getConstant(31, dl, MVT::i8));
19239 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
19240 return DAG.getBitcast(VT, Ext);
19246 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19247 SelectionDAG &DAG) {
19248 MVT VT = Op->getSimpleValueType(0);
19249 SDValue In = Op->getOperand(0);
19250 MVT InVT = In.getSimpleValueType();
19253 if (InVT.getVectorElementType() == MVT::i1)
19254 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19256 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19257 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19258 "Expected same number of elements");
19259 assert((VT.getVectorElementType() == MVT::i16 ||
19260 VT.getVectorElementType() == MVT::i32 ||
19261 VT.getVectorElementType() == MVT::i64) &&
19262 "Unexpected element type");
19263 assert((InVT.getVectorElementType() == MVT::i8 ||
19264 InVT.getVectorElementType() == MVT::i16 ||
19265 InVT.getVectorElementType() == MVT::i32) &&
19266 "Unexpected element type");
19268 if (Subtarget.hasInt256())
19269 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19271 // Optimize vectors in AVX mode
19272 // Sign extend v8i16 to v8i32 and v4i32 to v4i64.
19275 // Divide input vector into two parts
19276 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
19277 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
19278 // concat the vectors to original VT
19280 unsigned NumElems = InVT.getVectorNumElements();
19281 SDValue Undef = DAG.getUNDEF(InVT);
19283 SmallVector<int,8> ShufMask1(NumElems, -1);
19284 for (unsigned i = 0; i != NumElems/2; ++i)
19287 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19289 SmallVector<int,8> ShufMask2(NumElems, -1);
19290 for (unsigned i = 0; i != NumElems/2; ++i)
19291 ShufMask2[i] = i + NumElems/2;
19293 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19295 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19296 VT.getVectorNumElements() / 2);
19298 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19299 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
19301 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19304 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19305 SelectionDAG &DAG) {
19306 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
19308 SDValue StoredVal = St->getValue();
19310 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
19311 assert(StoredVal.getValueType().isVector() &&
19312 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
19313 StoredVal.getValueType().getVectorNumElements() <= 8 &&
19315 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19316 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19317 "Expected AVX512F without AVX512DQI");
19319 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19320 DAG.getUNDEF(MVT::v8i1), StoredVal,
19321 DAG.getIntPtrConstant(0, dl));
19322 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19324 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19325 St->getPointerInfo(), St->getAlignment(),
19326 St->getMemOperand()->getFlags());
19329 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19330 // may emit an illegal shuffle but the expansion is still better than scalar
19331 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
19332 // we'll emit a shuffle and an arithmetic shift.
19333 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19334 // TODO: It is possible to support ZExt by zeroing the undef values during
19335 // the shuffle phase or after the shuffle.
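// For example, a sign-extending load from v4i8 to v4i32 becomes roughly a
// single 32-bit scalar load, a bitcast of that value to v16i8, and a
// SIGN_EXTEND_VECTOR_INREG of the low four bytes (PMOVSXBD on SSE4.1, or the
// unpack-and-shift expansion otherwise).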
19336 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19337 SelectionDAG &DAG) {
19338 MVT RegVT = Op.getSimpleValueType();
19339 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19340 assert(RegVT.isInteger() &&
19341 "We only custom lower integer vector sext loads.");
19343 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
19345 EVT MemVT = Ld->getMemoryVT();
19347 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19348 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19349 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19350 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19351 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19352 "Expected AVX512F without AVX512DQI");
19354 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19355 Ld->getPointerInfo(), Ld->getAlignment(),
19356 Ld->getMemOperand()->getFlags());
19358 // Replace chain users with the new chain.
19359 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19360 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19362 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19363 DAG.getBitcast(MVT::v8i1, NewLd),
19364 DAG.getIntPtrConstant(0, dl));
19365 return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
19368 // Nothing useful we can do without SSE2 shuffles.
19369 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19371 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19372 unsigned RegSz = RegVT.getSizeInBits();
19374 ISD::LoadExtType Ext = Ld->getExtensionType();
19376 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19377 && "Only anyext and sext are currently implemented.");
19378 assert(MemVT != RegVT && "Cannot extend to the same type");
19379 assert(MemVT.isVector() && "Must load a vector from memory");
19381 unsigned NumElems = RegVT.getVectorNumElements();
19382 unsigned MemSz = MemVT.getSizeInBits();
19383 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19385 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19386 // The only way in which we have a legal 256-bit vector result but not the
19387 // integer 256-bit operations needed to directly lower a sextload is if we
19388 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19389 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19390 // correctly legalized. We do this late to allow the canonical form of
19391 // sextload to persist throughout the rest of the DAG combiner -- it wants
19392 // to fold together any extensions it can, and so will fuse a sign_extend
19393 // of an sextload into a sextload targeting a wider value.
19395 if (MemSz == 128) {
19396 // Just switch this to a normal load.
19397 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19398 "it must be a legal 128-bit vector "
19400 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19401 Ld->getPointerInfo(), Ld->getAlignment(),
19402 Ld->getMemOperand()->getFlags());
19404 assert(MemSz < 128 &&
19405 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19406 // Do an sext load to a 128-bit vector type. We want to use the same
19407 // number of elements, but elements half as wide. This will end up being
19408 // recursively lowered by this routine, but will succeed as we definitely
19409 // have all the necessary features if we're using AVX1.
19411 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19412 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
19414 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19415 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
19416 Ld->getMemOperand()->getFlags());
19419 // Replace chain users with the new chain.
19420 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19421 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19423 // Finally, do a normal sign-extend to the desired register.
19424 return DAG.getSExtOrTrunc(Load, dl, RegVT);
19427 // All sizes must be a power of two.
19428 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19429 "Non-power-of-two elements are not custom lowered!");
19431 // Attempt to load the original value using scalar loads.
19432 // Find the largest scalar type that divides the total loaded size.
19433 MVT SclrLoadTy = MVT::i8;
19434 for (MVT Tp : MVT::integer_valuetypes()) {
19435 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19440 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to F64.
19441 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19443 SclrLoadTy = MVT::f64;
19445 // Calculate the number of scalar loads that we need to perform
19446 // in order to load our vector from memory.
19447 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19449 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19450 "Can only lower sext loads with a single scalar load!");
19452 unsigned loadRegZize = RegSz;
19453 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19458 // If we don't have BWI we won't be able to create the shuffle needed for v8i8->v8i64.
19458 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19459 MemVT == MVT::v8i8)
19462 // Represent our vector as a sequence of elements which are the
19463 // largest scalar that we can load.
19464 EVT LoadUnitVecVT = EVT::getVectorVT(
19465 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19467 // Represent the data using the same element type that is stored in
19468 // memory. In practice, we "widen" MemVT.
19470 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19471 loadRegZize / MemVT.getScalarSizeInBits());
19473 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19474 "Invalid vector type");
19476 // We can't shuffle using an illegal type.
19477 assert(TLI.isTypeLegal(WideVecVT) &&
19478 "We only lower types that form legal widened vector types");
19480 SmallVector<SDValue, 8> Chains;
19481 SDValue Ptr = Ld->getBasePtr();
19482 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19483 TLI.getPointerTy(DAG.getDataLayout()));
19484 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19486 for (unsigned i = 0; i < NumLoads; ++i) {
19487 // Perform a single load.
19488 SDValue ScalarLoad =
19489 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19490 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19491 Chains.push_back(ScalarLoad.getValue(1));
19492 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19493 // another round of DAGCombining.
19495 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19497 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19498 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19500 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19503 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19505 // Bitcast the loaded value to a vector of the original element type, in
19506 // the size of the target vector type.
19507 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19508 unsigned SizeRatio = RegSz / MemSz;
19510 if (Ext == ISD::SEXTLOAD) {
19511 // If we have SSE4.1, we can directly emit a VSEXT node.
19512 if (Subtarget.hasSSE41()) {
19513 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19514 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19518 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
19520 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19521 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19523 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19524 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19528 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19529 MemVT == MVT::v8i8) {
19530 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19531 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19535 // Redistribute the loaded elements into the different locations.
19536 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19537 for (unsigned i = 0; i != NumElems; ++i)
19538 ShuffleVec[i * SizeRatio] = i;
19540 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19541 DAG.getUNDEF(WideVecVT), ShuffleVec);
19543 // Bitcast to the requested type.
19544 Shuff = DAG.getBitcast(RegVT, Shuff);
19545 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19549 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19550 /// each of which has no other use apart from the AND / OR.
19551 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19552 Opc = Op.getOpcode();
19553 if (Opc != ISD::OR && Opc != ISD::AND)
19555 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19556 Op.getOperand(0).hasOneUse() &&
19557 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19558 Op.getOperand(1).hasOneUse());
19561 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
19562 /// SETCC node has a single use.
19563 static bool isXor1OfSetCC(SDValue Op) {
19564 if (Op.getOpcode() != ISD::XOR)
19566 if (isOneConstant(Op.getOperand(1)))
19567 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19568 Op.getOperand(0).hasOneUse();
19572 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19573 bool addTest = true;
19574 SDValue Chain = Op.getOperand(0);
19575 SDValue Cond = Op.getOperand(1);
19576 SDValue Dest = Op.getOperand(2);
19579 bool Inverted = false;
19581 if (Cond.getOpcode() == ISD::SETCC) {
19582 // Check for setcc([su]{add,sub,mul}o == 0).
19583 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19584 isNullConstant(Cond.getOperand(1)) &&
19585 Cond.getOperand(0).getResNo() == 1 &&
19586 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19587 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19588 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19589 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19590 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19591 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19593 Cond = Cond.getOperand(0);
19595 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19600 // FIXME: LowerXALUO doesn't handle these!!
19601 else if (Cond.getOpcode() == X86ISD::ADD ||
19602 Cond.getOpcode() == X86ISD::SUB ||
19603 Cond.getOpcode() == X86ISD::SMUL ||
19604 Cond.getOpcode() == X86ISD::UMUL)
19605 Cond = LowerXALUO(Cond, DAG);
19608 // Look past (and (setcc_carry (cmp ...)), 1).
19609 if (Cond.getOpcode() == ISD::AND &&
19610 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19611 isOneConstant(Cond.getOperand(1)))
19612 Cond = Cond.getOperand(0);
19614 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19615 // setting operand in place of the X86ISD::SETCC.
19616 unsigned CondOpcode = Cond.getOpcode();
19617 if (CondOpcode == X86ISD::SETCC ||
19618 CondOpcode == X86ISD::SETCC_CARRY) {
19619 CC = Cond.getOperand(0);
19621 SDValue Cmp = Cond.getOperand(1);
19622 unsigned Opc = Cmp.getOpcode();
19623 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19624 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19628 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19632 // These can only come from an arithmetic instruction with overflow,
19633 // e.g. SADDO, UADDO.
19634 Cond = Cond.getOperand(1);
19640 CondOpcode = Cond.getOpcode();
19641 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19642 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19643 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19644 Cond.getOperand(0).getValueType() != MVT::i8)) {
19645 SDValue LHS = Cond.getOperand(0);
19646 SDValue RHS = Cond.getOperand(1);
19647 unsigned X86Opcode;
19650 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19651 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and X86ISD::SUB).
19653 switch (CondOpcode) {
19654 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19656 if (isOneConstant(RHS)) {
19657 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19660 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19661 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19663 if (isOneConstant(RHS)) {
19664 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19667 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19668 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19669 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19670 default: llvm_unreachable("unexpected overflowing operator");
19673 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19674 if (CondOpcode == ISD::UMULO)
19675 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19678 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19680 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19682 if (CondOpcode == ISD::UMULO)
19683 Cond = X86Op.getValue(2);
19685 Cond = X86Op.getValue(1);
19687 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19691 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19692 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19693 if (CondOpc == ISD::OR) {
19694 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19695 // two branches instead of an explicit OR instruction with a separate test.
19697 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19698 isX86LogicalCmp(Cmp)) {
19699 CC = Cond.getOperand(0).getOperand(0);
19700 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19701 Chain, Dest, CC, Cmp);
19702 CC = Cond.getOperand(1).getOperand(0);
19706 } else { // ISD::AND
19707 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19708 // two branches instead of an explicit AND instruction with a
19709 // separate test. However, we only do this if this block doesn't
19710 // have a fall-through edge, because this requires an explicit
19711 // jmp when the condition is false.
19712 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19713 isX86LogicalCmp(Cmp) &&
19714 Op.getNode()->hasOneUse()) {
19715 X86::CondCode CCode =
19716 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19717 CCode = X86::GetOppositeBranchCondition(CCode);
19718 CC = DAG.getConstant(CCode, dl, MVT::i8);
19719 SDNode *User = *Op.getNode()->use_begin();
19720 // Look for an unconditional branch following this conditional branch.
19721 // We need this because we need to reverse the successors in order
19722 // to implement FCMP_OEQ.
19723 if (User->getOpcode() == ISD::BR) {
19724 SDValue FalseBB = User->getOperand(1);
19726 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19727 assert(NewBR == User);
19731 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19732 Chain, Dest, CC, Cmp);
19733 X86::CondCode CCode =
19734 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19735 CCode = X86::GetOppositeBranchCondition(CCode);
19736 CC = DAG.getConstant(CCode, dl, MVT::i8);
19742 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19743 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
19744 // It should be transformed by the DAG combiner except when the condition
19745 // is set by an arithmetic-with-overflow node.
19746 X86::CondCode CCode =
19747 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19748 CCode = X86::GetOppositeBranchCondition(CCode);
19749 CC = DAG.getConstant(CCode, dl, MVT::i8);
19750 Cond = Cond.getOperand(0).getOperand(1);
19752 } else if (Cond.getOpcode() == ISD::SETCC &&
19753 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19754 // For FCMP_OEQ, we can emit
19755 // two branches instead of an explicit AND instruction with a
19756 // separate test. However, we only do this if this block doesn't
19757 // have a fall-through edge, because this requires an explicit
19758 // jmp when the condition is false.
19759 if (Op.getNode()->hasOneUse()) {
19760 SDNode *User = *Op.getNode()->use_begin();
19761 // Look for an unconditional branch following this conditional branch.
19762 // We need this because we need to reverse the successors in order
19763 // to implement FCMP_OEQ.
19764 if (User->getOpcode() == ISD::BR) {
19765 SDValue FalseBB = User->getOperand(1);
19767 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19768 assert(NewBR == User);
19772 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19773 Cond.getOperand(0), Cond.getOperand(1));
19774 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19775 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19776 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19777 Chain, Dest, CC, Cmp);
19778 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19783 } else if (Cond.getOpcode() == ISD::SETCC &&
19784 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19785 // For FCMP_UNE, we can emit
19786 // two branches instead of an explicit AND instruction with a
19787 // separate test. However, we only do this if this block doesn't
19788 // have a fall-through edge, because this requires an explicit
19789 // jmp when the condition is false.
19790 if (Op.getNode()->hasOneUse()) {
19791 SDNode *User = *Op.getNode()->use_begin();
19792 // Look for an unconditional branch following this conditional branch.
19793 // We need this because we need to reverse the successors in order
19794 // to implement FCMP_UNE.
19795 if (User->getOpcode() == ISD::BR) {
19796 SDValue FalseBB = User->getOperand(1);
19798 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19799 assert(NewBR == User);
19802 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19803 Cond.getOperand(0), Cond.getOperand(1));
19804 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19805 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19806 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19807 Chain, Dest, CC, Cmp);
19808 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19819 // Look past the truncate if the high bits are known zero.
19819 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19820 Cond = Cond.getOperand(0);
19822 // We know the result of AND is compared against zero. Try to match it to BT.
19824 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19825 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19826 CC = NewSetCC.getOperand(0);
19827 Cond = NewSetCC.getOperand(1);
19834 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19835 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19836 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19838 Cond = ConvertCmpIfNecessary(Cond, DAG);
19839 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19840 Chain, Dest, CC, Cond);
19843 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19844 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19845 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19846 // that the guard pages used by the OS virtual memory manager are allocated in
19847 // the correct sequence.
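// For example, a 16 KB allocation is probed at four successive 4 KB offsets so
// that each guard page is touched (and the page committed) before the next one.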
19849 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19850 SelectionDAG &DAG) const {
19851 MachineFunction &MF = DAG.getMachineFunction();
19852 bool SplitStack = MF.shouldSplitStack();
19853 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19854 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19855 SplitStack || EmitStackProbe;
19859 SDNode *Node = Op.getNode();
19860 SDValue Chain = Op.getOperand(0);
19861 SDValue Size = Op.getOperand(1);
19862 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19863 EVT VT = Node->getValueType(0);
19865 // Chain the dynamic stack allocation so that it doesn't modify the stack
19866 // pointer when other instructions are using the stack.
19867 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19869 bool Is64Bit = Subtarget.is64Bit();
19870 MVT SPTy = getPointerTy(DAG.getDataLayout());
19874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19875 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19876 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19877 " not tell us which reg is the stack pointer!");
19879 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19880 Chain = SP.getValue(1);
19881 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19882 unsigned StackAlign = TFI.getStackAlignment();
19883 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19884 if (Align > StackAlign)
19885 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19886 DAG.getConstant(-(uint64_t)Align, dl, VT));
19887 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19888 } else if (SplitStack) {
19889 MachineRegisterInfo &MRI = MF.getRegInfo();
19892 // The 64-bit implementation of segmented stacks needs to clobber both r10 and
19893 // r11. This makes it impossible to use it along with nested parameters.
19894 const Function &F = MF.getFunction();
19895 for (const auto &A : F.args()) {
19896 if (A.hasNestAttr())
19897 report_fatal_error("Cannot use segmented stacks with functions that "
19898 "have nested arguments.");
19902 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19903 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19904 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19905 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19906 DAG.getRegister(Vreg, SPTy));
19908 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19909 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19910 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19912 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19913 unsigned SPReg = RegInfo->getStackRegister();
19914 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19915 Chain = SP.getValue(1);
19918 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19919 DAG.getConstant(-(uint64_t)Align, dl, VT));
19920 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19926 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19927 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19929 SDValue Ops[2] = {Result, Chain};
19930 return DAG.getMergeValues(Ops, dl);
19933 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19934 MachineFunction &MF = DAG.getMachineFunction();
19935 auto PtrVT = getPointerTy(MF.getDataLayout());
19936 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19938 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19941 if (!Subtarget.is64Bit() ||
19942 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19943 // vastart just stores the address of the VarArgsFrameIndex slot into the
19944 // memory location argument.
19945 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19946 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19947 MachinePointerInfo(SV));
19951 // gp_offset (0 - 6 * 8)
19952 // fp_offset (48 - 48 + 8 * 16)
19953 // overflow_arg_area (points to parameters passed in memory).
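// For reference, this mirrors the SysV AMD64 __va_list_tag layout (a sketch;
// field names follow the ABI, not this file):
//   struct __va_list_tag {
//     unsigned gp_offset;          // byte offset 0
//     unsigned fp_offset;          // byte offset 4
//     void    *overflow_arg_area;  // byte offset 8
//     void    *reg_save_area;      // byte offset 16
//   };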
19955 SmallVector<SDValue, 8> MemOps;
19956 SDValue FIN = Op.getOperand(1);
19958 SDValue Store = DAG.getStore(
19959 Op.getOperand(0), DL,
19960 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19961 MachinePointerInfo(SV));
19962 MemOps.push_back(Store);
19965 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19966 Store = DAG.getStore(
19967 Op.getOperand(0), DL,
19968 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19969 MachinePointerInfo(SV, 4));
19970 MemOps.push_back(Store);
19972 // Store ptr to overflow_arg_area
19973 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19974 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19976 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19977 MemOps.push_back(Store);
19979 // Store ptr to reg_save_area.
19980 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19981 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19982 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19983 Store = DAG.getStore(
19984 Op.getOperand(0), DL, RSFIN, FIN,
19985 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19986 MemOps.push_back(Store);
19987 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19990 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19991 assert(Subtarget.is64Bit() &&
19992 "LowerVAARG only handles 64-bit va_arg!");
19993 assert(Op.getNumOperands() == 4);
19995 MachineFunction &MF = DAG.getMachineFunction();
19996 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
19997 // The Win64 ABI uses char* instead of a structure.
19998 return DAG.expandVAArg(Op.getNode());
20000 SDValue Chain = Op.getOperand(0);
20001 SDValue SrcPtr = Op.getOperand(1);
20002 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
20003 unsigned Align = Op.getConstantOperandVal(3);
20006 EVT ArgVT = Op.getNode()->getValueType(0);
20007 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20008 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
20011 // Decide which area this value should be read from.
20012 // TODO: Implement the AMD64 ABI in its entirety. This simple
20013 // selection mechanism works only for the basic types.
20014 if (ArgVT == MVT::f80) {
20015 llvm_unreachable("va_arg for f80 not yet implemented");
20016 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
20017 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
20018 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
20019 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
20021 llvm_unreachable("Unhandled argument type in LowerVAARG");
20024 if (ArgMode == 2) {
20025 // Sanity Check: Make sure using fp_offset makes sense.
20026 assert(!Subtarget.useSoftFloat() &&
20027 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
20028 Subtarget.hasSSE1());
20031 // Insert VAARG_64 node into the DAG
20032 // VAARG_64 returns two values: Variable Argument Address, Chain
20033 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
20034 DAG.getConstant(ArgMode, dl, MVT::i8),
20035 DAG.getConstant(Align, dl, MVT::i32)};
20036 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
20037 SDValue VAARG = DAG.getMemIntrinsicNode(
20038 X86ISD::VAARG_64, dl,
20039 VTs, InstOps, MVT::i64,
20040 MachinePointerInfo(SV),
20042 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
20043 Chain = VAARG.getValue(1);
20045 // Load the next argument and return it
20046 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
20049 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
20050 SelectionDAG &DAG) {
20051 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
20052 // where a va_list is still an i8*.
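// That struct is 4 + 4 + 8 + 8 = 24 bytes, which is the length used by the
// memcpy emitted below.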
20053 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
20054 if (Subtarget.isCallingConvWin64(
20055 DAG.getMachineFunction().getFunction().getCallingConv()))
20056 // Probably a Win64 va_copy.
20057 return DAG.expandVACopy(Op.getNode());
20059 SDValue Chain = Op.getOperand(0);
20060 SDValue DstPtr = Op.getOperand(1);
20061 SDValue SrcPtr = Op.getOperand(2);
20062 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
20063 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20066 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
20067 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
20069 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
20072 /// Handle vector element shifts where the shift amount is a constant.
20073 /// Takes immediate version of shift as input.
20074 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
20075 SDValue SrcOp, uint64_t ShiftAmt,
20076 SelectionDAG &DAG) {
20077 MVT ElementType = VT.getVectorElementType();
20079 // Bitcast the source vector to the output type; this is mainly necessary for
20080 // vXi8/vXi64 shifts.
20081 if (VT != SrcOp.getSimpleValueType())
20082 SrcOp = DAG.getBitcast(VT, SrcOp);
20084 // Fold this packed shift into its first operand if ShiftAmt is 0.
20088 // Check for ShiftAmt >= element width
20089 if (ShiftAmt >= ElementType.getSizeInBits()) {
20090 if (Opc == X86ISD::VSRAI)
20091 ShiftAmt = ElementType.getSizeInBits() - 1;
20093 return DAG.getConstant(0, dl, VT);
20096 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
20097 && "Unknown target vector shift-by-constant node");
20099 // Fold this packed vector shift into a build vector if SrcOp is a
20100 // vector of Constants or UNDEFs.
20101 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
20102 SmallVector<SDValue, 8> Elts;
20103 unsigned NumElts = SrcOp->getNumOperands();
20104 ConstantSDNode *ND;
20107 default: llvm_unreachable("Unknown opcode!");
20108 case X86ISD::VSHLI:
20109 for (unsigned i=0; i!=NumElts; ++i) {
20110 SDValue CurrentOp = SrcOp->getOperand(i);
20111 if (CurrentOp->isUndef()) {
20112 Elts.push_back(CurrentOp);
20115 ND = cast<ConstantSDNode>(CurrentOp);
20116 const APInt &C = ND->getAPIntValue();
20117 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
20120 case X86ISD::VSRLI:
20121 for (unsigned i=0; i!=NumElts; ++i) {
20122 SDValue CurrentOp = SrcOp->getOperand(i);
20123 if (CurrentOp->isUndef()) {
20124 Elts.push_back(CurrentOp);
20127 ND = cast<ConstantSDNode>(CurrentOp);
20128 const APInt &C = ND->getAPIntValue();
20129 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
20132 case X86ISD::VSRAI:
20133 for (unsigned i=0; i!=NumElts; ++i) {
20134 SDValue CurrentOp = SrcOp->getOperand(i);
20135 if (CurrentOp->isUndef()) {
20136 Elts.push_back(CurrentOp);
20139 ND = cast<ConstantSDNode>(CurrentOp);
20140 const APInt &C = ND->getAPIntValue();
20141 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
20146 return DAG.getBuildVector(VT, dl, Elts);
20149 return DAG.getNode(Opc, dl, VT, SrcOp,
20150 DAG.getConstant(ShiftAmt, dl, MVT::i8));
20153 /// Handle vector element shifts where the shift amount may or may not be a
20154 /// constant. Takes immediate version of shift as input.
20155 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
20156 SDValue SrcOp, SDValue ShAmt,
20157 const X86Subtarget &Subtarget,
20158 SelectionDAG &DAG) {
20159 MVT SVT = ShAmt.getSimpleValueType();
20160 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
20162 // Catch shift-by-constant.
20163 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
20164 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
20165 CShAmt->getZExtValue(), DAG);
20167 // Change opcode to non-immediate version
20169 default: llvm_unreachable("Unknown target vector shift node");
20170 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
20171 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
20172 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
20175 // Need to build a vector containing shift amount.
20176 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
20177 // +=================+============+=======================================+
20178 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
20179 // +=================+============+=======================================+
20180 // | i64 | Yes, No | Use ShAmt as lowest elt |
20181 // | i32 | Yes | zero-extend in-reg |
20182 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
20183 // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
20184 // +=================+============+=======================================+
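// For example (illustrative): an i32 shift amount on a target without SSE4.1
// follows the last row above and becomes
//   (v4i32 build_vector ShAmt, 0, undef, undef)
// so the low 64 bits of the count hold ShAmt zero-extended, which is all a
// packed shift reads; it is then bitcast to a 128-bit vector of VT's element
// type before being fed to the non-immediate shift node.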
20186 if (SVT == MVT::i64)
20187 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20188 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20189 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
20190 ShAmt = ShAmt.getOperand(0);
20191 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20192 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20193 } else if (Subtarget.hasSSE41() &&
20194 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20195 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
20196 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20198 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
20199 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
20200 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
20203 // The return type has to be a 128-bit type with the same element
20204 // type as the input type.
20205 MVT EltVT = VT.getVectorElementType();
20206 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20208 ShAmt = DAG.getBitcast(ShVT, ShAmt);
20209 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
20212 /// Return Mask with the necessary casting or extending
20213 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
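/// For example (illustrative): an i8 mask lowered for a v8i1 MaskVT is simply
/// bitcast to v8i1 below (the subvector extract is a no-op), while an all-ones
/// constant mask becomes a splat of i1 true and a zero mask an all-zero vector.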
20214 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
20215 const X86Subtarget &Subtarget, SelectionDAG &DAG,
20218 if (isAllOnesConstant(Mask))
20219 return DAG.getConstant(1, dl, MaskVT);
20220 if (X86::isZeroNode(Mask))
20221 return DAG.getConstant(0, dl, MaskVT);
20223 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20224 // Mask should be extended
20225 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
20226 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
20229 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20230 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20231 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
20232 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
20234 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20235 DAG.getConstant(0, dl, MVT::i32));
20236 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20237 DAG.getConstant(1, dl, MVT::i32));
20239 Lo = DAG.getBitcast(MVT::v32i1, Lo);
20240 Hi = DAG.getBitcast(MVT::v32i1, Hi);
20242 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
20244 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20245 Mask.getSimpleValueType().getSizeInBits());
20246 // When MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
20247 // are extracted by EXTRACT_SUBVECTOR.
20248 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
20249 DAG.getBitcast(BitcastVT, Mask),
20250 DAG.getIntPtrConstant(0, dl));
20254 /// Return (and \p Op, \p Mask) for compare instructions or
20255 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20256 /// necessary casting or extending for \p Mask when lowering masking intrinsics
20257 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20258 SDValue PreservedSrc,
20259 const X86Subtarget &Subtarget,
20260 SelectionDAG &DAG) {
20261 MVT VT = Op.getSimpleValueType();
20262 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20263 unsigned OpcodeSelect = ISD::VSELECT;
20266 if (isAllOnesConstant(Mask))
20269 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20271 switch (Op.getOpcode()) {
20274 case X86ISD::CMPM_RND:
20275 case X86ISD::VPSHUFBITQMB:
20276 case X86ISD::VFPCLASS:
20277 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20278 case ISD::TRUNCATE:
20279 case X86ISD::VTRUNC:
20280 case X86ISD::VTRUNCS:
20281 case X86ISD::VTRUNCUS:
20282 case X86ISD::CVTPS2PH:
20283 // We can't use ISD::VSELECT here because it is not always "Legal"
20284 // for the destination type. For example vpmovqb requires only AVX512,
20285 // while a vselect that can operate on byte element types requires BWI.
20286 OpcodeSelect = X86ISD::SELECT;
20289 if (PreservedSrc.isUndef())
20290 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20291 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
20294 /// Creates an SDNode for a predicated scalar operation.
20295 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
20296 /// The mask comes in as MVT::i8 and is transformed
20297 /// to MVT::v1i1 while lowering masking intrinsics.
20298 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20299 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20300 /// for a scalar instruction.
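/// For example (illustrative): with a non-constant i8 mask, a masked scalar
/// operation is lowered below to
///   (X86ISD::SELECTS (v1i1 scalar_to_vector Mask), Op, PreservedSrc)
/// except for the FSETCCM/VFPCLASSS opcodes, which are ANDed with the mask.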
20301 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20302 SDValue PreservedSrc,
20303 const X86Subtarget &Subtarget,
20304 SelectionDAG &DAG) {
20306 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
20307 if (MaskConst->getZExtValue() & 0x1)
20310 MVT VT = Op.getSimpleValueType();
20313 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
20314 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
20315 if (Op.getOpcode() == X86ISD::FSETCCM ||
20316 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20317 Op.getOpcode() == X86ISD::VFPCLASSS)
20318 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
20320 if (PreservedSrc.isUndef())
20321 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20322 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
20325 static int getSEHRegistrationNodeSize(const Function *Fn) {
20326 if (!Fn->hasPersonalityFn())
20327 report_fatal_error(
20328 "querying registration node size for function without personality");
20329 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20330 // WinEHStatePass for the full struct definition.
20331 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20332 case EHPersonality::MSVC_X86SEH: return 24;
20333 case EHPersonality::MSVC_CXX: return 16;
20336 report_fatal_error(
20337 "can only recover FP for 32-bit MSVC EH personality functions");
20340 /// When the MSVC runtime transfers control to us, either to an outlined
20341 /// function or when returning to a parent frame after catching an exception, we
20342 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20343 /// Here's the math:
20344 /// RegNodeBase = EntryEBP - RegNodeSize
20345 /// ParentFP = RegNodeBase - ParentFrameOffset
20346 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20347 /// subtracting the offset (negative on x86) takes us back to the parent FP.
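/// Worked example (values assumed for illustration): with the 32-bit MSVC C++
/// personality, getSEHRegistrationNodeSize returns 16, so the nodes built
/// below compute ParentFP = (EntryEBP - 16) - ParentFrameOffset, where
/// ParentFrameOffset is the LOCAL_RECOVER value resolved at frame layout time.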
20348 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20349 SDValue EntryEBP) {
20350 MachineFunction &MF = DAG.getMachineFunction();
20353 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20354 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20356 // It's possible that the parent function no longer has a personality function
20357 // if the exceptional code was optimized away, in which case we just return
20358 // the incoming EBP.
20359 if (!Fn->hasPersonalityFn())
20362 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20363 // registration, or the .set_setframe offset.
20364 MCSymbol *OffsetSym =
20365 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20366 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20367 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20368 SDValue ParentFrameOffset =
20369 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20371 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20372 // prologue to RBP in the parent function.
20373 const X86Subtarget &Subtarget =
20374 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20375 if (Subtarget.is64Bit())
20376 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
20378 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20379 // RegNodeBase = EntryEBP - RegNodeSize
20380 // ParentFP = RegNodeBase - ParentFrameOffset
20381 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20382 DAG.getConstant(RegNodeSize, dl, PtrVT));
20383 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
20386 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20387 SelectionDAG &DAG) const {
20388 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20389 auto isRoundModeCurDirection = [](SDValue Rnd) {
20390 if (!isa<ConstantSDNode>(Rnd))
20393 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
20394 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
20398 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20399 MVT VT = Op.getSimpleValueType();
20400 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
20402 switch(IntrData->Type) {
20403 case INTR_TYPE_1OP:
20404 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
20405 case INTR_TYPE_2OP: {
20406 // We specify 2 possible opcodes for intrinsics with rounding modes.
20407 // First, we check if the intrinsic may have non-default rounding mode,
20408 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20409 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20410 if (IntrWithRoundingModeOpcode != 0) {
20411 SDValue Rnd = Op.getOperand(3);
20412 if (!isRoundModeCurDirection(Rnd)) {
20413 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20414 Op.getOperand(1), Op.getOperand(2), Rnd);
20418 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20419 Op.getOperand(1), Op.getOperand(2));
20421 case INTR_TYPE_3OP:
20422 case INTR_TYPE_3OP_IMM8: {
20423 SDValue Src3 = Op.getOperand(3);
20425 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
20426 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20428 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20429 Op.getOperand(1), Op.getOperand(2), Src3);
20431 case INTR_TYPE_4OP:
20432 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20433 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20434 case INTR_TYPE_1OP_MASK_RM: {
20435 SDValue Src = Op.getOperand(1);
20436 SDValue PassThru = Op.getOperand(2);
20437 SDValue Mask = Op.getOperand(3);
20438 SDValue RoundingMode;
20439 // We always add rounding mode to the Node.
20440 // If the rounding mode is not specified, we add the
20441 // "current direction" mode.
20442 if (Op.getNumOperands() == 4)
20444 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20446 RoundingMode = Op.getOperand(4);
20447 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20448 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20450 Mask, PassThru, Subtarget, DAG);
20452 case INTR_TYPE_1OP_MASK: {
20453 SDValue Src = Op.getOperand(1);
20454 SDValue PassThru = Op.getOperand(2);
20455 SDValue Mask = Op.getOperand(3);
20456 // We add rounding mode to the Node when
20457 // - RM Opcode is specified and
20458 // - RM is not "current direction".
20459 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20460 if (IntrWithRoundingModeOpcode != 0) {
20461 SDValue Rnd = Op.getOperand(4);
20462 if (!isRoundModeCurDirection(Rnd)) {
20463 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20464 dl, Op.getValueType(),
20466 Mask, PassThru, Subtarget, DAG);
20469 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20470 Mask, PassThru, Subtarget, DAG);
20472 case INTR_TYPE_SCALAR_MASK: {
20473 SDValue Src1 = Op.getOperand(1);
20474 SDValue Src2 = Op.getOperand(2);
20475 SDValue passThru = Op.getOperand(3);
20476 SDValue Mask = Op.getOperand(4);
20477 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20478 // There are 2 kinds of intrinsics in this group:
20479 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20480 // (2) With rounding mode and sae - 7 operands.
20481 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20482 if (Op.getNumOperands() == (5U + HasRounding)) {
20484 SDValue Rnd = Op.getOperand(5);
20485 if (!isRoundModeCurDirection(Rnd))
20486 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20487 dl, VT, Src1, Src2, Rnd),
20488 Mask, passThru, Subtarget, DAG);
20490 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20492 Mask, passThru, Subtarget, DAG);
20495 assert(Op.getNumOperands() == (6U + HasRounding) &&
20496 "Unexpected intrinsic form");
20497 SDValue RoundingMode = Op.getOperand(5);
20499 SDValue Sae = Op.getOperand(6);
20500 if (!isRoundModeCurDirection(Sae))
20501 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20502 dl, VT, Src1, Src2,
20503 RoundingMode, Sae),
20504 Mask, passThru, Subtarget, DAG);
20506 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20507 Src2, RoundingMode),
20508 Mask, passThru, Subtarget, DAG);
20510 case INTR_TYPE_SCALAR_MASK_RM: {
20511 SDValue Src1 = Op.getOperand(1);
20512 SDValue Src2 = Op.getOperand(2);
20513 SDValue Src0 = Op.getOperand(3);
20514 SDValue Mask = Op.getOperand(4);
20515 // There are 2 kinds of intrinsics in this group:
20516 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20517 // (2) With rounding mode and sae - 7 operands.
20518 if (Op.getNumOperands() == 6) {
20519 SDValue Sae = Op.getOperand(5);
20520 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20522 Mask, Src0, Subtarget, DAG);
20524 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20525 SDValue RoundingMode = Op.getOperand(5);
20526 SDValue Sae = Op.getOperand(6);
20527 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20528 RoundingMode, Sae),
20529 Mask, Src0, Subtarget, DAG);
20531 case INTR_TYPE_2OP_MASK:
20532 case INTR_TYPE_2OP_IMM8_MASK: {
20533 SDValue Src1 = Op.getOperand(1);
20534 SDValue Src2 = Op.getOperand(2);
20535 SDValue PassThru = Op.getOperand(3);
20536 SDValue Mask = Op.getOperand(4);
20538 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
20539 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20541 // We specify 2 possible opcodes for intrinsics with rounding modes.
20542 // First, we check if the intrinsic may have non-default rounding mode,
20543 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20544 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20545 if (IntrWithRoundingModeOpcode != 0) {
20546 SDValue Rnd = Op.getOperand(5);
20547 if (!isRoundModeCurDirection(Rnd)) {
20548 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20549 dl, Op.getValueType(),
20551 Mask, PassThru, Subtarget, DAG);
20554 // TODO: Intrinsics should have fast-math-flags to propagate.
20555 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20556 Mask, PassThru, Subtarget, DAG);
20558 case INTR_TYPE_2OP_MASK_RM: {
20559 SDValue Src1 = Op.getOperand(1);
20560 SDValue Src2 = Op.getOperand(2);
20561 SDValue PassThru = Op.getOperand(3);
20562 SDValue Mask = Op.getOperand(4);
20563 // We specify 2 possible modes for intrinsics, with/without rounding mode.
20565 // First, we check if the intrinsic has a rounding mode (6 operands);
20566 // if not, we set the rounding mode to "current".
20568 if (Op.getNumOperands() == 6)
20569 Rnd = Op.getOperand(5);
20571 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20572 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20574 Mask, PassThru, Subtarget, DAG);
20576 case INTR_TYPE_3OP_SCALAR_MASK: {
20577 SDValue Src1 = Op.getOperand(1);
20578 SDValue Src2 = Op.getOperand(2);
20579 SDValue Src3 = Op.getOperand(3);
20580 SDValue PassThru = Op.getOperand(4);
20581 SDValue Mask = Op.getOperand(5);
20583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20584 if (IntrWithRoundingModeOpcode != 0) {
20585 SDValue Rnd = Op.getOperand(6);
20586 if (!isRoundModeCurDirection(Rnd))
20587 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20588 dl, VT, Src1, Src2, Src3, Rnd),
20589 Mask, PassThru, Subtarget, DAG);
20591 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20593 Mask, PassThru, Subtarget, DAG);
20595 case INTR_TYPE_3OP_MASK: {
20596 SDValue Src1 = Op.getOperand(1);
20597 SDValue Src2 = Op.getOperand(2);
20598 SDValue Src3 = Op.getOperand(3);
20599 SDValue PassThru = Op.getOperand(4);
20600 SDValue Mask = Op.getOperand(5);
20602 // We specify 2 possible opcodes for intrinsics with rounding modes.
20603 // First, we check if the intrinsic may have non-default rounding mode,
20604 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20605 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20606 if (IntrWithRoundingModeOpcode != 0) {
20607 SDValue Rnd = Op.getOperand(6);
20608 if (!isRoundModeCurDirection(Rnd)) {
20609 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20610 dl, Op.getValueType(),
20611 Src1, Src2, Src3, Rnd),
20612 Mask, PassThru, Subtarget, DAG);
20615 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20617 Mask, PassThru, Subtarget, DAG);
20619 case INTR_TYPE_3OP_RM: {
20620 SDValue Src1 = Op.getOperand(1);
20621 SDValue Src2 = Op.getOperand(2);
20622 SDValue Src3 = Op.getOperand(3);
20624 // We specify 2 possible opcodes for intrinsics with rounding modes.
20625 // First, we check if the intrinsic may have non-default rounding mode,
20626 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20627 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20628 if (IntrWithRoundingModeOpcode != 0) {
20629 SDValue Rnd = Op.getOperand(4);
20630 if (!isRoundModeCurDirection(Rnd)) {
20631 return DAG.getNode(IntrWithRoundingModeOpcode,
20632 dl, Op.getValueType(),
20633 Src1, Src2, Src3, Rnd);
20636 return DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3);
20639 SDValue Src1 = Op.getOperand(1);
20640 SDValue Src2 = Op.getOperand(2);
20642 // Swap Src1 and Src2 in the node creation
20643 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
20647 case FMA_OP_MASK: {
20648 SDValue Src1 = Op.getOperand(1);
20649 SDValue Src2 = Op.getOperand(2);
20650 SDValue Src3 = Op.getOperand(3);
20651 SDValue Mask = Op.getOperand(4);
20652 MVT VT = Op.getSimpleValueType();
20653 SDValue PassThru = SDValue();
20655 // set PassThru element
20656 if (IntrData->Type == FMA_OP_MASKZ)
20657 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20658 else if (IntrData->Type == FMA_OP_MASK3)
20663 // We specify 2 possible opcodes for intrinsics with rounding modes.
20664 // First, we check if the intrinsic may have non-default rounding mode,
20665 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20666 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20667 if (IntrWithRoundingModeOpcode != 0) {
20668 SDValue Rnd = Op.getOperand(5);
20669 if (!isRoundModeCurDirection(Rnd))
20670 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20671 dl, Op.getValueType(),
20672 Src1, Src2, Src3, Rnd),
20673 Mask, PassThru, Subtarget, DAG);
20675 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20676 dl, Op.getValueType(),
20678 Mask, PassThru, Subtarget, DAG);
20680 case FMA_OP_SCALAR_MASK:
20681 case FMA_OP_SCALAR_MASK3:
20682 case FMA_OP_SCALAR_MASKZ: {
20683 SDValue Src1 = Op.getOperand(1);
20684 SDValue Src2 = Op.getOperand(2);
20685 SDValue Src3 = Op.getOperand(3);
20686 SDValue Mask = Op.getOperand(4);
20687 MVT VT = Op.getSimpleValueType();
20688 SDValue PassThru = SDValue();
20690 // set PassThru element
20691 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20692 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20693 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20698 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20699 if (IntrWithRoundingModeOpcode != 0) {
20700 SDValue Rnd = Op.getOperand(5);
20701 if (!isRoundModeCurDirection(Rnd))
20702 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20703 Op.getValueType(), Src1, Src2,
20705 Mask, PassThru, Subtarget, DAG);
20708 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20709 Op.getValueType(), Src1, Src2,
20711 Mask, PassThru, Subtarget, DAG);
20714 // NOTE: We need to swizzle the operands to pass the multiply operands first.
20716 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20717 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
20719 // ISD::FP_ROUND has a second argument that indicates if the truncation
20720 // does not change the value. Set it to 0 since it can change.
20721 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20722 DAG.getIntPtrConstant(0, dl));
20723 case CVTPD2PS_MASK: {
20724 SDValue Src = Op.getOperand(1);
20725 SDValue PassThru = Op.getOperand(2);
20726 SDValue Mask = Op.getOperand(3);
20727 // We add rounding mode to the Node when
20728 // - RM Opcode is specified and
20729 // - RM is not "current direction".
20730 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20731 if (IntrWithRoundingModeOpcode != 0) {
20732 SDValue Rnd = Op.getOperand(4);
20733 if (!isRoundModeCurDirection(Rnd)) {
20734 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20735 dl, Op.getValueType(),
20737 Mask, PassThru, Subtarget, DAG);
20740 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20741 // ISD::FP_ROUND has a second argument that indicates if the truncation
20742 // does not change the value. Set it to 0 since it can change.
20743 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20744 DAG.getIntPtrConstant(0, dl)),
20745 Mask, PassThru, Subtarget, DAG);
20748 // FPclass intrinsics
20749 SDValue Src1 = Op.getOperand(1);
20750 MVT MaskVT = Op.getSimpleValueType();
20751 SDValue Imm = Op.getOperand(2);
20752 return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20755 SDValue Src1 = Op.getOperand(1);
20756 SDValue Imm = Op.getOperand(2);
20757 SDValue Mask = Op.getOperand(3);
20758 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20759 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20761 // Need to fill with zeros to ensure the bitcast will produce zeroes
20762 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20763 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20764 DAG.getConstant(0, dl, MVT::v8i1),
20765 FPclassMask, DAG.getIntPtrConstant(0, dl));
20766 return DAG.getBitcast(MVT::i8, Ins);
20769 // Comparison intrinsics with masks.
20770 // Example of transformation:
20771 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20772 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20774 // (v8i1 (insert_subvector zero,
20775 // (v2i1 (and (PCMPEQM %a, %b),
20776 // (extract_subvector
20777 // (v8i1 (bitcast %mask)), 0))), 0))))
20778 MVT VT = Op.getOperand(1).getSimpleValueType();
20779 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20780 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20781 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20782 Mask.getSimpleValueType().getSizeInBits());
20783 SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20785 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20787 // Need to fill with zeros to ensure the bitcast will produce zeroes
20788 // for the upper bits in the v2i1/v4i1 case.
20789 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20790 DAG.getConstant(0, dl, BitcastVT),
20791 CmpMask, DAG.getIntPtrConstant(0, dl));
20792 return DAG.getBitcast(Op.getValueType(), Res);
20795 case CMP_MASK_CC: {
20796 MVT MaskVT = Op.getSimpleValueType();
20798 SDValue CC = Op.getOperand(3);
20799 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20800 // We specify 2 possible opcodes for intrinsics with rounding modes.
20801 // First, we check if the intrinsic may have non-default rounding mode,
20802 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20803 if (IntrData->Opc1 != 0) {
20804 SDValue Rnd = Op.getOperand(4);
20805 if (!isRoundModeCurDirection(Rnd))
20806 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20807 Op.getOperand(2), CC, Rnd);
20809 // Default rounding mode.
20810 if (!Cmp.getNode())
20811 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20812 Op.getOperand(2), CC);
20816 case CMP_MASK_SCALAR_CC: {
20817 SDValue Src1 = Op.getOperand(1);
20818 SDValue Src2 = Op.getOperand(2);
20819 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20820 SDValue Mask = Op.getOperand(4);
20823 if (IntrData->Opc1 != 0) {
20824 SDValue Rnd = Op.getOperand(5);
20825 if (!isRoundModeCurDirection(Rnd))
20826 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20828 // Default rounding mode.
20830 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20832 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20834 // Need to fill with zeros to ensure the bitcast will produce zeroes
20835 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20836 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20837 DAG.getConstant(0, dl, MVT::v8i1),
20838 CmpMask, DAG.getIntPtrConstant(0, dl));
20839 return DAG.getBitcast(MVT::i8, Ins);
20841 case COMI: { // Comparison intrinsics
20842 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20843 SDValue LHS = Op.getOperand(1);
20844 SDValue RHS = Op.getOperand(2);
20845 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20846 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20849 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20850 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20851 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20852 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20855 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20856 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20857 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20858 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20861 case ISD::SETGT: // (CF = 0 and ZF = 0)
20862 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20864 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20865 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20868 case ISD::SETGE: // CF = 0
20869 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20871 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20872 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20875 llvm_unreachable("Unexpected illegal condition!");
20877 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20879 case COMI_RM: { // Comparison intrinsics with Sae
20880 SDValue LHS = Op.getOperand(1);
20881 SDValue RHS = Op.getOperand(2);
20882 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20883 SDValue Sae = Op.getOperand(4);
20886 if (isRoundModeCurDirection(Sae))
20887 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20888 DAG.getConstant(CondVal, dl, MVT::i8));
20890 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20891 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20892 // Need to fill with zeros to ensure the bitcast will produce zeroes
20893 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20894 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20895 DAG.getConstant(0, dl, MVT::v16i1),
20896 FCmp, DAG.getIntPtrConstant(0, dl));
20897 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
20898 DAG.getBitcast(MVT::i16, Ins));
20901 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20902 Op.getOperand(1), Op.getOperand(2), Subtarget,
20904 case COMPRESS_EXPAND_IN_REG: {
20905 SDValue Mask = Op.getOperand(3);
20906 SDValue DataToCompress = Op.getOperand(1);
20907 SDValue PassThru = Op.getOperand(2);
20908 if (isAllOnesConstant(Mask)) // return data as is
20909 return Op.getOperand(1);
20911 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20913 Mask, PassThru, Subtarget, DAG);
20916 case FIXUPIMMS_MASKZ:
20918 case FIXUPIMM_MASKZ:{
20919 SDValue Src1 = Op.getOperand(1);
20920 SDValue Src2 = Op.getOperand(2);
20921 SDValue Src3 = Op.getOperand(3);
20922 SDValue Imm = Op.getOperand(4);
20923 SDValue Mask = Op.getOperand(5);
20924 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20925 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20926 // We specify 2 possible modes for intrinsics, with/without rounding mode.
20928 // First, we check if the intrinsic has a rounding mode (7 operands);
20929 // if not, we set the rounding mode to "current".
20931 if (Op.getNumOperands() == 7)
20932 Rnd = Op.getOperand(6);
20934 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20935 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20936 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20937 Src1, Src2, Src3, Imm, Rnd),
20938 Mask, Passthru, Subtarget, DAG);
20939 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20940 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20941 Src1, Src2, Src3, Imm, Rnd),
20942 Mask, Passthru, Subtarget, DAG);
20945 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20946 // Clear the upper bits of the rounding immediate so that the legacy
20947 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20948 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20950 DAG.getConstant(0xf, dl, MVT::i32));
20951 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20952 Op.getOperand(1), RoundingMode);
20955 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20956 // Clear the upper bits of the rounding immediate so that the legacy
20957 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20958 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20960 DAG.getConstant(0xf, dl, MVT::i32));
20961 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20962 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20970 default: return SDValue(); // Don't custom lower most intrinsics.
20972 // ptest and testp intrinsics. The intrinsics these come from are designed to
20973 // return an integer value rather than just an instruction, so lower them to a
20974 // ptest or testp pattern and a setcc for the result.
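// For example (illustrative): int_x86_sse41_ptestz(%a, %b) is lowered to
//   (zext i32 (setcc COND_E (X86ISD::PTEST %a, %b)))
// i.e. the ZF result of PTEST, per the condition-code mapping below.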
20975 case Intrinsic::x86_sse41_ptestz:
20976 case Intrinsic::x86_sse41_ptestc:
20977 case Intrinsic::x86_sse41_ptestnzc:
20978 case Intrinsic::x86_avx_ptestz_256:
20979 case Intrinsic::x86_avx_ptestc_256:
20980 case Intrinsic::x86_avx_ptestnzc_256:
20981 case Intrinsic::x86_avx_vtestz_ps:
20982 case Intrinsic::x86_avx_vtestc_ps:
20983 case Intrinsic::x86_avx_vtestnzc_ps:
20984 case Intrinsic::x86_avx_vtestz_pd:
20985 case Intrinsic::x86_avx_vtestc_pd:
20986 case Intrinsic::x86_avx_vtestnzc_pd:
20987 case Intrinsic::x86_avx_vtestz_ps_256:
20988 case Intrinsic::x86_avx_vtestc_ps_256:
20989 case Intrinsic::x86_avx_vtestnzc_ps_256:
20990 case Intrinsic::x86_avx_vtestz_pd_256:
20991 case Intrinsic::x86_avx_vtestc_pd_256:
20992 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20993 bool IsTestPacked = false;
20994 X86::CondCode X86CC;
20996 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20997 case Intrinsic::x86_avx_vtestz_ps:
20998 case Intrinsic::x86_avx_vtestz_pd:
20999 case Intrinsic::x86_avx_vtestz_ps_256:
21000 case Intrinsic::x86_avx_vtestz_pd_256:
21001 IsTestPacked = true;
21003 case Intrinsic::x86_sse41_ptestz:
21004 case Intrinsic::x86_avx_ptestz_256:
21006 X86CC = X86::COND_E;
21008 case Intrinsic::x86_avx_vtestc_ps:
21009 case Intrinsic::x86_avx_vtestc_pd:
21010 case Intrinsic::x86_avx_vtestc_ps_256:
21011 case Intrinsic::x86_avx_vtestc_pd_256:
21012 IsTestPacked = true;
21014 case Intrinsic::x86_sse41_ptestc:
21015 case Intrinsic::x86_avx_ptestc_256:
21017 X86CC = X86::COND_B;
21019 case Intrinsic::x86_avx_vtestnzc_ps:
21020 case Intrinsic::x86_avx_vtestnzc_pd:
21021 case Intrinsic::x86_avx_vtestnzc_ps_256:
21022 case Intrinsic::x86_avx_vtestnzc_pd_256:
21023 IsTestPacked = true;
21025 case Intrinsic::x86_sse41_ptestnzc:
21026 case Intrinsic::x86_avx_ptestnzc_256:
21028 X86CC = X86::COND_A;
21032 SDValue LHS = Op.getOperand(1);
21033 SDValue RHS = Op.getOperand(2);
21034 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
21035 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
21036 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
21037 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21040 case Intrinsic::x86_sse42_pcmpistria128:
21041 case Intrinsic::x86_sse42_pcmpestria128:
21042 case Intrinsic::x86_sse42_pcmpistric128:
21043 case Intrinsic::x86_sse42_pcmpestric128:
21044 case Intrinsic::x86_sse42_pcmpistrio128:
21045 case Intrinsic::x86_sse42_pcmpestrio128:
21046 case Intrinsic::x86_sse42_pcmpistris128:
21047 case Intrinsic::x86_sse42_pcmpestris128:
21048 case Intrinsic::x86_sse42_pcmpistriz128:
21049 case Intrinsic::x86_sse42_pcmpestriz128: {
21051 X86::CondCode X86CC;
21053 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21054 case Intrinsic::x86_sse42_pcmpistria128:
21055 Opcode = X86ISD::PCMPISTR;
21056 X86CC = X86::COND_A;
21058 case Intrinsic::x86_sse42_pcmpestria128:
21059 Opcode = X86ISD::PCMPESTR;
21060 X86CC = X86::COND_A;
21062 case Intrinsic::x86_sse42_pcmpistric128:
21063 Opcode = X86ISD::PCMPISTR;
21064 X86CC = X86::COND_B;
21066 case Intrinsic::x86_sse42_pcmpestric128:
21067 Opcode = X86ISD::PCMPESTR;
21068 X86CC = X86::COND_B;
21070 case Intrinsic::x86_sse42_pcmpistrio128:
21071 Opcode = X86ISD::PCMPISTR;
21072 X86CC = X86::COND_O;
21074 case Intrinsic::x86_sse42_pcmpestrio128:
21075 Opcode = X86ISD::PCMPESTR;
21076 X86CC = X86::COND_O;
21078 case Intrinsic::x86_sse42_pcmpistris128:
21079 Opcode = X86ISD::PCMPISTR;
21080 X86CC = X86::COND_S;
21082 case Intrinsic::x86_sse42_pcmpestris128:
21083 Opcode = X86ISD::PCMPESTR;
21084 X86CC = X86::COND_S;
21086 case Intrinsic::x86_sse42_pcmpistriz128:
21087 Opcode = X86ISD::PCMPISTR;
21088 X86CC = X86::COND_E;
21090 case Intrinsic::x86_sse42_pcmpestriz128:
21091 Opcode = X86ISD::PCMPESTR;
21092 X86CC = X86::COND_E;
21095 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21096 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21097 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
21098 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
21099 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21102 case Intrinsic::x86_sse42_pcmpistri128:
21103 case Intrinsic::x86_sse42_pcmpestri128: {
21105 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
21106 Opcode = X86ISD::PCMPISTR;
21108 Opcode = X86ISD::PCMPESTR;
21110 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21111 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21112 return DAG.getNode(Opcode, dl, VTs, NewOps);
21115 case Intrinsic::x86_sse42_pcmpistrm128:
21116 case Intrinsic::x86_sse42_pcmpestrm128: {
21118 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
21119 Opcode = X86ISD::PCMPISTR;
21121 Opcode = X86ISD::PCMPESTR;
21123 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21124 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21125 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
21128 case Intrinsic::eh_sjlj_lsda: {
21129 MachineFunction &MF = DAG.getMachineFunction();
21130 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21131 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21132 auto &Context = MF.getMMI().getContext();
21133 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21134 Twine(MF.getFunctionNumber()));
21135 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
21136 DAG.getMCSymbol(S, PtrVT));
21139 case Intrinsic::x86_seh_lsda: {
21140 // Compute the symbol for the LSDA. We know it'll get emitted later.
21141 MachineFunction &MF = DAG.getMachineFunction();
21142 SDValue Op1 = Op.getOperand(1);
21143 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21144 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21145 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21147 // Generate a simple absolute symbol reference. This intrinsic is only
21148 // supported on 32-bit Windows, which isn't PIC.
21149 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
21150 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
21153 case Intrinsic::x86_seh_recoverfp: {
21154 SDValue FnOp = Op.getOperand(1);
21155 SDValue IncomingFPOp = Op.getOperand(2);
21156 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
21157 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
21159 report_fatal_error(
21160 "llvm.x86.seh.recoverfp must take a function as the first argument");
21161 return recoverFramePointer(DAG, Fn, IncomingFPOp);
21164 case Intrinsic::localaddress: {
21165 // Returns one of the stack, base, or frame pointer registers, depending on
21166 // which is used to reference local variables.
21167 MachineFunction &MF = DAG.getMachineFunction();
21168 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21170 if (RegInfo->hasBasePointer(MF))
21171 Reg = RegInfo->getBaseRegister();
21172 else // This function handles the SP or FP case.
21173 Reg = RegInfo->getPtrSizedFrameRegister(MF);
21174 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
21179 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21180 SDValue Src, SDValue Mask, SDValue Base,
21181 SDValue Index, SDValue ScaleOp, SDValue Chain,
21182 const X86Subtarget &Subtarget) {
21184 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21185 // Scale must be constant.
21188 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21189 EVT MaskVT = Mask.getValueType();
21190 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21191 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21192 SDValue Segment = DAG.getRegister(0, MVT::i32);
21193 // If source is undef or we know it won't be used, use a zero vector
21194 // to break register dependency.
21195 // TODO: use undef instead and let BreakFalseDeps deal with it?
21196 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21197 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21198 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21199 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21200 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21201 return DAG.getMergeValues(RetOps, dl);
21204 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21205 SDValue Src, SDValue Mask, SDValue Base,
21206 SDValue Index, SDValue ScaleOp, SDValue Chain,
21207 const X86Subtarget &Subtarget) {
21209 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21210 // Scale must be constant.
21213 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21214 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21215 Index.getSimpleValueType().getVectorNumElements());
21217 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21218 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21219 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21220 SDValue Segment = DAG.getRegister(0, MVT::i32);
21221 // If source is undef or we know it won't be used, use a zero vector
21222 // to break register dependency.
21223 // TODO: use undef instead and let BreakFalseDeps deal with it?
21224 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21225 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21226 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21227 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21228 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21229 return DAG.getMergeValues(RetOps, dl);
21232 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21233 SDValue Src, SDValue Mask, SDValue Base,
21234 SDValue Index, SDValue ScaleOp, SDValue Chain,
21235 const X86Subtarget &Subtarget) {
21237 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21238 // Scale must be constant.
21241 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21242 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21243 SDValue Segment = DAG.getRegister(0, MVT::i32);
21244 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21245 Index.getSimpleValueType().getVectorNumElements());
21247 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21248 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21249 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21250 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21251 return SDValue(Res, 1);
21254 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21255 SDValue Mask, SDValue Base, SDValue Index,
21256 SDValue ScaleOp, SDValue Chain,
21257 const X86Subtarget &Subtarget) {
21259 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21260 // Scale must be constant.
21263 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21264 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21265 SDValue Segment = DAG.getRegister(0, MVT::i32);
21267 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21268 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21269 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21270 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
21271 return SDValue(Res, 0);
21274 /// Handles the lowering of builtin intrinsics that return the value
21275 /// of the extended control register.
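/// For example (illustrative): lowering the xgetbv builtin with an index
/// operand of 0 reads XCR0; the EDX:EAX halves copied out of the XGETBV node
/// below are then merged into a single i64 result.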
21276 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
21278 const X86Subtarget &Subtarget,
21279 SmallVectorImpl<SDValue> &Results) {
21280 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21281 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21284 // The ECX register is used to select the index of the XCR register to read.
21287 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
21288 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
21289 Chain = SDValue(N1, 0);
21291 // Reads the content of XCR and returns it in registers EDX:EAX.
21292 if (Subtarget.is64Bit()) {
21293 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
21294 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21297 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
21298 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21301 Chain = HI.getValue(1);
21303 if (Subtarget.is64Bit()) {
21304 // Merge the two 32-bit values into a 64-bit one.
21305 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21306 DAG.getConstant(32, DL, MVT::i8));
21307 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21308 Results.push_back(Chain);
21312 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21313 SDValue Ops[] = { LO, HI };
21314 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21315 Results.push_back(Pair);
21316 Results.push_back(Chain);
21319 /// Handles the lowering of builtin intrinsics that read performance monitor
21320 /// counters (x86_rdpmc).
21321 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21323 const X86Subtarget &Subtarget,
21324 SmallVectorImpl<SDValue> &Results) {
21325 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21326 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21329 // The ECX register is used to select the index of the performance counter to read.
21331 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21333 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21335 // Reads the content of a 64-bit performance counter and returns it in the
21336 // registers EDX:EAX.
21337 if (Subtarget.is64Bit()) {
21338 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21339 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21342 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21343 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21346 Chain = HI.getValue(1);
21348 if (Subtarget.is64Bit()) {
21349 // The EAX register is loaded with the low-order 32 bits. The EDX register
21350 // is loaded with the supported high-order bits of the counter.
21351 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21352 DAG.getConstant(32, DL, MVT::i8));
21353 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21354 Results.push_back(Chain);
21358 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21359 SDValue Ops[] = { LO, HI };
21360 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21361 Results.push_back(Pair);
21362 Results.push_back(Chain);
21365 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21366 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21367 /// READCYCLECOUNTER nodes.
21368 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21370 const X86Subtarget &Subtarget,
21371 SmallVectorImpl<SDValue> &Results) {
21372 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21373 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21376 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21377 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21378 // and the EAX register is loaded with the low-order 32 bits.
21379 if (Subtarget.is64Bit()) {
21380 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21381 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21384 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21385 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21388 SDValue Chain = HI.getValue(1);
21390 if (Opcode == X86ISD::RDTSCP_DAG) {
21391 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21393 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
21394 // the ECX register. Add 'ecx' explicitly to the chain.
21395 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21397 // Explicitly store the content of ECX at the location passed as input
21398 // to the 'rdtscp' intrinsic.
21399 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21400 MachinePointerInfo());
21403 if (Subtarget.is64Bit()) {
21404 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21405 // the EAX register is loaded with the low-order 32 bits.
21406 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21407 DAG.getConstant(32, DL, MVT::i8));
21408 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21409 Results.push_back(Chain);
21413 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21414 SDValue Ops[] = { LO, HI };
21415 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21416 Results.push_back(Pair);
21417 Results.push_back(Chain);
21420 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21421 SelectionDAG &DAG) {
21422 SmallVector<SDValue, 2> Results;
21423 SDLoc DL(Op);
21424 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget, Results);
21426 return DAG.getMergeValues(Results, DL);
21427 }
21429 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21430 MachineFunction &MF = DAG.getMachineFunction();
21431 SDValue Chain = Op.getOperand(0);
21432 SDValue RegNode = Op.getOperand(2);
21433 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21434 if (!EHInfo)
21435 report_fatal_error("EH registrations only live in functions using WinEH");
21437 // Cast the operand to an alloca, and remember the frame index.
21438 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21439 if (!FINode)
21440 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21441 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21443 // Return the chain operand without making any DAG nodes.
21444 return Chain;
21445 }
21447 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21448 MachineFunction &MF = DAG.getMachineFunction();
21449 SDValue Chain = Op.getOperand(0);
21450 SDValue EHGuard = Op.getOperand(2);
21451 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21452 if (!EHInfo)
21453 report_fatal_error("EHGuard only live in functions using WinEH");
21455 // Cast the operand to an alloca, and remember the frame index.
21456 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21457 if (!FINode)
21458 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21459 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21461 // Return the chain operand without making any DAG nodes.
21462 return Chain;
21463 }
21465 /// Emit Truncating Store with signed or unsigned saturation.
21466 static SDValue
21467 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21468 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21469 SelectionDAG &DAG) {
21471 SDVTList VTs = DAG.getVTList(MVT::Other);
21472 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21473 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21474 return SignedSat ?
21475 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21476 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21477 }
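// Clarifying example (added; assumes the usual saturation semantics): for an
// i32 element holding 300 truncated to i8, the signed-saturating store writes
// 127 and the unsigned-saturating store writes 255, whereas a plain
// truncating store would write 44 (300 & 0xFF).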
21479 /// Emit Masked Truncating Store with signed or unsigned saturation.
21480 static SDValue
21481 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21482 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21483 MachineMemOperand *MMO, SelectionDAG &DAG) {
21485 SDVTList VTs = DAG.getVTList(MVT::Other);
21486 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21487 return SignedSat ?
21488 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21489 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21490 }
21492 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21493 SelectionDAG &DAG) {
21494 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21496 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21497 if (!IntrData) {
21498 switch (IntNo) {
21499 case llvm::Intrinsic::x86_seh_ehregnode:
21500 return MarkEHRegistrationNode(Op, DAG);
21501 case llvm::Intrinsic::x86_seh_ehguard:
21502 return MarkEHGuard(Op, DAG);
21503 case llvm::Intrinsic::x86_flags_read_u32:
21504 case llvm::Intrinsic::x86_flags_read_u64:
21505 case llvm::Intrinsic::x86_flags_write_u32:
21506 case llvm::Intrinsic::x86_flags_write_u64: {
21507 // We need a frame pointer because this will get lowered to a PUSH/POP
21509 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21510 MFI.setHasCopyImplyingStackAdjustment(true);
21511 // Don't do anything here, we will expand these intrinsics out later
21512 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21513 return SDValue();
21514 }
21515 case Intrinsic::x86_lwpins32:
21516 case Intrinsic::x86_lwpins64:
21517 case Intrinsic::x86_umwait:
21518 case Intrinsic::x86_tpause: {
21519 SDLoc dl(Op);
21520 SDValue Chain = Op->getOperand(0);
21521 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21522 unsigned Opcode;
21524 switch (IntNo) {
21525 default: llvm_unreachable("Impossible intrinsic");
21526 case Intrinsic::x86_umwait:
21527 Opcode = X86ISD::UMWAIT;
21528 break;
21529 case Intrinsic::x86_tpause:
21530 Opcode = X86ISD::TPAUSE;
21531 break;
21532 case Intrinsic::x86_lwpins32:
21533 case Intrinsic::x86_lwpins64:
21534 Opcode = X86ISD::LWPINS;
21535 break;
21536 }
21538 SDValue Operation =
21539 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
21540 Op->getOperand(3), Op->getOperand(4));
21541 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
21542 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21543 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21544 Operation.getValue(1));
21545 }
21546 }
21547 return SDValue();
21548 }
21550 SDLoc dl(Op);
21551 switch(IntrData->Type) {
21552 default: llvm_unreachable("Unknown Intrinsic Type");
21553 case RDSEED:
21554 case RDRAND: {
21555 // Emit the node with the right value type.
21556 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21557 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21559 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21560 // Otherwise return the value from Rand, which is always 0, casted to i32.
21561 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21562 DAG.getConstant(1, dl, Op->getValueType(1)),
21563 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21564 SDValue(Result.getNode(), 1) };
21565 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21567 // Return { result, isValid, chain }.
21568 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21569 SDValue(Result.getNode(), 2));
21571 case GATHER_AVX2: {
21572 SDValue Chain = Op.getOperand(0);
21573 SDValue Src = Op.getOperand(2);
21574 SDValue Base = Op.getOperand(3);
21575 SDValue Index = Op.getOperand(4);
21576 SDValue Mask = Op.getOperand(5);
21577 SDValue Scale = Op.getOperand(6);
21578 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21579 Scale, Chain, Subtarget);
21580 }
21581 case GATHER: {
21582 // gather(v1, mask, index, base, scale);
21583 SDValue Chain = Op.getOperand(0);
21584 SDValue Src = Op.getOperand(2);
21585 SDValue Base = Op.getOperand(3);
21586 SDValue Index = Op.getOperand(4);
21587 SDValue Mask = Op.getOperand(5);
21588 SDValue Scale = Op.getOperand(6);
21589 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain, Subtarget);
21591 }
21592 case SCATTER: {
21593 // scatter(base, mask, index, v1, scale);
21594 SDValue Chain = Op.getOperand(0);
21595 SDValue Base = Op.getOperand(2);
21596 SDValue Mask = Op.getOperand(3);
21597 SDValue Index = Op.getOperand(4);
21598 SDValue Src = Op.getOperand(5);
21599 SDValue Scale = Op.getOperand(6);
21600 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21601 Scale, Chain, Subtarget);
21602 }
21603 case PREFETCH: {
21604 SDValue Hint = Op.getOperand(6);
21605 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21606 assert((HintVal == 2 || HintVal == 3) &&
21607 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21608 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21609 SDValue Chain = Op.getOperand(0);
21610 SDValue Mask = Op.getOperand(2);
21611 SDValue Index = Op.getOperand(3);
21612 SDValue Base = Op.getOperand(4);
21613 SDValue Scale = Op.getOperand(5);
21614 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, Subtarget);
21616 }
21617 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21618 case RDTSC: {
21619 SmallVector<SDValue, 2> Results;
21620 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, Results);
21622 return DAG.getMergeValues(Results, dl);
21624 // Read Performance Monitoring Counters.
21626 SmallVector<SDValue, 2> Results;
21627 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21628 return DAG.getMergeValues(Results, dl);
21630 // Get Extended Control Register.
21632 SmallVector<SDValue, 2> Results;
21633 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21634 return DAG.getMergeValues(Results, dl);
21636 // XTEST intrinsics.
21638 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21639 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21641 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21642 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21643 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21644 Ret, SDValue(InTrans.getNode(), 1));
21648 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21649 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
21650 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21651 DAG.getConstant(-1, dl, MVT::i8));
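// Note added for clarity (not in the original source): adding -1 (0xFF) to
// the incoming carry byte produces a hardware carry-out exactly when that
// byte is nonzero, so GenCF.getValue(1) re-materializes CF as the carry input
// of the ADC/SBB node built below.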
21652 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21653 Op.getOperand(4), GenCF.getValue(1));
21654 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21655 Op.getOperand(5), MachinePointerInfo());
21656 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21657 SDValue Results[] = { SetCC, Store };
21658 return DAG.getMergeValues(Results, dl);
21660 case TRUNCATE_TO_MEM_VI8:
21661 case TRUNCATE_TO_MEM_VI16:
21662 case TRUNCATE_TO_MEM_VI32: {
21663 SDValue Mask = Op.getOperand(4);
21664 SDValue DataToTruncate = Op.getOperand(3);
21665 SDValue Addr = Op.getOperand(2);
21666 SDValue Chain = Op.getOperand(0);
21668 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21669 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21671 EVT MemVT = MemIntr->getMemoryVT();
21673 uint16_t TruncationOp = IntrData->Opc0;
21674 switch (TruncationOp) {
21675 case X86ISD::VTRUNC: {
21676 if (isAllOnesConstant(Mask)) // return just a truncate store
21677 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21678 MemIntr->getMemOperand());
21680 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21681 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21683 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21684 MemIntr->getMemOperand(), true /* truncating */);
21686 case X86ISD::VTRUNCUS:
21687 case X86ISD::VTRUNCS: {
21688 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21689 if (isAllOnesConstant(Mask))
21690 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21691 MemIntr->getMemOperand(), DAG);
21693 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21694 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21696 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21697 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21700 llvm_unreachable("Unsupported truncstore intrinsic");
21706 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21707 SelectionDAG &DAG) const {
21708 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21709 MFI.setReturnAddressIsTaken(true);
21711 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21712 return SDValue();
21714 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21715 SDLoc dl(Op);
21716 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21718 if (Depth > 0) {
21719 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21720 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21721 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21722 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21723 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21724 MachinePointerInfo());
21727 // Just load the return address.
21728 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21729 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21730 MachinePointerInfo());
21733 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21734 SelectionDAG &DAG) const {
21735 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21736 return getReturnAddressFrameIndex(DAG);
21739 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21740 MachineFunction &MF = DAG.getMachineFunction();
21741 MachineFrameInfo &MFI = MF.getFrameInfo();
21742 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21743 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21744 EVT VT = Op.getValueType();
21746 MFI.setFrameAddressIsTaken(true);
21748 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21749 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21750 // is not possible to crawl up the stack without looking at the unwind codes
21752 int FrameAddrIndex = FuncInfo->getFAIndex();
21753 if (!FrameAddrIndex) {
21754 // Set up a frame object for the return address.
21755 unsigned SlotSize = RegInfo->getSlotSize();
21756 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21757 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21758 FuncInfo->setFAIndex(FrameAddrIndex);
21760 return DAG.getFrameIndex(FrameAddrIndex, VT);
21763 unsigned FrameReg =
21764 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21765 SDLoc dl(Op); // FIXME probably not meaningful
21766 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21767 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21768 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21769 "Invalid Frame Register!");
21770 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21771 while (Depth--)
21772 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21773 MachinePointerInfo());
21774 return FrameAddr;
21775 }
21777 // FIXME? Maybe this could be a TableGen attribute on some registers and
21778 // this table could be generated automatically from RegInfo.
21779 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21780 SelectionDAG &DAG) const {
21781 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21782 const MachineFunction &MF = DAG.getMachineFunction();
21784 unsigned Reg = StringSwitch<unsigned>(RegName)
21785 .Case("esp", X86::ESP)
21786 .Case("rsp", X86::RSP)
21787 .Case("ebp", X86::EBP)
21788 .Case("rbp", X86::RBP)
21791 if (Reg == X86::EBP || Reg == X86::RBP) {
21792 if (!TFI.hasFP(MF))
21793 report_fatal_error("register " + StringRef(RegName) +
21794 " is allocatable: function has no frame pointer");
21797 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21798 unsigned FrameReg =
21799 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21800 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21801 "Invalid Frame Register!");
21809 report_fatal_error("Invalid register name global variable");
21812 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21813 SelectionDAG &DAG) const {
21814 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21815 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21818 unsigned X86TargetLowering::getExceptionPointerRegister(
21819 const Constant *PersonalityFn) const {
21820 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21821 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21823 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21826 unsigned X86TargetLowering::getExceptionSelectorRegister(
21827 const Constant *PersonalityFn) const {
21828 // Funclet personalities don't use selectors (the runtime does the selection).
21829 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21830 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21833 bool X86TargetLowering::needsFixedCatchObjects() const {
21834 return Subtarget.isTargetWin64();
21837 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21838 SDValue Chain = Op.getOperand(0);
21839 SDValue Offset = Op.getOperand(1);
21840 SDValue Handler = Op.getOperand(2);
21843 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21844 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21845 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21846 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21847 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21848 "Invalid Frame Register!");
21849 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21850 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21852 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21853 DAG.getIntPtrConstant(RegInfo->getSlotSize(), dl));
21855 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21856 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21857 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21859 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21860 DAG.getRegister(StoreAddrReg, PtrVT));
21863 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21864 SelectionDAG &DAG) const {
21866 // If the subtarget is not 64bit, we may need the global base reg
21867 // after isel expand pseudo, i.e., after CGBR pass ran.
21868 // Therefore, ask for the GlobalBaseReg now, so that the pass
21869 // inserts the code for us in case we need it.
21870 // Otherwise, we will end up in a situation where we will
21871 // reference a virtual register that is not defined!
21872 if (!Subtarget.is64Bit()) {
21873 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21874 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21876 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21877 DAG.getVTList(MVT::i32, MVT::Other),
21878 Op.getOperand(0), Op.getOperand(1));
21881 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21882 SelectionDAG &DAG) const {
21884 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21885 Op.getOperand(0), Op.getOperand(1));
21888 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21889 SelectionDAG &DAG) const {
21890 SDLoc DL(Op);
21891 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, Op.getOperand(0));
21892 }
21895 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21896 return Op.getOperand(0);
21899 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21900 SelectionDAG &DAG) const {
21901 SDValue Root = Op.getOperand(0);
21902 SDValue Trmp = Op.getOperand(1); // trampoline
21903 SDValue FPtr = Op.getOperand(2); // nested function
21904 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21907 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21908 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21910 if (Subtarget.is64Bit()) {
21911 SDValue OutChains[6];
21913 // Large code-model.
21914 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21915 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21917 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21918 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21920 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
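// Rough layout of the 64-bit trampoline emitted below (added summary, derived
// from the opcode constants above; treat as illustrative, R10/R11 low
// encodings are 2 and 3):
//   +0:  49 BB              movabsq $<nested fn>, %r11  (8-byte FPtr at +2)
//   +10: 49 BA              movabsq $<nest value>, %r10 (8-byte Nest at +12)
//   +20: 49 FF E3           jmpq *%r11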
21922 // Load the pointer to the nested function into R11.
21923 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21924 SDValue Addr = Trmp;
21925 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21926 Addr, MachinePointerInfo(TrmpAddr));
21928 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21929 DAG.getConstant(2, dl, MVT::i64));
21930 OutChains[1] =
21931 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21932 /* Alignment = */ 2);
21934 // Load the 'nest' parameter value into R10.
21935 // R10 is specified in X86CallingConv.td
21936 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21937 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21938 DAG.getConstant(10, dl, MVT::i64));
21939 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21940 Addr, MachinePointerInfo(TrmpAddr, 10));
21942 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21943 DAG.getConstant(12, dl, MVT::i64));
21944 OutChains[3] =
21945 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21946 /* Alignment = */ 2);
21948 // Jump to the nested function.
21949 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21950 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21951 DAG.getConstant(20, dl, MVT::i64));
21952 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21953 Addr, MachinePointerInfo(TrmpAddr, 20));
21955 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21956 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21957 DAG.getConstant(22, dl, MVT::i64));
21958 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21959 Addr, MachinePointerInfo(TrmpAddr, 22));
21961 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21962 } else {
21963 const Function *Func =
21964 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21965 CallingConv::ID CC = Func->getCallingConv();
21967 unsigned NestReg;
21969 switch (CC) {
21970 default: llvm_unreachable("Unsupported calling convention");
21971 case CallingConv::C:
21972 case CallingConv::X86_StdCall: {
21973 // Pass 'nest' parameter in ECX.
21974 // Must be kept in sync with X86CallingConv.td
21975 NestReg = X86::ECX;
21977 // Check that ECX wasn't needed by an 'inreg' parameter.
21978 FunctionType *FTy = Func->getFunctionType();
21979 const AttributeList &Attrs = Func->getAttributes();
21981 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21982 unsigned InRegCount = 0;
21985 for (FunctionType::param_iterator I = FTy->param_begin(),
21986 E = FTy->param_end(); I != E; ++I, ++Idx)
21987 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21988 auto &DL = DAG.getDataLayout();
21989 // FIXME: should only count parameters that are lowered to integers.
21990 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21993 if (InRegCount > 2) {
21994 report_fatal_error("Nest register in use - reduce number of inreg"
22000 case CallingConv::X86_FastCall:
22001 case CallingConv::X86_ThisCall:
22002 case CallingConv::Fast:
22003 // Pass 'nest' parameter in EAX.
22004 // Must be kept in sync with X86CallingConv.td
22005 NestReg = X86::EAX;
22009 SDValue OutChains[4];
22010 SDValue Addr, Disp;
22012 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22013 DAG.getConstant(10, dl, MVT::i32));
22014 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
22016 // This is storing the opcode for MOV32ri.
22017 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
22018 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
22019 OutChains[0] =
22020 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
22021 Trmp, MachinePointerInfo(TrmpAddr));
22023 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22024 DAG.getConstant(1, dl, MVT::i32));
22025 OutChains[1] =
22026 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
22027 /* Alignment = */ 1);
22029 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
22030 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22031 DAG.getConstant(5, dl, MVT::i32));
22032 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
22033 Addr, MachinePointerInfo(TrmpAddr, 5),
22034 /* Alignment = */ 1);
22036 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22037 DAG.getConstant(6, dl, MVT::i32));
22038 OutChains[3] =
22039 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
22040 /* Alignment = */ 1);
22042 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
22046 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
22047 SelectionDAG &DAG) const {
22048 /*
22049 The rounding mode is in bits 11:10 of FPSR, and has the following
22050 settings: 00 Round to nearest, 01 Round to -inf, 10 Round to +inf, 11 Round to 0.
22056 FLT_ROUNDS, on the other hand, expects the following:
22057 -1 Undefined, 0 Round to 0, 1 Round to nearest, 2 Round to +inf, 3 Round to -inf.
22063 To perform the conversion, we do:
22064 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
22065 */
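// Added sanity check of the formula above (illustrative): FPSR bits 11:10 of
// 00 -> ((0|0)+1)&3 = 1 (nearest), 01 -> ((0|2)+1)&3 = 3 (-inf),
// 10 -> ((1|0)+1)&3 = 2 (+inf), 11 -> ((1|2)+1)&3 = 0 (toward zero).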
22067 MachineFunction &MF = DAG.getMachineFunction();
22068 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
22069 unsigned StackAlignment = TFI.getStackAlignment();
22070 MVT VT = Op.getSimpleValueType();
22073 // Save FP Control Word to stack slot
22074 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
22075 SDValue StackSlot =
22076 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
22078 MachineMemOperand *MMO =
22079 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
22080 MachineMemOperand::MOStore, 2, 2);
22082 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
22083 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
22084 DAG.getVTList(MVT::Other),
22085 Ops, MVT::i16, MMO);
22087 // Load FP Control Word from stack slot
22088 SDValue CWD =
22089 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
22091 // Transform as necessary
22092 SDValue CWD1 =
22093 DAG.getNode(ISD::SRL, DL, MVT::i16,
22094 DAG.getNode(ISD::AND, DL, MVT::i16,
22095 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
22096 DAG.getConstant(11, DL, MVT::i8));
22097 SDValue CWD2 =
22098 DAG.getNode(ISD::SRL, DL, MVT::i16,
22099 DAG.getNode(ISD::AND, DL, MVT::i16,
22100 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
22101 DAG.getConstant(9, DL, MVT::i8));
22103 SDValue RetVal =
22104 DAG.getNode(ISD::AND, DL, MVT::i16,
22105 DAG.getNode(ISD::ADD, DL, MVT::i16,
22106 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
22107 DAG.getConstant(1, DL, MVT::i16)),
22108 DAG.getConstant(3, DL, MVT::i16));
22110 return DAG.getNode((VT.getSizeInBits() < 16 ?
22111 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
22114 // Split an unary integer op into 2 half sized ops.
22115 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
22116 MVT VT = Op.getSimpleValueType();
22117 unsigned NumElems = VT.getVectorNumElements();
22118 unsigned SizeInBits = VT.getSizeInBits();
22119 MVT EltVT = VT.getVectorElementType();
22120 SDValue Src = Op.getOperand(0);
22121 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
22122 "Src and Op should have the same element type!");
22124 // Extract the Lo/Hi vectors
22126 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
22127 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
22129 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
22130 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22131 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
22132 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
22135 // Decompose 256-bit ops into smaller 128-bit ops.
22136 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
22137 assert(Op.getSimpleValueType().is256BitVector() &&
22138 Op.getSimpleValueType().isInteger() &&
22139 "Only handle AVX 256-bit vector integer operation");
22140 return LowerVectorIntUnary(Op, DAG);
22143 // Decompose 512-bit ops into smaller 256-bit ops.
22144 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
22145 assert(Op.getSimpleValueType().is512BitVector() &&
22146 Op.getSimpleValueType().isInteger() &&
22147 "Only handle AVX 512-bit vector integer operation");
22148 return LowerVectorIntUnary(Op, DAG);
22151 /// Lower a vector CTLZ using native supported vector CTLZ instruction.
22153 // i8/i16 vector implemented using dword LZCNT vector instruction
22154 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
22155 // split the vector, perform operation on it's Lo a Hi part and
22156 // concatenate the results.
22157 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
22158 const X86Subtarget &Subtarget) {
22159 assert(Op.getOpcode() == ISD::CTLZ);
22161 MVT VT = Op.getSimpleValueType();
22162 MVT EltVT = VT.getVectorElementType();
22163 unsigned NumElems = VT.getVectorNumElements();
22165 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
22166 "Unsupported element type");
22168 // Split vector, it's Lo and Hi parts will be handled in next iteration.
22169 if (NumElems > 16 ||
22170 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
22171 return LowerVectorIntUnary(Op, DAG);
22173 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
22174 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
22175 "Unsupported value type for operation");
22177 // Use native supported vector instruction vplzcntd.
22178 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
22179 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
22180 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
22181 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
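// Worked example (added): for an i8 element 0x10, lzcnt on the i32
// zero-extension returns 27; subtracting Delta = 32 - 8 = 24 recovers the
// i8 leading-zero count of 3.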
22183 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
22186 // Lower CTLZ using a PSHUFB lookup table implementation.
22187 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
22188 const X86Subtarget &Subtarget,
22189 SelectionDAG &DAG) {
22190 MVT VT = Op.getSimpleValueType();
22191 int NumElts = VT.getVectorNumElements();
22192 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
22193 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
22195 // Per-nibble leading zero PSHUFB lookup table.
22196 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
22197 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
22198 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
22199 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
22201 SmallVector<SDValue, 64> LUTVec;
22202 for (int i = 0; i < NumBytes; ++i)
22203 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22204 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
22206 // Begin by bitcasting the input to byte vector, then split those bytes
22207 // into lo/hi nibbles and use the PSHUFB LUT to perform CLTZ on each of them.
22208 // If the hi input nibble is zero then we add both results together, otherwise
22209 // we just take the hi result (by masking the lo result to zero before the
22211 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
22212 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
22214 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
22215 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
22216 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
22217 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
22218 SDValue HiZ;
22219 if (CurrVT.is512BitVector()) {
22220 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22221 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
22222 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22223 } else {
22224 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
22225 }
22227 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
22228 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
22229 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
22230 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
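// Worked example (added): for the byte 0x1C the high nibble is 0x1 (LUT -> 3)
// and the low nibble is 0xC (LUT -> 0). Since the high nibble is nonzero, HiZ
// masks the low result to 0 and the sum yields ctlz(0x1C) = 3.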
22232 // Merge result back from vXi8 back to VT, working on the lo/hi halves
22233 // of the current vector width in the same way we did for the nibbles.
22234 // If the upper half of the input element is zero then add the halves'
22235 // leading zero counts together, otherwise just use the upper half's.
22236 // Double the width of the result until we are at target width.
22237 while (CurrVT != VT) {
22238 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
22239 int CurrNumElts = CurrVT.getVectorNumElements();
22240 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
22241 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
22242 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
22244 // Check if the upper half of the input element is zero.
22245 if (CurrVT.is512BitVector()) {
22246 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22247 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
22248 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22249 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22250 } else {
22251 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
22252 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22253 }
22254 HiZ = DAG.getBitcast(NextVT, HiZ);
22256 // Move the upper/lower halves to the lower bits as we'll be extending to
22257 // NextVT. Mask the lower result to zero if HiZ is true and add the results
22259 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
22260 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
22261 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
22262 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
22263 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
22264 CurrVT = NextVT;
22265 }
22267 return Res;
22268 }
22270 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
22271 const X86Subtarget &Subtarget,
22272 SelectionDAG &DAG) {
22273 MVT VT = Op.getSimpleValueType();
22275 if (Subtarget.hasCDI() &&
22276 // vXi8 vectors need to be promoted to 512-bits for vXi32.
22277 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
22278 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
22280 // Decompose 256-bit ops into smaller 128-bit ops.
22281 if (VT.is256BitVector() && !Subtarget.hasInt256())
22282 return Lower256IntUnary(Op, DAG);
22284 // Decompose 512-bit ops into smaller 256-bit ops.
22285 if (VT.is512BitVector() && !Subtarget.hasBWI())
22286 return Lower512IntUnary(Op, DAG);
22288 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
22289 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
22292 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
22293 SelectionDAG &DAG) {
22294 MVT VT = Op.getSimpleValueType();
22295 MVT OpVT = VT;
22296 unsigned NumBits = VT.getSizeInBits();
22297 SDLoc dl(Op);
22298 unsigned Opc = Op.getOpcode();
22300 if (VT.isVector())
22301 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22303 Op = Op.getOperand(0);
22304 if (VT == MVT::i8) {
22305 // Zero extend to i32 since there is not an i8 bsr.
22306 OpVT = MVT::i32;
22307 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22308 }
22310 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22311 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22312 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22314 if (Opc == ISD::CTLZ) {
22315 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22316 SDValue Ops[] = {
22317 Op,
22318 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22319 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22320 Op.getValue(1)
22321 };
22322 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22323 }
22325 // Finally xor with NumBits-1.
22326 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22327 DAG.getConstant(NumBits - 1, dl, OpVT));
22329 if (VT == MVT::i8)
22330 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
22331 return Op;
22332 }
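// Note added for clarity: BSR returns the index of the highest set bit, so
// for a power-of-two bit width ctlz(x) == (NumBits - 1) - BSR(x), which
// equals BSR(x) ^ (NumBits - 1) -- e.g. a 32-bit value whose top set bit is
// bit 28 gives BSR = 28 and 28 ^ 31 = 3 leading zeros.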
22334 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22335 MVT VT = Op.getSimpleValueType();
22336 unsigned NumBits = VT.getScalarSizeInBits();
22339 if (VT.isVector()) {
22340 SDValue N0 = Op.getOperand(0);
22341 SDValue Zero = DAG.getConstant(0, dl, VT);
22343 // lsb(x) = (x & -x)
22344 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22345 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22347 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22348 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22349 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22350 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22351 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22354 // cttz(x) = ctpop(lsb - 1)
22355 SDValue One = DAG.getConstant(1, dl, VT);
22356 return DAG.getNode(ISD::CTPOP, dl, VT,
22357 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
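// Worked example (added): for x = 0b101000, x & -x isolates the lowest set
// bit (0b1000); ctpop(0b1000 - 1) = ctpop(0b111) = 3 = cttz(x). The
// CTTZ_ZERO_UNDEF form above instead computes (width - 1) - ctlz(0b1000),
// which gives the same 3 for 32-bit elements.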
22360 assert(Op.getOpcode() == ISD::CTTZ &&
22361 "Only scalar CTTZ requires custom lowering");
22363 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22364 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22365 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22367 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22368 SDValue Ops[] = {
22369 Op,
22370 DAG.getConstant(NumBits, dl, VT),
22371 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22372 Op.getValue(1)
22373 };
22374 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
22375 }
22377 /// Break a 256-bit integer operation into two new 128-bit ones and then
22378 /// concatenate the result back.
22379 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
22380 MVT VT = Op.getSimpleValueType();
22382 assert(VT.is256BitVector() && VT.isInteger() &&
22383 "Unsupported value type for operation");
22385 unsigned NumElems = VT.getVectorNumElements();
22388 // Extract the LHS vectors
22389 SDValue LHS = Op.getOperand(0);
22390 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
22391 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
22393 // Extract the RHS vectors
22394 SDValue RHS = Op.getOperand(1);
22395 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
22396 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
22398 MVT EltVT = VT.getVectorElementType();
22399 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22401 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22402 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22403 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22406 /// Break a 512-bit integer operation into two new 256-bit ones and then
22407 /// concatenate the result back.
22408 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
22409 MVT VT = Op.getSimpleValueType();
22411 assert(VT.is512BitVector() && VT.isInteger() &&
22412 "Unsupported value type for operation");
22414 unsigned NumElems = VT.getVectorNumElements();
22417 // Extract the LHS vectors
22418 SDValue LHS = Op.getOperand(0);
22419 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22420 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22422 // Extract the RHS vectors
22423 SDValue RHS = Op.getOperand(1);
22424 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22425 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22427 MVT EltVT = VT.getVectorElementType();
22428 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22430 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22431 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22432 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22435 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22436 MVT VT = Op.getSimpleValueType();
22437 if (VT.getScalarType() == MVT::i1)
22438 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22439 Op.getOperand(0), Op.getOperand(1));
22440 assert(Op.getSimpleValueType().is256BitVector() &&
22441 Op.getSimpleValueType().isInteger() &&
22442 "Only handle AVX 256-bit vector integer operation");
22443 return Lower256IntArith(Op, DAG);
22446 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22447 MVT VT = Op.getSimpleValueType();
22448 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22449 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22450 // 8-bit integer abs to NEG and CMOV.
22452 SDValue N0 = Op.getOperand(0);
22453 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22454 DAG.getConstant(0, DL, VT), N0);
22455 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22456 SDValue(Neg.getNode(), 1)};
22457 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22460 assert(Op.getSimpleValueType().is256BitVector() &&
22461 Op.getSimpleValueType().isInteger() &&
22462 "Only handle AVX 256-bit vector integer operation");
22463 return Lower256IntUnary(Op, DAG);
22466 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22467 MVT VT = Op.getSimpleValueType();
22469 // For AVX1 cases, split to use legal ops (everything but v4i64).
22470 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
22471 return Lower256IntArith(Op, DAG);
22474 unsigned Opcode = Op.getOpcode();
22475 SDValue N0 = Op.getOperand(0);
22476 SDValue N1 = Op.getOperand(1);
22478 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
22479 // using the SMIN/SMAX instructions and flipping the signbit back.
22480 if (VT == MVT::v8i16) {
22481 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
22482 "Unexpected MIN/MAX opcode");
22483 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
22484 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
22485 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
22486 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
22487 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
22488 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
22491 // Else, expand to a compare/select.
22492 ISD::CondCode CC;
22493 switch (Opcode) {
22494 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
22495 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
22496 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
22497 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
22498 default: llvm_unreachable("Unknown MINMAX opcode");
22499 }
22501 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
22502 return DAG.getSelect(DL, VT, Cond, N0, N1);
22505 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22506 SelectionDAG &DAG) {
22508 MVT VT = Op.getSimpleValueType();
22510 if (VT.getScalarType() == MVT::i1)
22511 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22513 // Decompose 256-bit ops into smaller 128-bit ops.
22514 if (VT.is256BitVector() && !Subtarget.hasInt256())
22515 return Lower256IntArith(Op, DAG);
22517 SDValue A = Op.getOperand(0);
22518 SDValue B = Op.getOperand(1);
22520 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22521 // vector pairs, multiply and truncate.
22522 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22523 if (Subtarget.hasInt256()) {
22524 // For 512-bit vectors, split into 256-bit vectors to allow the
22525 // sign-extension to occur.
22526 if (VT == MVT::v64i8)
22527 return Lower512IntArith(Op, DAG);
22529 // For 256-bit vectors, split into 128-bit vectors to allow the
22530 // sign-extension to occur. We don't need this on AVX512BW as we can
22531 // safely sign-extend to v32i16.
22532 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22533 return Lower256IntArith(Op, DAG);
22535 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22536 return DAG.getNode(
22537 ISD::TRUNCATE, dl, VT,
22538 DAG.getNode(ISD::MUL, dl, ExVT,
22539 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22540 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22543 assert(VT == MVT::v16i8 &&
22544 "Pre-AVX2 support only supports v16i8 multiplication");
22545 MVT ExVT = MVT::v8i16;
22547 // Extract the lo parts and sign extend to i16
22548 // We're going to mask off the low byte of each result element of the
22549 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
22551 const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
22552 4, -1, 5, -1, 6, -1, 7, -1};
22553 SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
22554 SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
22555 ALo = DAG.getBitcast(ExVT, ALo);
22556 BLo = DAG.getBitcast(ExVT, BLo);
22558 // Extract the hi parts and sign extend to i16
22559 // We're going to mask off the low byte of each result element of the
22560 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
22562 const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
22563 12, -1, 13, -1, 14, -1, 15, -1};
22564 SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
22565 SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
22566 AHi = DAG.getBitcast(ExVT, AHi);
22567 BHi = DAG.getBitcast(ExVT, BHi);
22569 // Multiply, mask the lower 8bits of the lo/hi results and pack
22570 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22571 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22572 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22573 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22574 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22577 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22578 if (VT == MVT::v4i32) {
22579 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22580 "Should not custom lower when pmulld is available!");
22582 // Extract the odd parts.
22583 static const int UnpackMask[] = { 1, -1, 3, -1 };
22584 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22585 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22587 // Multiply the even parts.
22588 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22589 DAG.getBitcast(MVT::v2i64, A),
22590 DAG.getBitcast(MVT::v2i64, B));
22591 // Now multiply odd parts.
22592 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22593 DAG.getBitcast(MVT::v2i64, Aodds),
22594 DAG.getBitcast(MVT::v2i64, Bodds));
22596 Evens = DAG.getBitcast(VT, Evens);
22597 Odds = DAG.getBitcast(VT, Odds);
22599 // Merge the two vectors back together with a shuffle. This expands into 2
22601 static const int ShufMask[] = { 0, 4, 2, 6 };
22602 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22605 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22606 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22607 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
22609 // Ahi = psrlqi(a, 32);
22610 // Bhi = psrlqi(b, 32);
22612 // AloBlo = pmuludq(a, b);
22613 // AloBhi = pmuludq(a, Bhi);
22614 // AhiBlo = pmuludq(Ahi, b);
22616 // Hi = psllqi(AloBhi + AhiBlo, 32);
22617 // return AloBlo + Hi;
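// Added derivation (illustrative): writing a = Ahi*2^32 + Alo and
// b = Bhi*2^32 + Blo, the product mod 2^64 is
//   Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
// since the Ahi*Bhi term falls entirely outside the low 64 bits; each partial
// product here is a 32x32->64 PMULUDQ.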
22618 KnownBits AKnown, BKnown;
22619 DAG.computeKnownBits(A, AKnown);
22620 DAG.computeKnownBits(B, BKnown);
22622 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22623 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
22624 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
22626 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22627 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
22628 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
22630 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22632 // Only multiply lo/hi halves that aren't known to be zero.
22633 SDValue AloBlo = Zero;
22634 if (!ALoIsZero && !BLoIsZero)
22635 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
22637 SDValue AloBhi = Zero;
22638 if (!ALoIsZero && !BHiIsZero) {
22639 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22640 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
22643 SDValue AhiBlo = Zero;
22644 if (!AHiIsZero && !BLoIsZero) {
22645 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22646 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
22649 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22650 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22652 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22655 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22656 SelectionDAG &DAG) {
22658 MVT VT = Op.getSimpleValueType();
22660 // Decompose 256-bit ops into smaller 128-bit ops.
22661 if (VT.is256BitVector() && !Subtarget.hasInt256())
22662 return Lower256IntArith(Op, DAG);
22664 // Only i8 vectors should need custom lowering after this.
22665 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22666 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22667 "Unsupported vector type");
22669 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22670 // logical shift down the upper half and pack back to i8.
22671 SDValue A = Op.getOperand(0);
22672 SDValue B = Op.getOperand(1);
22674 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22675 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22676 unsigned Opcode = Op.getOpcode();
22677 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22678 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22680 // For 512-bit vectors, split into 256-bit vectors to allow the
22681 // sign-extension to occur.
22682 if (VT == MVT::v64i8)
22683 return Lower512IntArith(Op, DAG);
22685 // AVX2 implementations - extend xmm subvectors to ymm.
22686 if (Subtarget.hasInt256()) {
22687 unsigned NumElems = VT.getVectorNumElements();
22688 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22689 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22691 if (VT == MVT::v32i8) {
22692 if (Subtarget.canExtendTo512BW()) {
22693 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22694 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22695 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22696 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22697 DAG.getConstant(8, dl, MVT::v32i16));
22698 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22700 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22701 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22702 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22703 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22704 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22705 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22706 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22707 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22708 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22709 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22710 DAG.getConstant(8, dl, MVT::v16i16));
22711 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22712 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22713 DAG.getConstant(8, dl, MVT::v16i16));
22714 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22715 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22716 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22717 16, 17, 18, 19, 20, 21, 22, 23};
22718 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22719 24, 25, 26, 27, 28, 29, 30, 31};
22720 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22721 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22722 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22725 assert(VT == MVT::v16i8 && "Unexpected VT");
22727 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22728 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22729 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22730 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22731 DAG.getConstant(8, dl, MVT::v16i16));
22732 // If we have BWI we can use truncate instruction.
22733 if (Subtarget.hasBWI())
22734 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22735 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22736 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22737 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22740 assert(VT == MVT::v16i8 &&
22741 "Pre-AVX2 support only supports v16i8 multiplication");
22742 MVT ExVT = MVT::v8i16;
22743 unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
22744 : ISD::SIGN_EXTEND_VECTOR_INREG;
22746 // Extract the lo parts and zero/sign extend to i16.
22748 if (Subtarget.hasSSE41()) {
22749 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
22750 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
22752 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22753 -1, 4, -1, 5, -1, 6, -1, 7};
22754 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22755 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22756 ALo = DAG.getBitcast(ExVT, ALo);
22757 BLo = DAG.getBitcast(ExVT, BLo);
22758 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22759 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22762 // Extract the hi parts and zero/sign extend to i16.
22764 if (Subtarget.hasSSE41()) {
22765 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22766 -1, -1, -1, -1, -1, -1, -1, -1};
22767 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22768 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22769 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
22770 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
22772 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22773 -1, 12, -1, 13, -1, 14, -1, 15};
22774 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22775 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22776 AHi = DAG.getBitcast(ExVT, AHi);
22777 BHi = DAG.getBitcast(ExVT, BHi);
22778 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22779 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22782 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
22783 // pack back to v16i8.
22784 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22785 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22786 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22787 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22788 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22791 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22792 assert(Subtarget.isTargetWin64() && "Unexpected target");
22793 EVT VT = Op.getValueType();
22794 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22795 "Unexpected return type for lowering");
22799 switch (Op->getOpcode()) {
22800 default: llvm_unreachable("Unexpected request for libcall!");
22801 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22802 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22803 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22804 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22805 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22806 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22810 SDValue InChain = DAG.getEntryNode();
22812 TargetLowering::ArgListTy Args;
22813 TargetLowering::ArgListEntry Entry;
22814 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22815 EVT ArgVT = Op->getOperand(i).getValueType();
22816 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22817 "Unexpected argument type for lowering");
22818 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22819 Entry.Node = StackPtr;
22820 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22821 MachinePointerInfo(), /* Alignment = */ 16);
22822 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22823 Entry.Ty = PointerType::get(ArgTy,0);
22824 Entry.IsSExt = false;
22825 Entry.IsZExt = false;
22826 Args.push_back(Entry);
22829 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22830 getPointerTy(DAG.getDataLayout()));
22832 TargetLowering::CallLoweringInfo CLI(DAG);
22833 CLI.setDebugLoc(dl)
22836 getLibcallCallingConv(LC),
22837 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22840 .setSExtResult(isSigned)
22841 .setZExtResult(!isSigned);
22843 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22844 return DAG.getBitcast(VT, CallInfo.first);
22847 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22848 SelectionDAG &DAG) {
22849 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22850 MVT VT = Op0.getSimpleValueType();
22853 // Decompose 256-bit ops into smaller 128-bit ops.
22854 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22855 unsigned Opcode = Op.getOpcode();
22856 unsigned NumElems = VT.getVectorNumElements();
22857 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22858 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22859 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22860 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22861 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22862 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22863 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22864 SDValue Ops[] = {
22865 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22866 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22867 };
22868 return DAG.getMergeValues(Ops, dl);
22869 }
22871 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22872 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22873 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22875 int NumElts = VT.getVectorNumElements();
22877 // PMULxD operations multiply each even value (starting at 0) of LHS with
22878 // the related value of RHS and produce a widen result.
22879 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22880 // => <2 x i64> <ae|cg>
22882 // In other word, to have all the results, we need to perform two PMULxD:
22883 // 1. one with the even values.
22884 // 2. one with the odd values.
22885 // To achieve #2, with need to place the odd values at an even position.
22887 // Place the odd value at an even position (basically, shift all values 1
22888 // step to the left):
22889 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22890 // <a|b|c|d> => <b|undef|d|undef>
22891 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22892 makeArrayRef(&Mask[0], NumElts));
22893 // <e|f|g|h> => <f|undef|h|undef>
22894 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22895 makeArrayRef(&Mask[0], NumElts));
22897 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22899 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22900 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22901 unsigned Opcode =
22902 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22903 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22904 // => <2 x i64> <ae|cg>
22905 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22906 DAG.getBitcast(MulVT, Op0),
22907 DAG.getBitcast(MulVT, Op1)));
22908 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22909 // => <2 x i64> <bf|dh>
22910 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22911 DAG.getBitcast(MulVT, Odd0),
22912 DAG.getBitcast(MulVT, Odd1)));
22914 // Shuffle it back into the right order.
22915 SmallVector<int, 16> HighMask(NumElts);
22916 SmallVector<int, 16> LowMask(NumElts);
22917 for (int i = 0; i != NumElts; ++i) {
22918 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22919 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22922 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22923 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
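// Illustrative trace of the shuffles above for the v4i32 case, multiplying
// <a|b|c|d> by <e|f|g|h> (lane 0 first):
//   Mul1 = bitcast(<2 x i64> <a*e|c*g>) = <lo(ae)|hi(ae)|lo(cg)|hi(cg)>
//   Mul2 = bitcast(<2 x i64> <b*f|d*h>) = <lo(bf)|hi(bf)|lo(dh)|hi(dh)>
//   Lows  (mask <0,4,2,6>) = <lo(ae)|lo(bf)|lo(cg)|lo(dh)>
//   Highs (mask <1,5,3,7>) = <hi(ae)|hi(bf)|hi(cg)|hi(dh)>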
22925 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
22926 // unsigned multiply.
22927 if (IsSigned && !Subtarget.hasSSE41()) {
22928 SDValue ShAmt = DAG.getConstant(
22929 31, dl,
22930 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22931 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22932 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22933 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22934 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22936 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22937 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22940 // The first result of MUL_LOHI is actually the low value, followed by the
22941 // high one.
22942 SDValue Ops[] = {Lows, Highs};
22943 return DAG.getMergeValues(Ops, dl);
22946 // Return true if the required (according to Opcode) shift-imm form is natively
22947 // supported by the Subtarget
22948 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22950 if (VT.getScalarSizeInBits() < 16)
22953 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22954 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22957 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22958 (VT.is256BitVector() && Subtarget.hasInt256());
22960 bool AShift = LShift && (Subtarget.hasAVX512() ||
22961 (VT != MVT::v2i64 && VT != MVT::v4i64));
22962 return (Opcode == ISD::SRA) ? AShift : LShift;
22965 // The shift amount is a variable, but it is the same for all vector lanes.
22966 // These instructions are defined together with shift-immediate.
22967 static
22968 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22969 unsigned Opcode) {
22970 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22973 // Return true if the required (according to Opcode) variable-shift form is
22974 // natively supported by the Subtarget
22975 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22978 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22981 // vXi16 supported only on AVX-512, BWI
22982 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22985 if (Subtarget.hasAVX512())
22988 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22989 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22990 return (Opcode == ISD::SRA) ? AShift : LShift;
22993 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22994 const X86Subtarget &Subtarget) {
22995 MVT VT = Op.getSimpleValueType();
22997 SDValue R = Op.getOperand(0);
22998 SDValue Amt = Op.getOperand(1);
23000 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
23001 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23003 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
23004 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
23005 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
23006 SDValue Ex = DAG.getBitcast(ExVT, R);
23008 // ashr(R, 63) === cmp_slt(R, 0)
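// (Arithmetic-shifting a 64-bit lane right by 63 leaves 0 for non-negative
// values and all-ones for negative ones, which is exactly the 0/-1 mask that
// pcmpgt(0, R) produces, so one compare can stand in for the shift.)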
23009 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
23010 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
23011 "Unsupported PCMPGT op");
23012 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
23013 getZeroVector(VT, Subtarget, DAG, dl), R);
23016 if (ShiftAmt >= 32) {
23017 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
23019 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
23020 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23021 ShiftAmt - 32, DAG);
23022 if (VT == MVT::v2i64)
23023 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
23024 if (VT == MVT::v4i64)
23025 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23026 {9, 1, 11, 3, 13, 5, 15, 7});
23028 // SRA upper i32, SHL whole i64 and select lower i32.
23029 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23032 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
23033 Lower = DAG.getBitcast(ExVT, Lower);
23034 if (VT == MVT::v2i64)
23035 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
23036 if (VT == MVT::v4i64)
23037 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23038 {8, 1, 10, 3, 12, 5, 14, 7});
23040 return DAG.getBitcast(VT, Ex);
23043 // Optimize shl/srl/sra with constant shift amount.
23044 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23045 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
23046 uint64_t ShiftAmt = ShiftConst->getZExtValue();
23048 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23049 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23051 // i64 SRA needs to be performed as partial shifts.
23052 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
23053 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
23054 Op.getOpcode() == ISD::SRA)
23055 return ArithmeticShiftRight64(ShiftAmt);
23057 if (VT == MVT::v16i8 ||
23058 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
23059 VT == MVT::v64i8) {
23060 unsigned NumElts = VT.getVectorNumElements();
23061 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
23063 // Simple i8 add case
23064 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
23065 return DAG.getNode(ISD::ADD, dl, VT, R, R);
23067 // ashr(R, 7) === cmp_slt(R, 0)
23068 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
23069 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
23070 if (VT.is512BitVector()) {
23071 assert(VT == MVT::v64i8 && "Unexpected element type!");
23072 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
23074 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
23076 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
23079 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
23080 if (VT == MVT::v16i8 && Subtarget.hasXOP())
23083 if (Op.getOpcode() == ISD::SHL) {
23084 // Make a large shift.
23085 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
23087 SHL = DAG.getBitcast(VT, SHL);
23088 // Zero out the rightmost bits.
23089 return DAG.getNode(ISD::AND, dl, VT, SHL,
23090 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
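// Illustrative example: for ShiftAmt == 3 the v8i16 shift moves the top bits
// of each low byte into its high-byte neighbour; the mask
// uint8_t(-1U << 3) == 0xF8 then clears the low three bits of every byte,
// removing any bits that were shifted in from the neighbouring byte.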
23092 if (Op.getOpcode() == ISD::SRL) {
23093 // Make a large shift.
23094 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
23096 SRL = DAG.getBitcast(VT, SRL);
23097 // Zero out the leftmost bits.
23098 return DAG.getNode(ISD::AND, dl, VT, SRL,
23099 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
23101 if (Op.getOpcode() == ISD::SRA) {
23102 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
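// Worked example for a single byte with ShiftAmt == 3: ashr(0xFF /*-1*/, 3)
// should stay -1. lshr gives 0x1F, Mask = 128 >> 3 = 0x10, xor gives 0x0F,
// and 0x0F - 0x10 == 0xFF, i.e. the discarded sign bits are restored.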
23103 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23105 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
23106 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
23107 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
23110 llvm_unreachable("Unknown shift opcode.");
23115 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23116 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
23117 if (!Subtarget.hasXOP() &&
23118 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
23119 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
23121 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
23122 unsigned SubVectorScale = 1;
23123 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23125 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
23126 Amt = Amt.getOperand(0);
23129 // Peek through any splat that was introduced for i64 shift vectorization.
23130 int SplatIndex = -1;
23131 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
23132 if (SVN->isSplat()) {
23133 SplatIndex = SVN->getSplatIndex();
23134 Amt = Amt.getOperand(0);
23135 assert(SplatIndex < (int)VT.getVectorNumElements() &&
23136 "Splat shuffle referencing second operand");
23139 if (Amt.getOpcode() != ISD::BITCAST ||
23140 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
23143 Amt = Amt.getOperand(0);
23144 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23145 (SubVectorScale * VT.getVectorNumElements());
23146 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
23147 uint64_t ShiftAmt = 0;
23148 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
23149 for (unsigned i = 0; i != Ratio; ++i) {
23150 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
23154 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
23157 // Check remaining shift amounts (if not a splat).
23158 if (SplatIndex < 0) {
23159 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23160 uint64_t ShAmt = 0;
23161 for (unsigned j = 0; j != Ratio; ++j) {
23162 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
23166 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
23168 if (ShAmt != ShiftAmt)
23173 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23174 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23176 if (Op.getOpcode() == ISD::SRA)
23177 return ArithmeticShiftRight64(ShiftAmt);
23183 // Determine if V is a splat value, and return the scalar.
23184 static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
23185 SelectionDAG &DAG, const X86Subtarget &Subtarget,
23187 V = peekThroughEXTRACT_SUBVECTORs(V);
23189 // Check if this is a splat build_vector node.
23190 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
23191 SDValue SplatAmt = BV->getSplatValue();
23192 if (SplatAmt && SplatAmt.isUndef())
23197 // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
23198 if (V.getOpcode() == ISD::SUB &&
23199 !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
23200 SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
23201 SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
23203 // Ensure that the corresponding splat BV element is not UNDEF.
23204 BitVector UndefElts;
23205 BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
23206 ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
23207 if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
23208 unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
23209 if (!UndefElts[SplatIdx])
23210 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23211 VT.getVectorElementType(), V,
23212 DAG.getIntPtrConstant(SplatIdx, dl));
23216 // Check if this is a shuffle node doing a splat.
23217 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
23218 if (!SVN || !SVN->isSplat())
23221 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
23222 SDValue InVec = V.getOperand(0);
23223 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
23224 assert((SplatIdx < VT.getVectorNumElements()) &&
23225 "Unexpected shuffle index found!");
23226 return InVec.getOperand(SplatIdx);
23227 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
23228 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
23229 if (C->getZExtValue() == SplatIdx)
23230 return InVec.getOperand(1);
23233 // Avoid introducing an extract element from a shuffle.
23234 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23235 VT.getVectorElementType(), InVec,
23236 DAG.getIntPtrConstant(SplatIdx, dl));
23239 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
23240 const X86Subtarget &Subtarget) {
23241 MVT VT = Op.getSimpleValueType();
23243 SDValue R = Op.getOperand(0);
23244 SDValue Amt = Op.getOperand(1);
23245 unsigned Opcode = Op.getOpcode();
23247 unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
23248 (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23250 unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
23251 (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
23253 Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
23255 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
23256 if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
23257 MVT EltVT = VT.getVectorElementType();
23258 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
23259 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
23260 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
23261 else if (EltVT.bitsLT(MVT::i32))
23262 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
23264 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
23268 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23269 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
23270 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
23271 Amt = Amt.getOperand(0);
23272 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23273 VT.getVectorNumElements();
23274 std::vector<SDValue> Vals(Ratio);
23275 for (unsigned i = 0; i != Ratio; ++i)
23276 Vals[i] = Amt.getOperand(i);
23277 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23278 for (unsigned j = 0; j != Ratio; ++j)
23279 if (Vals[j] != Amt.getOperand(i + j))
23283 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
23284 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
23289 // Convert a shift/rotate left amount to a multiplication scale factor.
23290 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
23291 const X86Subtarget &Subtarget,
23292 SelectionDAG &DAG) {
23293 MVT VT = Amt.getSimpleValueType();
23294 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
23295 (Subtarget.hasInt256() && VT == MVT::v16i16)))
23298 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
23299 SmallVector<SDValue, 8> Elts;
23300 MVT SVT = VT.getVectorElementType();
23301 unsigned SVTBits = SVT.getSizeInBits();
23302 APInt One(SVTBits, 1);
23303 unsigned NumElems = VT.getVectorNumElements();
23305 for (unsigned i = 0; i != NumElems; ++i) {
23306 SDValue Op = Amt->getOperand(i);
23307 if (Op->isUndef()) {
23308 Elts.push_back(Op);
23312 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23313 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23314 uint64_t ShAmt = C.getZExtValue();
23315 if (ShAmt >= SVTBits) {
23316 Elts.push_back(DAG.getUNDEF(SVT));
23319 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23321 return DAG.getBuildVector(VT, dl, Elts);
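// E.g., a constant v4i32 amount <1|2|3|4> becomes the scale vector <2|4|8|16>,
// so the caller can lower the shl as a single vector multiply.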
23324 // If the target doesn't support variable shifts, use either FP conversion
23325 // or integer multiplication to avoid shifting each element individually.
23326 if (VT == MVT::v4i32) {
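// This builds 2^Amt in floating point: Amt << 23 lands in the exponent field
// and adding 0x3f800000 (1.0f) biases it; e.g. Amt == 5 gives 0x42000000,
// which is 32.0f, and FP_TO_SINT turns it back into the integer scale 32.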
23327 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23328 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
23329 DAG.getConstant(0x3f800000U, dl, VT));
23330 Amt = DAG.getBitcast(MVT::v4f32, Amt);
23331 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
23334 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
23335 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
23336 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23337 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
23338 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
23339 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
23340 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
23341 if (Subtarget.hasSSE41())
23342 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23344 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
23345 DAG.getBitcast(VT, Hi),
23346 {0, 2, 4, 6, 8, 10, 12, 14});
23352 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
23353 SelectionDAG &DAG) {
23354 MVT VT = Op.getSimpleValueType();
23356 SDValue R = Op.getOperand(0);
23357 SDValue Amt = Op.getOperand(1);
23358 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23360 assert(VT.isVector() && "Custom lowering only for vector shifts!");
23361 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
23363 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
23366 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
23369 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
23372 // XOP has 128-bit variable logical/arithmetic shifts.
23373 // +ve/-ve Amt = shift left/right.
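// E.g. a uniform SRL by 3 is emitted as VPSHL with a splatted amount of -3,
// and an SRA by 3 as VPSHA with -3 (the XOP shifts take signed per-lane
// counts).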
23374 if (Subtarget.hasXOP() &&
23375 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
23376 VT == MVT::v8i16 || VT == MVT::v16i8)) {
23377 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
23378 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
23379 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
23381 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
23382 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
23383 if (Op.getOpcode() == ISD::SRA)
23384 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
23387 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
23388 // shifts per-lane and then shuffle the partial results back together.
23389 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23390 // Splat the shift amounts so the scalar shifts above will catch it.
23391 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23392 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23393 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23394 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23395 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23398 // i64 vector arithmetic shift can be emulated with the transform:
23399 // M = lshr(SIGN_MASK, Amt)
23400 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
23401 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23402 Op.getOpcode() == ISD::SRA) {
23403 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23404 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23405 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23406 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23407 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23411 // If possible, lower this packed shift into a vector multiply instead of
23412 // expanding it into a sequence of scalar shifts.
23413 if (Op.getOpcode() == ISD::SHL)
23414 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
23415 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
23417 // If possible, lower this shift as a sequence of two shifts by
23418 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
23420 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23422 // Could be rewritten as:
23423 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23425 // The advantage is that the two shifts from the example would be
23426 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
23427 // the vector shift into four scalar shifts plus four pairs of vector
23428 // insert/extract.
23429 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
23430 bool UseMOVSD = false;
23431 bool CanBeSimplified;
23432 // The splat value for the first packed shift (the 'X' from the example).
23433 SDValue Amt1 = Amt->getOperand(0);
23434 // The splat value for the second packed shift (the 'Y' from the example).
23435 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
23437 // See if it is possible to replace this node with a sequence of
23438 // two shifts followed by a MOVSS/MOVSD/PBLEND.
23439 if (VT == MVT::v4i32) {
23440 // Check if it is legal to use a MOVSS.
23441 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
23442 Amt2 == Amt->getOperand(3);
23443 if (!CanBeSimplified) {
23444 // Otherwise, check if we can still simplify this node using a MOVSD.
23445 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
23446 Amt->getOperand(2) == Amt->getOperand(3);
23447 UseMOVSD = true;
23448 Amt2 = Amt->getOperand(2);
23451 // Do similar checks for the case where the machine value type
23452 // is MVT::v8i16.
23453 CanBeSimplified = Amt1 == Amt->getOperand(1);
23454 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
23455 CanBeSimplified = Amt2 == Amt->getOperand(i);
23457 if (!CanBeSimplified) {
23458 UseMOVSD = true;
23459 CanBeSimplified = true;
23460 Amt2 = Amt->getOperand(4);
23461 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
23462 CanBeSimplified = Amt1 == Amt->getOperand(i);
23463 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
23464 CanBeSimplified = Amt2 == Amt->getOperand(j);
23468 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
23469 isa<ConstantSDNode>(Amt2)) {
23470 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
23472 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23473 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23475 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23476 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23477 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23478 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
23479 if (UseMOVSD)
23480 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23481 BitCast2, {0, 1, 6, 7}));
23482 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23483 BitCast2, {0, 5, 6, 7}));
23487 // v4i32 Non Uniform Shifts.
23488 // If the shift amount is constant we can shift each lane using the SSE2
23489 // immediate shifts, else we need to zero-extend each lane to the lower i64
23490 // and shift using the SSE2 variable shifts.
23491 // The separate results can then be blended together.
23492 if (VT == MVT::v4i32) {
23493 unsigned Opc = Op.getOpcode();
23494 SDValue Amt0, Amt1, Amt2, Amt3;
23496 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23497 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23498 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23499 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23501 // ISD::SHL is handled above but we include it here for completeness.
23502 switch (Opc) {
23503 default: llvm_unreachable("Unknown target vector shift node");
23505 case ISD::SHL: Opc = X86ISD::VSHL; break;
23508 case ISD::SRL: Opc = X86ISD::VSRL; break;
23511 case ISD::SRA: Opc = X86ISD::VSRA; break;
23514 }
23515 // The SSE2 shifts use the lower i64 as the same shift amount for
23516 // all lanes and the upper i64 is ignored. On AVX we're better off
23517 // just zero-extending, but for SSE just duplicating the top 16-bits is
23518 // cheaper and has the same effect for out of range values.
23519 if (Subtarget.hasAVX()) {
23520 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23521 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23522 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23523 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23524 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23526 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
23527 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23528 {4, 5, 6, 7, -1, -1, -1, -1});
23529 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23530 {0, 1, 1, 1, -1, -1, -1, -1});
23531 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23532 {2, 3, 3, 3, -1, -1, -1, -1});
23533 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23534 {0, 1, 1, 1, -1, -1, -1, -1});
23535 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23536 {2, 3, 3, 3, -1, -1, -1, -1});
23540 SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
23541 SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
23542 SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
23543 SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
23545 // Merge the shifted lane results optimally with/without PBLENDW.
23546 // TODO - ideally shuffle combining would handle this.
23547 if (Subtarget.hasSSE41()) {
23548 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23549 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23550 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23552 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
23553 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
23554 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
23557 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23558 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23559 // make the existing SSE solution better.
23560 // NOTE: We honor the preferred vector width before promoting to 512-bits.
23561 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23562 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23563 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23564 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23565 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23566 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23567 "Unexpected vector type");
23568 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23569 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23571 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23572 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23573 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23574 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23575 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23578 if (VT == MVT::v16i8 ||
23579 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23580 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23581 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23582 unsigned ShiftOpcode = Op->getOpcode();
23584 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23585 if (VT.is512BitVector()) {
23586 // On AVX512BW targets we make use of the fact that VSELECT lowers
23587 // to a masked blend which selects bytes based just on the sign bit
23588 // extracted to a mask.
23589 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23590 V0 = DAG.getBitcast(VT, V0);
23591 V1 = DAG.getBitcast(VT, V1);
23592 Sel = DAG.getBitcast(VT, Sel);
23593 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
23595 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23596 } else if (Subtarget.hasSSE41()) {
23597 // On SSE41 targets we make use of the fact that VSELECT lowers
23598 // to PBLENDVB which selects bytes based just on the sign bit.
23599 V0 = DAG.getBitcast(VT, V0);
23600 V1 = DAG.getBitcast(VT, V1);
23601 Sel = DAG.getBitcast(VT, Sel);
23602 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23604 // On pre-SSE41 targets we test for the sign bit by comparing to
23605 // zero - a negative value will set all bits of the lanes to true
23606 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23607 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23608 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23609 return DAG.getSelect(dl, SelVT, C, V0, V1);
23612 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23613 // We can safely do this using i16 shifts as we're only interested in
23614 // the 3 lower bits of each byte.
23615 Amt = DAG.getBitcast(ExtVT, Amt);
23616 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23617 Amt = DAG.getBitcast(VT, Amt);
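// Each byte's shift amount is at most 7 (3 bits); after the << 5 above, bit 2
// of the amount sits in the byte's sign bit. E.g. for an amount of 5 (0b101)
// the first select applies the shift-by-4 step, doubling Amt then exposes
// bit 1 (clear, so shift-by-2 is skipped), and the final doubling exposes
// bit 0 (set, so shift-by-1 is applied): 4 + 1 == 5 in total.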
23619 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23620 // r = VSELECT(r, shift(r, 4), a);
23622 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23623 R = SignBitSelect(VT, Amt, M, R);
23626 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23628 // r = VSELECT(r, shift(r, 2), a);
23629 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23630 R = SignBitSelect(VT, Amt, M, R);
23633 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23635 // return VSELECT(r, shift(r, 1), a);
23636 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23637 R = SignBitSelect(VT, Amt, M, R);
23641 if (Op->getOpcode() == ISD::SRA) {
23642 // For SRA we need to unpack each byte to the higher byte of an i16 vector
23643 // so we can correctly sign extend. We don't care what happens to the
23644 // lower byte.
23645 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23646 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23647 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23648 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23649 ALo = DAG.getBitcast(ExtVT, ALo);
23650 AHi = DAG.getBitcast(ExtVT, AHi);
23651 RLo = DAG.getBitcast(ExtVT, RLo);
23652 RHi = DAG.getBitcast(ExtVT, RHi);
23654 // r = VSELECT(r, shift(r, 4), a);
23655 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23656 DAG.getConstant(4, dl, ExtVT));
23657 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23658 DAG.getConstant(4, dl, ExtVT));
23659 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23660 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23663 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23664 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23666 // r = VSELECT(r, shift(r, 2), a);
23667 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23668 DAG.getConstant(2, dl, ExtVT));
23669 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23670 DAG.getConstant(2, dl, ExtVT));
23671 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23672 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23675 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23676 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23678 // r = VSELECT(r, shift(r, 1), a);
23679 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23680 DAG.getConstant(1, dl, ExtVT));
23681 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23682 DAG.getConstant(1, dl, ExtVT));
23683 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23684 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23686 // Logical shift the result back to the lower byte, leaving a zero upper
23687 // byte, meaning that we can safely pack with PACKUSWB.
23690 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23692 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23693 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23697 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23698 MVT ExtVT = MVT::v8i32;
23699 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23700 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23701 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23702 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23703 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23704 ALo = DAG.getBitcast(ExtVT, ALo);
23705 AHi = DAG.getBitcast(ExtVT, AHi);
23706 RLo = DAG.getBitcast(ExtVT, RLo);
23707 RHi = DAG.getBitcast(ExtVT, RHi);
23708 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23709 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23710 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23711 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23712 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23715 if (VT == MVT::v8i16) {
23716 unsigned ShiftOpcode = Op->getOpcode();
23718 // If we have a constant shift amount, the non-SSE41 path is best as
23719 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23720 bool UseSSE41 = Subtarget.hasSSE41() &&
23721 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23723 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23724 // On SSE41 targets we make use of the fact that VSELECT lowers
23725 // to PBLENDVB which selects bytes based just on the sign bit.
23727 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23728 V0 = DAG.getBitcast(ExtVT, V0);
23729 V1 = DAG.getBitcast(ExtVT, V1);
23730 Sel = DAG.getBitcast(ExtVT, Sel);
23731 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23733 // On pre-SSE41 targets we splat the sign bit - a negative value will
23734 // set all bits of the lanes to true and VSELECT uses that in
23735 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23737 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23738 return DAG.getSelect(dl, VT, C, V0, V1);
23741 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23743 // On SSE41 targets we need to replicate the shift mask in both
23744 // bytes for PBLENDVB.
23745 if (UseSSE41)
23746 Amt = DAG.getNode(ISD::OR, dl, VT,
23747 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23748 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23749 else
23750 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23753 // r = VSELECT(r, shift(r, 8), a);
23754 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23755 R = SignBitSelect(Amt, M, R);
23758 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23760 // r = VSELECT(r, shift(r, 4), a);
23761 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23762 R = SignBitSelect(Amt, M, R);
23765 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23767 // r = VSELECT(r, shift(r, 2), a);
23768 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23769 R = SignBitSelect(Amt, M, R);
23772 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23774 // return VSELECT(r, shift(r, 1), a);
23775 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23776 R = SignBitSelect(Amt, M, R);
23780 // Decompose 256-bit shifts into smaller 128-bit shifts.
23781 if (VT.is256BitVector())
23782 return Lower256IntArith(Op, DAG);
23787 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23788 SelectionDAG &DAG) {
23789 MVT VT = Op.getSimpleValueType();
23790 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23793 SDValue R = Op.getOperand(0);
23794 SDValue Amt = Op.getOperand(1);
23795 unsigned Opcode = Op.getOpcode();
23796 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23798 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
23799 // Attempt to rotate by immediate.
23801 SmallVector<APInt, 16> EltBits;
23802 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23803 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23804 return EltBits[0] == V;
23806 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23807 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23808 return DAG.getNode(Op, DL, VT, R,
23809 DAG.getConstant(RotateAmt, DL, MVT::i8));
23813 // Else, fall-back on VPROLV/VPRORV.
23817 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23819 // XOP has 128-bit vector variable + immediate rotates.
23820 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23821 if (Subtarget.hasXOP()) {
23822 // Split 256-bit integers.
23823 if (VT.is256BitVector())
23824 return Lower256IntArith(Op, DAG);
23825 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23827 // Attempt to rotate by immediate.
23828 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23829 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23830 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23831 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23832 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23833 DAG.getConstant(RotateAmt, DL, MVT::i8));
23837 // Use general rotate by variable (per-element).
23841 // Split 256-bit integers on pre-AVX2 targets.
23842 if (VT.is256BitVector() && !Subtarget.hasAVX2())
23843 return Lower256IntArith(Op, DAG);
23845 assert((VT == MVT::v4i32 || VT == MVT::v8i16 ||
23846 ((VT == MVT::v8i32 || VT == MVT::v16i16) && Subtarget.hasAVX2())) &&
23847 "Only v4i32/v8i16/v8i32/v16i16 vector rotates supported");
23849 // Rotate by a uniform constant - expand back to shifts.
23850 // TODO - legalizers should be able to handle this.
23851 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23852 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23853 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23854 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23855 if (RotateAmt == 0)
23858 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, VT, R,
23860 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, VT, R,
23861 EltSizeInBits - RotateAmt, DAG);
23862 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23866 // Rotate by splat - expand back to shifts.
23867 // TODO - legalizers should be able to handle this.
23868 if (IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
23869 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23870 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23871 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23872 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23873 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
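// This expansion uses the rotate identity rotl(x, a) == (x << a) | (x >> (bits - a));
// e.g. for 16-bit lanes and a splatted amount of 3: (x << 3) | (x >> 13).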
23876 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23877 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
23878 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
23880 // Best to fall back for all supported variable shifts.
23881 // AVX2 - best to fall back for non-constants as well.
23882 // TODO - legalizers should be able to handle this.
23883 if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
23884 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23885 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23886 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23887 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23888 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23891 // As with shifts, convert the rotation amount to a multiplication factor.
23892 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
23893 assert(Scale && "Failed to convert ROTL amount to scale");
23895 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
23896 if (EltSizeInBits == 16) {
23897 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
23898 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
23899 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
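// MUL keeps the low 16 bits of x * 2^a (the left-shifted part) and MULHU
// yields the bits that wrapped around, so ORing them forms the rotate.
// E.g. x = 0x8001, a = 1: MUL gives 0x0002, MULHU gives 0x0001, and
// 0x0002 | 0x0001 == 0x0003 == rotl16(0x8001, 1).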
23902 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
23903 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
23904 // that can then be OR'd with the lower 32-bits.
23905 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
23906 static const int OddMask[] = {1, -1, 3, -1};
23907 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
23908 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
23910 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
23911 DAG.getBitcast(MVT::v2i64, R),
23912 DAG.getBitcast(MVT::v2i64, Scale));
23913 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
23914 DAG.getBitcast(MVT::v2i64, R13),
23915 DAG.getBitcast(MVT::v2i64, Scale13));
23916 Res02 = DAG.getBitcast(VT, Res02);
23917 Res13 = DAG.getBitcast(VT, Res13);
23919 return DAG.getNode(ISD::OR, DL, VT,
23920 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
23921 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
23924 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23925 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
23926 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23927 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23928 // has only one use.
23929 SDNode *N = Op.getNode();
23930 SDValue LHS = N->getOperand(0);
23931 SDValue RHS = N->getOperand(1);
23932 unsigned BaseOp = 0;
23933 X86::CondCode Cond;
23935 switch (Op.getOpcode()) {
23936 default: llvm_unreachable("Unknown ovf instruction!");
23938 // An add of one will be selected as an INC. Note that INC doesn't
23939 // set CF, so we can't do this for UADDO.
23940 if (isOneConstant(RHS)) {
23941 BaseOp = X86ISD::INC;
23942 Cond = X86::COND_O;
23945 BaseOp = X86ISD::ADD;
23946 Cond = X86::COND_O;
23949 BaseOp = X86ISD::ADD;
23950 Cond = X86::COND_B;
23953 // A subtract of one will be selected as a DEC. Note that DEC doesn't
23954 // set CF, so we can't do this for USUBO.
23955 if (isOneConstant(RHS)) {
23956 BaseOp = X86ISD::DEC;
23957 Cond = X86::COND_O;
23960 BaseOp = X86ISD::SUB;
23961 Cond = X86::COND_O;
23964 BaseOp = X86ISD::SUB;
23965 Cond = X86::COND_B;
23968 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
23969 Cond = X86::COND_O;
23971 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23972 if (N->getValueType(0) == MVT::i8) {
23973 BaseOp = X86ISD::UMUL8;
23974 Cond = X86::COND_O;
23977 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
23979 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23981 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
23983 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23987 // Also sets EFLAGS.
23988 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23989 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23991 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23993 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23996 /// Returns true if the operand type is exactly twice the native width, and
23997 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23998 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23999 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
24000 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
24001 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
24003 if (OpWidth == 64)
24004 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
24005 else if (OpWidth == 128)
24006 return Subtarget.hasCmpxchg16b();
24011 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
24012 return needsCmpXchgNb(SI->getValueOperand()->getType());
24015 // Note: this turns large loads into lock cmpxchg8b/16b.
24016 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
24017 TargetLowering::AtomicExpansionKind
24018 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
24019 auto PTy = cast<PointerType>(LI->getPointerOperandType());
24020 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
24021 : AtomicExpansionKind::None;
24024 TargetLowering::AtomicExpansionKind
24025 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
24026 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24027 Type *MemType = AI->getType();
24029 // If the operand is too big, we must see if cmpxchg8/16b is available
24030 // and default to library calls otherwise.
24031 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
24032 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
24033 : AtomicExpansionKind::None;
24036 AtomicRMWInst::BinOp Op = AI->getOperation();
24039 llvm_unreachable("Unknown atomic operation");
24040 case AtomicRMWInst::Xchg:
24041 case AtomicRMWInst::Add:
24042 case AtomicRMWInst::Sub:
24043 // It's better to use xadd, xsub or xchg for these in all cases.
24044 return AtomicExpansionKind::None;
24045 case AtomicRMWInst::Or:
24046 case AtomicRMWInst::And:
24047 case AtomicRMWInst::Xor:
24048 // If the atomicrmw's result isn't actually used, we can just add a "lock"
24049 // prefix to a normal instruction for these operations.
24050 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
24051 : AtomicExpansionKind::None;
24052 case AtomicRMWInst::Nand:
24053 case AtomicRMWInst::Max:
24054 case AtomicRMWInst::Min:
24055 case AtomicRMWInst::UMax:
24056 case AtomicRMWInst::UMin:
24057 // These always require a non-trivial set of data operations on x86. We must
24058 // use a cmpxchg loop.
24059 return AtomicExpansionKind::CmpXChg;
24064 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
24065 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24066 Type *MemType = AI->getType();
24067 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
24068 // there is no benefit in turning such RMWs into loads, and it is actually
24069 // harmful as it introduces a mfence.
24070 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
24073 auto Builder = IRBuilder<>(AI);
24074 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
24075 auto SSID = AI->getSyncScopeID();
24076 // We must restrict the ordering to avoid generating loads with Release or
24077 // ReleaseAcquire orderings.
24078 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
24079 auto Ptr = AI->getPointerOperand();
24081 // Before the load we need a fence. Here is an example lifted from
24082 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
24083 // is required:
24084 // Thread 0:
24085 // x.store(1, relaxed);
24086 // r1 = y.fetch_add(0, release);
24087 // Thread 1:
24088 // y.fetch_add(42, acquire);
24089 // r2 = x.load(relaxed);
24090 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
24091 // lowered to just a load without a fence. A mfence flushes the store buffer,
24092 // making the optimization clearly correct.
24093 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
24094 // otherwise, we might be able to be more aggressive on relaxed idempotent
24095 // rmw. In practice, they do not look useful, so we don't try to be
24096 // especially clever.
24097 if (SSID == SyncScope::SingleThread)
24098 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
24099 // the IR level, so we must wrap it in an intrinsic.
24102 if (!Subtarget.hasMFence())
24103 // FIXME: it might make sense to use a locked operation here but on a
24104 // different cache-line to prevent cache-line bouncing. In practice it
24105 // is probably a small win, and x86 processors without mfence are rare
24106 // enough that we do not bother.
24110 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
24111 Builder.CreateCall(MFence, {});
24113 // Finally we can emit the atomic load.
24114 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
24115 AI->getType()->getPrimitiveSizeInBits());
24116 Loaded->setAtomic(Order, SSID);
24117 AI->replaceAllUsesWith(Loaded);
24118 AI->eraseFromParent();
24122 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
24123 SelectionDAG &DAG) {
24125 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
24126 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
24127 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
24128 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
24130 // The only fence that needs an instruction is a sequentially-consistent
24131 // cross-thread fence.
24132 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
24133 FenceSSID == SyncScope::System) {
24134 if (Subtarget.hasMFence())
24135 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
24137 SDValue Chain = Op.getOperand(0);
24138 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
24139 SDValue Ops[] = {
24140 DAG.getRegister(X86::ESP, MVT::i32), // Base
24141 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
24142 DAG.getRegister(0, MVT::i32), // Index
24143 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
24144 DAG.getRegister(0, MVT::i32), // Segment.
24145 Zero,
24146 Chain};
24148 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
24149 return SDValue(Res, 0);
24152 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
24153 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
24156 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
24157 SelectionDAG &DAG) {
24158 MVT T = Op.getSimpleValueType();
24162 switch(T.SimpleTy) {
24163 default: llvm_unreachable("Invalid value type!");
24164 case MVT::i8: Reg = X86::AL; size = 1; break;
24165 case MVT::i16: Reg = X86::AX; size = 2; break;
24166 case MVT::i32: Reg = X86::EAX; size = 4; break;
24167 case MVT::i64:
24168 assert(Subtarget.is64Bit() && "Node not type legal!");
24169 Reg = X86::RAX; size = 8;
24172 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
24173 Op.getOperand(2), SDValue());
24174 SDValue Ops[] = { cpIn.getValue(0),
24177 DAG.getTargetConstant(size, DL, MVT::i8),
24178 cpIn.getValue(1) };
24179 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24180 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
24181 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
24185 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
24186 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
24187 MVT::i32, cpOut.getValue(2));
24188 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
24190 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
24191 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
24192 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
24196 // Create MOVMSKB, taking into account whether we need to split for AVX1.
24197 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
24198 const X86Subtarget &Subtarget) {
24199 MVT InVT = V.getSimpleValueType();
24201 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
24203 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
24204 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
24205 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
24206 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
24207 DAG.getConstant(16, DL, MVT::i8));
24208 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
24211 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24214 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
24215 SelectionDAG &DAG) {
24216 SDValue Src = Op.getOperand(0);
24217 MVT SrcVT = Src.getSimpleValueType();
24218 MVT DstVT = Op.getSimpleValueType();
24220 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
24221 // half to v32i1 and concatenating the result.
24222 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
24223 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
24224 assert(Subtarget.hasBWI() && "Expected BWI target");
24226 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24227 DAG.getIntPtrConstant(0, dl));
24228 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24229 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24230 DAG.getIntPtrConstant(1, dl));
24231 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24232 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24235 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
24236 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
24237 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
24240 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
24241 EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
24242 DstVT.getVectorNumElements() / 2);
24243 Lo = DAG.getBitcast(CastVT, Lo);
24244 Hi = DAG.getBitcast(CastVT, Hi);
24245 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
24248 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
24249 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
24250 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
24251 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
24253 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
24254 V = getPMOVMSKB(DL, V, DAG, Subtarget);
24255 return DAG.getZExtOrTrunc(V, DL, DstVT);
24258 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
24259 SrcVT == MVT::i64) {
24260 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24261 if (DstVT != MVT::f64)
24262 // This conversion needs to be expanded.
24265 SmallVector<SDValue, 16> Elts;
24269 if (SrcVT.isVector()) {
24270 NumElts = SrcVT.getVectorNumElements();
24271 SVT = SrcVT.getVectorElementType();
24273 // Widen the vector in input in the case of MVT::v2i32.
24274 // Example: from MVT::v2i32 to MVT::v4i32.
24275 for (unsigned i = 0, e = NumElts; i != e; ++i)
24276 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
24277 DAG.getIntPtrConstant(i, dl)));
24279 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
24280 "Unexpected source type in LowerBITCAST");
24281 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24282 DAG.getIntPtrConstant(0, dl)));
24283 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24284 DAG.getIntPtrConstant(1, dl)));
24288 // Explicitly mark the extra elements as Undef.
24289 Elts.append(NumElts, DAG.getUNDEF(SVT));
24291 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24292 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
24293 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
24294 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
24295 DAG.getIntPtrConstant(0, dl));
24298 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
24299 Subtarget.hasMMX() && "Unexpected custom BITCAST");
24300 assert((DstVT == MVT::i64 ||
24301 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
24302 "Unexpected custom BITCAST");
24303 // i64 <=> MMX conversions are Legal.
24304 if (SrcVT==MVT::i64 && DstVT.isVector())
24306 if (DstVT==MVT::i64 && SrcVT.isVector())
24308 // MMX <=> MMX conversions are Legal.
24309 if (SrcVT.isVector() && DstVT.isVector())
24311 // All other conversions need to be expanded.
24315 /// Compute the horizontal sum of bytes in V for the elements of VT.
24317 /// Requires V to be a byte vector and VT to be an integer vector type with
24318 /// wider elements than V's type. The width of the elements of VT determines
24319 /// how many bytes of V are summed horizontally to produce each element of the
24321 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
24322 const X86Subtarget &Subtarget,
24323 SelectionDAG &DAG) {
24325 MVT ByteVecVT = V.getSimpleValueType();
24326 MVT EltVT = VT.getVectorElementType();
24327 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
24328 "Expected value to have byte element type.");
24329 assert(EltVT != MVT::i8 &&
24330 "Horizontal byte sum only makes sense for wider elements!");
24331 unsigned VecSize = VT.getSizeInBits();
24332 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
24334 // The PSADBW instruction horizontally adds all bytes and leaves the result in
24335 // i64 chunks, thus it directly computes the pop count for v2i64 and v4i64.
24336 if (EltVT == MVT::i64) {
24337 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24338 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24339 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
24340 return DAG.getBitcast(VT, V);
24343 if (EltVT == MVT::i32) {
24344 // We unpack the low half and high half into i32s interleaved with zeros so
24345 // that we can use PSADBW to horizontally sum them. The most useful part of
24346 // this is that it lines up the results of two PSADBW instructions to be
24347 // two v2i64 vectors which concatenated are the 4 population counts. We can
24348 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
24349 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
24350 SDValue V32 = DAG.getBitcast(VT, V);
24351 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
24352 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
24354 // Do the horizontal sums into two v2i64s.
24355 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24356 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24357 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24358 DAG.getBitcast(ByteVecVT, Low), Zeros);
24359 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24360 DAG.getBitcast(ByteVecVT, High), Zeros);
24362 // Merge them together.
24363 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
24364 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
24365 DAG.getBitcast(ShortVecVT, Low),
24366 DAG.getBitcast(ShortVecVT, High));
24368 return DAG.getBitcast(VT, V);
24371 // The only element type left is i16.
24372 assert(EltVT == MVT::i16 && "Unknown how to handle type");
24374 // To obtain pop count for each i16 element starting from the pop count for
24375 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
24376 // right by 8. It is important to shift as i16s as i8 vector shift isn't
24377 // directly supported.
24378 SDValue ShifterV = DAG.getConstant(8, DL, VT);
24379 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24380 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
24381 DAG.getBitcast(ByteVecVT, V));
24382 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
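// Illustrative trace for one i16 lane holding byte pop counts <p1|p0>
// (high|low): shl 8 gives <p0|0>, the byte-wise add gives <p0+p1|p0>, and the
// final srl 8 leaves <0|p0+p1>, the pop count of the whole i16 element.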
24385 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
24386 const X86Subtarget &Subtarget,
24387 SelectionDAG &DAG) {
24388 MVT VT = Op.getSimpleValueType();
24389 MVT EltVT = VT.getVectorElementType();
24390 unsigned VecSize = VT.getSizeInBits();
24392 // Implement a lookup table in register by using an algorithm based on:
24393 // http://wm.ite.pl/articles/sse-popcount.html
24395 // The general idea is that every lower byte nibble in the input vector is an
24396   // index into an in-register pre-computed pop count table. We then split up the
24397   // input vector into two new ones: (1) a vector with only the shifted-right
24398   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
24399   // masked out higher ones) for each byte. PSHUFB is used separately with both
24400   // to index the in-register table. Next, both are added and the result is an
24401   // i8 vector where each element contains the pop count for its input byte.
24403 // To obtain the pop count for elements != i8, we follow up with the same
24404 // approach and use additional tricks as described below.
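  // Minimal scalar sketch of the nibble-LUT idea (illustrative only):
  //
  //   static const uint8_t PopLUT[16] = {0, 1, 1, 2, 1, 2, 2, 3,
  //                                      1, 2, 2, 3, 2, 3, 3, 4};
  //   uint8_t popcnt8(uint8_t V) {
  //     return PopLUT[V >> 4] + PopLUT[V & 0x0F]; // high nibble + low nibble
  //   }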
24406 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
24407 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
24408 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
24409 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
24411 int NumByteElts = VecSize / 8;
24412 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
24413 SDValue In = DAG.getBitcast(ByteVecVT, Op);
24414 SmallVector<SDValue, 64> LUTVec;
24415 for (int i = 0; i < NumByteElts; ++i)
24416 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24417 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
24418 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
24421 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
24422 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
24425 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
24427   // The input vector is used as the shuffle mask that indexes elements into the
24428   // LUT. After counting low and high nibbles, add the vectors to obtain the
24429 // final pop count per i8 element.
24430 SDValue HighPopCnt =
24431 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24432 SDValue LowPopCnt =
24433 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24434 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
24436 if (EltVT == MVT::i8)
24439 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
24442 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
24443 const X86Subtarget &Subtarget,
24444 SelectionDAG &DAG) {
24445 MVT VT = Op.getSimpleValueType();
24446 assert(VT.is128BitVector() &&
24447 "Only 128-bit vector bitmath lowering supported.");
24449 int VecSize = VT.getSizeInBits();
24450 MVT EltVT = VT.getVectorElementType();
24451 int Len = EltVT.getSizeInBits();
24453 // This is the vectorized version of the "best" algorithm from
24454 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
24455 // with a minor tweak to use a series of adds + shifts instead of vector
24456 // multiplications. Implemented for all integer vector types. We only use
24457   // this when we don't have SSSE3, which allows a LUT-based lowering that is
24458 // much faster, even faster than using native popcnt instructions.
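  // Scalar form of the same three steps for one 32-bit lane (illustrative
  // sketch; the vector code below uses adds and shifts instead of the final
  // multiply):
  //
  //   uint32_t popcnt32(uint32_t V) {
  //     V = V - ((V >> 1) & 0x55555555);
  //     V = (V & 0x33333333) + ((V >> 2) & 0x33333333);
  //     V = (V + (V >> 4)) & 0x0F0F0F0F;
  //     return (V * 0x01010101) >> 24; // horizontal sum of the byte counts
  //   }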
24460 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
24461 MVT VT = V.getSimpleValueType();
24462 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
24463 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
24465 auto GetMask = [&](SDValue V, APInt Mask) {
24466 MVT VT = V.getSimpleValueType();
24467 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
24468 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
24471 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
24472 // x86, so set the SRL type to have elements at least i16 wide. This is
24473   // correct because all of our SRLs are followed immediately by a mask anyway
24474 // that handles any bits that sneak into the high bits of the byte elements.
24475 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
24479 // v = v - ((v >> 1) & 0x55555555...)
24481 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
24482 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
24483 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
24485 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
24486 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
24487 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
24488 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
24489 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
24491 // v = (v + (v >> 4)) & 0x0F0F0F0F...
24492 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
24493 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
24494 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
24496 // At this point, V contains the byte-wise population count, and we are
24497   // merely doing a horizontal sum if necessary to get the wider element counts.
24499 if (EltVT == MVT::i8)
24502 return LowerHorizontalByteSum(
24503 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
24507 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
24508 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
24509 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24510 SelectionDAG &DAG) {
24511 MVT VT = Op.getSimpleValueType();
24512 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24513 "Unknown CTPOP type to handle");
24514 SDLoc DL(Op.getNode());
24515 SDValue Op0 = Op.getOperand(0);
24517 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
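  // Per-element sketch of that shape (illustrative only; assumes a Clang/GCC
  // builtin for the scalar pop count):
  //
  //   uint8_t ctpop_i8_via_i32(uint8_t X) {
  //     uint32_t W = X;                         // ZERO_EXTEND
  //     return (uint8_t)__builtin_popcount(W);  // CTPOP, then TRUNCATE
  //   }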
24518 if (Subtarget.hasVPOPCNTDQ()) {
24519 unsigned NumElems = VT.getVectorNumElements();
24520 assert((VT.getVectorElementType() == MVT::i8 ||
24521 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24522 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
24523 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24524 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24525 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24526 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24530 if (!Subtarget.hasSSSE3()) {
24531 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24532 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24533 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24536 // Decompose 256-bit ops into smaller 128-bit ops.
24537 if (VT.is256BitVector() && !Subtarget.hasInt256())
24538 return Lower256IntUnary(Op, DAG);
24540 // Decompose 512-bit ops into smaller 256-bit ops.
24541 if (VT.is512BitVector() && !Subtarget.hasBWI())
24542 return Lower512IntUnary(Op, DAG);
24544 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
24547 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24548 SelectionDAG &DAG) {
24549 assert(Op.getSimpleValueType().isVector() &&
24550 "We only do custom lowering for vector population count.");
24551 return LowerVectorCTPOP(Op, Subtarget, DAG);
24554 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24555 MVT VT = Op.getSimpleValueType();
24556 SDValue In = Op.getOperand(0);
24559   // For scalars, it's still beneficial to transfer to/from the SIMD unit to
24560 // perform the BITREVERSE.
24561 if (!VT.isVector()) {
24562 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24563 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24564 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24565 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24566 DAG.getIntPtrConstant(0, DL));
24569 int NumElts = VT.getVectorNumElements();
24570 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24572 // Decompose 256-bit ops into smaller 128-bit ops.
24573 if (VT.is256BitVector())
24574 return Lower256IntUnary(Op, DAG);
24576 assert(VT.is128BitVector() &&
24577 "Only 128-bit vector bitreverse lowering supported.");
24579 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24580 // perform the BSWAP in the shuffle.
24581   // It's best to shuffle using the second operand as this will implicitly allow
24582 // memory folding for multiple vectors.
24583 SmallVector<SDValue, 16> MaskElts;
24584 for (int i = 0; i != NumElts; ++i) {
24585 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24586 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24587 int PermuteByte = SourceByte | (2 << 5);
24588 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24592 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24593 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24594 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24596 return DAG.getBitcast(VT, Res);
24599 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24600 SelectionDAG &DAG) {
24601 MVT VT = Op.getSimpleValueType();
24603 if (Subtarget.hasXOP() && !VT.is512BitVector())
24604 return LowerBITREVERSE_XOP(Op, DAG);
24606 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24608 SDValue In = Op.getOperand(0);
24611 unsigned NumElts = VT.getVectorNumElements();
24612 assert(VT.getScalarType() == MVT::i8 &&
24613 "Only byte vector BITREVERSE supported");
24615 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24616 if (VT.is256BitVector() && !Subtarget.hasInt256())
24617 return Lower256IntUnary(Op, DAG);
24619   // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24620   // two nibbles, and a PSHUFB lookup is used to find the bit-reverse of each
24621   // 0-15 value (moved to the other nibble).
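  // Scalar sketch of the same nibble-LUT bit reversal (illustrative only):
  //
  //   static const uint8_t RevLo[16] = {0x00, 0x80, 0x40, 0xC0, 0x20, 0xA0,
  //                                     0x60, 0xE0, 0x10, 0x90, 0x50, 0xD0,
  //                                     0x30, 0xB0, 0x70, 0xF0};
  //   static const uint8_t RevHi[16] = {0x00, 0x08, 0x04, 0x0C, 0x02, 0x0A,
  //                                     0x06, 0x0E, 0x01, 0x09, 0x05, 0x0D,
  //                                     0x03, 0x0B, 0x07, 0x0F};
  //   uint8_t bitrev8(uint8_t V) {
  //     return RevLo[V & 0x0F] | RevHi[V >> 4];
  //   }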
24622 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24623 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24624 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24626 const int LoLUT[16] = {
24627 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24628 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24629 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24630 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24631 const int HiLUT[16] = {
24632 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24633 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24634 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24635 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
24637 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24638 for (unsigned i = 0; i < NumElts; ++i) {
24639 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24640 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24643 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24644 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24645 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24646 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24647 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24650 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24651 const X86Subtarget &Subtarget,
24652 bool AllowIncDec = true) {
24653 unsigned NewOpc = 0;
24654 switch (N->getOpcode()) {
24655 case ISD::ATOMIC_LOAD_ADD:
24656 NewOpc = X86ISD::LADD;
24658 case ISD::ATOMIC_LOAD_SUB:
24659 NewOpc = X86ISD::LSUB;
24661 case ISD::ATOMIC_LOAD_OR:
24662 NewOpc = X86ISD::LOR;
24664 case ISD::ATOMIC_LOAD_XOR:
24665 NewOpc = X86ISD::LXOR;
24667 case ISD::ATOMIC_LOAD_AND:
24668 NewOpc = X86ISD::LAND;
24671 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24674 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24676 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24677 // Convert to inc/dec if they aren't slow or we are optimizing for size.
24678 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24679 DAG.getMachineFunction().getFunction().optForSize())) {
24680 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24681 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24682 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24683 DAG.getVTList(MVT::i32, MVT::Other),
24684 {N->getOperand(0), N->getOperand(1)},
24685 /*MemVT=*/N->getSimpleValueType(0), MMO);
24686 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24687 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24688 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24689 DAG.getVTList(MVT::i32, MVT::Other),
24690 {N->getOperand(0), N->getOperand(1)},
24691 /*MemVT=*/N->getSimpleValueType(0), MMO);
24695 return DAG.getMemIntrinsicNode(
24696 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24697 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24698 /*MemVT=*/N->getSimpleValueType(0), MMO);
24701 /// Lower atomic_load_ops into LOCK-prefixed operations.
24702 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24703 const X86Subtarget &Subtarget) {
24704 SDValue Chain = N->getOperand(0);
24705 SDValue LHS = N->getOperand(1);
24706 SDValue RHS = N->getOperand(2);
24707 unsigned Opc = N->getOpcode();
24708 MVT VT = N->getSimpleValueType(0);
24711 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24712 // can only be lowered when the result is unused. They should have already
24713 // been transformed into a cmpxchg loop in AtomicExpand.
24714 if (N->hasAnyUseOfValue(0)) {
24715 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24716 // select LXADD if LOCK_SUB can't be selected.
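    // e.g. (sketch) an IR-level 'atomicrmw sub i32* %p, i32 5' whose result is
    // used is rewritten here as 'atomicrmw add %p, -5', which the XADD-based
    // pattern can still match.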
24717 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24718 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24719 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24720 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24721 RHS, AN->getMemOperand());
24723 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24724 "Used AtomicRMW ops other than Add should have been expanded!");
24728 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24729 // RAUW the chain, but don't worry about the result, as it's unused.
24730 assert(!N->hasAnyUseOfValue(0));
24731 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24735 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24736 SDNode *Node = Op.getNode();
24738 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24740 // Convert seq_cst store -> xchg
24741 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24742 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24743 // (The only way to get a 16-byte store is cmpxchg16b)
24744 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
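  // e.g. (sketch) a seq_cst 'store atomic i32 %v, i32* %p' is rewritten below
  // as an ATOMIC_SWAP whose loaded result is simply dropped, so the LOCK'ed
  // XCHG provides the required ordering.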
24745 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24746 AtomicOrdering::SequentiallyConsistent ||
24747 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24748 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24749 cast<AtomicSDNode>(Node)->getMemoryVT(),
24750 Node->getOperand(0),
24751 Node->getOperand(1), Node->getOperand(2),
24752 cast<AtomicSDNode>(Node)->getMemOperand());
24753 return Swap.getValue(1);
24755 // Other atomic stores have a simple pattern.
24759 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24760 SDNode *N = Op.getNode();
24761 MVT VT = N->getSimpleValueType(0);
24763 // Let legalize expand this if it isn't a legal type yet.
24764 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24767 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24770 // Set the carry flag.
24771 SDValue Carry = Op.getOperand(2);
24772 EVT CarryVT = Carry.getValueType();
24773 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24774 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24775 Carry, DAG.getConstant(NegOne, DL, CarryVT));
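  // Note (sketch): adding all-ones to the incoming carry sets the hardware
  // carry flag exactly when that carry is non-zero (0 + ~0 doesn't wrap,
  // 1 + ~0 does), so Carry.getValue(1) is EFLAGS with CF primed for ADC/SBB.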
24777 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24778 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24779 Op.getOperand(1), Carry.getValue(1));
24781 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24782 if (N->getValueType(1) == MVT::i1)
24783 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24785 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24788 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24789 SelectionDAG &DAG) {
24790 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24792 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24793 // which returns the values as { float, float } (in XMM0) or
24794 // { double, double } (which is returned in XMM0, XMM1).
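  // Rough C-level shape of the call being built (illustrative sketch; the
  // actual symbol comes from the runtime library via getLibcallName below):
  //
  //   struct SinCosRet { double Sin, Cos; };
  //   extern "C" SinCosRet __sincos_stret(double X);  // f64 variant
  //   // The f32 variant is modeled here as returning a <4 x float> in XMM0.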
24796 SDValue Arg = Op.getOperand(0);
24797 EVT ArgVT = Arg.getValueType();
24798 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24800 TargetLowering::ArgListTy Args;
24801 TargetLowering::ArgListEntry Entry;
24805 Entry.IsSExt = false;
24806 Entry.IsZExt = false;
24807 Args.push_back(Entry);
24809 bool isF64 = ArgVT == MVT::f64;
24810 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24811 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24812 // the results are returned via SRet in memory.
24813 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24814 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24815 const char *LibcallName = TLI.getLibcallName(LC);
24817 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24819 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24820 : (Type *)VectorType::get(ArgTy, 4);
24822 TargetLowering::CallLoweringInfo CLI(DAG);
24823 CLI.setDebugLoc(dl)
24824 .setChain(DAG.getEntryNode())
24825 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24827 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24830 // Returned in xmm0 and xmm1.
24831 return CallResult.first;
24833   // Returned in bits 0:31 and 32:63 of xmm0.
24834 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24835 CallResult.first, DAG.getIntPtrConstant(0, dl));
24836 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24837 CallResult.first, DAG.getIntPtrConstant(1, dl));
24838 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24839 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24842 /// Widen a vector input to a vector of NVT. The
24843 /// input vector must have the same element type as NVT.
24844 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24845 bool FillWithZeroes = false) {
24846 // Check if InOp already has the right width.
24847 MVT InVT = InOp.getSimpleValueType();
24851 if (InOp.isUndef())
24852 return DAG.getUNDEF(NVT);
24854 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24855 "input and widen element type must match");
24857 unsigned InNumElts = InVT.getVectorNumElements();
24858 unsigned WidenNumElts = NVT.getVectorNumElements();
24859 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24860 "Unexpected request for vector widening");
24863 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24864 InOp.getNumOperands() == 2) {
24865 SDValue N1 = InOp.getOperand(1);
24866 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24868 InOp = InOp.getOperand(0);
24869 InVT = InOp.getSimpleValueType();
24870 InNumElts = InVT.getVectorNumElements();
24873 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24874 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24875 SmallVector<SDValue, 16> Ops;
24876 for (unsigned i = 0; i < InNumElts; ++i)
24877 Ops.push_back(InOp.getOperand(i));
24879 EVT EltVT = InOp.getOperand(0).getValueType();
24881 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24882 DAG.getUNDEF(EltVT);
24883 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24884 Ops.push_back(FillVal);
24885 return DAG.getBuildVector(NVT, dl, Ops);
24887 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24889 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24890 InOp, DAG.getIntPtrConstant(0, dl));
24893 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24894 SelectionDAG &DAG) {
24895 assert(Subtarget.hasAVX512() &&
24896 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24898 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24899 SDValue Src = N->getValue();
24900 MVT VT = Src.getSimpleValueType();
24901 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24904 SDValue Scale = N->getScale();
24905 SDValue Index = N->getIndex();
24906 SDValue Mask = N->getMask();
24907 SDValue Chain = N->getChain();
24908 SDValue BasePtr = N->getBasePtr();
24910 if (VT == MVT::v2f32) {
24911 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24912 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24913 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24914 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24915 DAG.getUNDEF(MVT::v2f32));
24916 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24917 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24918 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24919 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24920 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24921 return SDValue(NewScatter.getNode(), 1);
24926 if (VT == MVT::v2i32) {
24927 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24928 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
24929 DAG.getUNDEF(MVT::v2i32));
24930 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24931 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24932 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24933 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24934 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24935 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24936 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24937 return SDValue(NewScatter.getNode(), 1);
24939 // Custom widen all the operands to avoid promotion.
24940 EVT NewIndexVT = EVT::getVectorVT(
24941 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
24942 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
24943 DAG.getUNDEF(Index.getValueType()));
24944 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24945 DAG.getConstant(0, dl, MVT::v2i1));
24946 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24947 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
24948 Ops, N->getMemOperand());
24951 MVT IndexVT = Index.getSimpleValueType();
24952 MVT MaskVT = Mask.getSimpleValueType();
24954 // If the index is v2i32, we're being called by type legalization and we
24955 // should just let the default handling take care of it.
24956 if (IndexVT == MVT::v2i32)
24959   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
24960 // need to widen until one is.
24961 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24962 !Index.getSimpleValueType().is512BitVector()) {
24963 // Determine how much we need to widen by to get a 512-bit type.
24964 unsigned Factor = std::min(512/VT.getSizeInBits(),
24965 512/IndexVT.getSizeInBits());
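    // e.g. (sketch) v2f64 data with a v2i64 index: Factor = min(4, 4) = 4, so
    // data, index and mask are widened to 8 elements (v8f64, v8i64, v8i1).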
24966 unsigned NumElts = VT.getVectorNumElements() * Factor;
24968 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
24969 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
24970 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24972 Src = ExtendToType(Src, VT, DAG);
24973 Index = ExtendToType(Index, IndexVT, DAG);
24974 Mask = ExtendToType(Mask, MaskVT, DAG, true);
24977 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
24978 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24979 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24980 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24981 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24982 return SDValue(NewScatter.getNode(), 1);
24985 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24986 SelectionDAG &DAG) {
24988 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24989 MVT VT = Op.getSimpleValueType();
24990 MVT ScalarVT = VT.getScalarType();
24991 SDValue Mask = N->getMask();
24994 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24995 "Expanding masked load is supported on AVX-512 target only!");
24997 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24998 "Expanding masked load is supported for 32 and 64-bit types only!");
25000 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25001 "Cannot lower masked load op.");
25003 assert((ScalarVT.getSizeInBits() >= 32 ||
25004 (Subtarget.hasBWI() &&
25005 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25006 "Unsupported masked load op.");
25008 // This operation is legal for targets with VLX, but without
25009   // VLX the vector should be widened to 512 bits.
25010 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
25011 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25012 SDValue Src0 = N->getSrc0();
25013 Src0 = ExtendToType(Src0, WideDataVT, DAG);
25015 // Mask element has to be i1.
25016 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25017 "Unexpected mask type");
25019 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25021 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25022 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
25023 N->getBasePtr(), Mask, Src0,
25024 N->getMemoryVT(), N->getMemOperand(),
25025 N->getExtensionType(),
25026 N->isExpandingLoad());
25028   SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
25029 NewLoad.getValue(0),
25030 DAG.getIntPtrConstant(0, dl));
25031   SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
25032 return DAG.getMergeValues(RetOps, dl);
25035 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
25036 SelectionDAG &DAG) {
25037 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
25038 SDValue DataToStore = N->getValue();
25039 MVT VT = DataToStore.getSimpleValueType();
25040 MVT ScalarVT = VT.getScalarType();
25041 SDValue Mask = N->getMask();
25044 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
25045 "Expanding masked load is supported on AVX-512 target only!");
25047 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
25048 "Expanding masked load is supported for 32 and 64-bit types only!");
25050 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25051 "Cannot lower masked store op.");
25053 assert((ScalarVT.getSizeInBits() >= 32 ||
25054 (Subtarget.hasBWI() &&
25055 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25056 "Unsupported masked store op.");
25058 // This operation is legal for targets with VLX, but without
25059   // VLX the vector should be widened to 512 bits.
25060 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
25061 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25063 // Mask element has to be i1.
25064 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25065 "Unexpected mask type");
25067 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25069 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
25070 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25071 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
25072 Mask, N->getMemoryVT(), N->getMemOperand(),
25073 N->isTruncatingStore(), N->isCompressingStore());
25076 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
25077 SelectionDAG &DAG) {
25078 assert(Subtarget.hasAVX2() &&
25079 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
25081 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
25083 MVT VT = Op.getSimpleValueType();
25084 SDValue Index = N->getIndex();
25085 SDValue Mask = N->getMask();
25086 SDValue Src0 = N->getValue();
25087 MVT IndexVT = Index.getSimpleValueType();
25088 MVT MaskVT = Mask.getSimpleValueType();
25090 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
25092 // If the index is v2i32, we're being called by type legalization.
25093 if (IndexVT == MVT::v2i32)
25096   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
25097 // need to widen until one is.
25099 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25100 !IndexVT.is512BitVector()) {
25101 // Determine how much we need to widen by to get a 512-bit type.
25102 unsigned Factor = std::min(512/VT.getSizeInBits(),
25103 512/IndexVT.getSizeInBits());
25105 unsigned NumElts = VT.getVectorNumElements() * Factor;
25107 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25108 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25109 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25111 Src0 = ExtendToType(Src0, VT, DAG);
25112 Index = ExtendToType(Index, IndexVT, DAG);
25113 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25116 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
25118 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25119 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
25120 N->getMemOperand());
25121 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
25122 NewGather, DAG.getIntPtrConstant(0, dl));
25123 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
25126 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
25127 SelectionDAG &DAG) const {
25128 // TODO: Eventually, the lowering of these nodes should be informed by or
25129 // deferred to the GC strategy for the function in which they appear. For
25130 // now, however, they must be lowered to something. Since they are logically
25131 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25132   // require special handling for these nodes), lower them as literal NOOPs for
// the time being.
25134 SmallVector<SDValue, 2> Ops;
25136 Ops.push_back(Op.getOperand(0));
25137 if (Op->getGluedNode())
25138 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25141 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25142 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25147 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
25148 SelectionDAG &DAG) const {
25149 // TODO: Eventually, the lowering of these nodes should be informed by or
25150 // deferred to the GC strategy for the function in which they appear. For
25151 // now, however, they must be lowered to something. Since they are logically
25152 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25153   // require special handling for these nodes), lower them as literal NOOPs for
// the time being.
25155 SmallVector<SDValue, 2> Ops;
25157 Ops.push_back(Op.getOperand(0));
25158 if (Op->getGluedNode())
25159 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25162 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25163 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25168 /// Provide custom lowering hooks for some operations.
25169 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
25170 switch (Op.getOpcode()) {
25171 default: llvm_unreachable("Should not custom lower this!");
25172 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
25173 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
25174 return LowerCMP_SWAP(Op, Subtarget, DAG);
25175 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
25176 case ISD::ATOMIC_LOAD_ADD:
25177 case ISD::ATOMIC_LOAD_SUB:
25178 case ISD::ATOMIC_LOAD_OR:
25179 case ISD::ATOMIC_LOAD_XOR:
25180 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
25181 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
25182 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
25183 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
25184 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
25185 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
25186 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
25187 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
25188 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
25189 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
25190 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
25191 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
25192 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
25193 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
25194 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
25195 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
25196 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
25197 case ISD::SHL_PARTS:
25198 case ISD::SRA_PARTS:
25199 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
25200 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
25201 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
25202 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
25203 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
25204 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
25205 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
25206 case ISD::ZERO_EXTEND_VECTOR_INREG:
25207 case ISD::SIGN_EXTEND_VECTOR_INREG:
25208 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
25209 case ISD::FP_TO_SINT:
25210 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
25211 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
25212 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
25213 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
25215 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
25216 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
25217 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
25218 case ISD::SETCC: return LowerSETCC(Op, DAG);
25219 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
25220 case ISD::SELECT: return LowerSELECT(Op, DAG);
25221 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
25222 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
25223 case ISD::VASTART: return LowerVASTART(Op, DAG);
25224 case ISD::VAARG: return LowerVAARG(Op, DAG);
25225 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
25226 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
25227 case ISD::INTRINSIC_VOID:
25228 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
25229 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
25230 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
25231 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
25232 case ISD::FRAME_TO_ARGS_OFFSET:
25233 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
25234 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
25235 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
25236 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
25237 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
25238 case ISD::EH_SJLJ_SETUP_DISPATCH:
25239 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
25240 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
25241 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
25242 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
25244 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
25246 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
25247 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
25249 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
25250 case ISD::UMUL_LOHI:
25251 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
25253 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
25256 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
25262 case ISD::UMULO: return LowerXALUO(Op, DAG);
25263 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
25264 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
25265 case ISD::ADDCARRY:
25266 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
25268 case ISD::SUB: return LowerADD_SUB(Op, DAG);
25272 case ISD::UMIN: return LowerMINMAX(Op, DAG);
25273 case ISD::ABS: return LowerABS(Op, DAG);
25274 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
25275 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
25276 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
25277 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
25278 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
25279 case ISD::GC_TRANSITION_START:
25280 return LowerGC_TRANSITION_START(Op, DAG);
25281 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
25285 /// Places new result values for the node in Results (their number
25286 /// and types must exactly match those of the original return values of
25287 /// the node), or leaves Results empty, which indicates that the node is not
25288 /// to be custom lowered after all.
25289 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
25290 SmallVectorImpl<SDValue> &Results,
25291 SelectionDAG &DAG) const {
25292 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
25294 if (!Res.getNode())
25297 assert((N->getNumValues() <= Res->getNumValues()) &&
25298 "Lowering returned the wrong number of results!");
25300   // Place new result values based on the result number of N.
25301   // In some cases (LowerSINT_TO_FP for example) Res has more result values
25302   // than the original node; the chain (the last value) should be dropped.
25303 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
25304 Results.push_back(Res.getValue(I));
25307 /// Replace a node with an illegal result type with a new node built out of
/// custom code.
25309 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
25310 SmallVectorImpl<SDValue>&Results,
25311 SelectionDAG &DAG) const {
25313 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25314 switch (N->getOpcode()) {
25316 llvm_unreachable("Do not know how to custom type legalize this operation!");
25317 case X86ISD::AVG: {
25318 // Legalize types for X86ISD::AVG by expanding vectors.
25319 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25321 auto InVT = N->getValueType(0);
25322 assert(InVT.getSizeInBits() < 128);
25323 assert(128 % InVT.getSizeInBits() == 0);
25324 unsigned NumConcat = 128 / InVT.getSizeInBits();
25326 EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
25327 InVT.getVectorElementType(),
25328 NumConcat * InVT.getVectorNumElements());
25330 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
25331 Ops[0] = N->getOperand(0);
25332 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25333 Ops[0] = N->getOperand(1);
25334 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25336 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
25337 if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
25338 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
25339 DAG.getIntPtrConstant(0, dl));
25340 Results.push_back(Res);
25344 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
25345     // setCC result type is v2i1 because type legalization will end up with
25346 // a v4i1 setcc plus an extend.
25347 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
25348 if (N->getOperand(0).getValueType() != MVT::v2f32)
25350 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
25351 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25352 N->getOperand(0), UNDEF);
25353 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25354 N->getOperand(1), UNDEF);
25355 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
25357 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25358 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25359 DAG.getIntPtrConstant(0, dl));
25360 Results.push_back(Res);
25363 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
25364 case X86ISD::FMINC:
25366 case X86ISD::FMAXC:
25367 case X86ISD::FMAX: {
25368 EVT VT = N->getValueType(0);
25369 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
25370 SDValue UNDEF = DAG.getUNDEF(VT);
25371 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25372 N->getOperand(0), UNDEF);
25373 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25374 N->getOperand(1), UNDEF);
25375 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
25383 case ISD::UDIVREM: {
25384 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
25385 Results.push_back(V);
25388 case ISD::FP_TO_SINT:
25389 case ISD::FP_TO_UINT: {
25390 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
25391 EVT VT = N->getValueType(0);
25392 SDValue Src = N->getOperand(0);
25393 EVT SrcVT = Src.getValueType();
25395 if (VT == MVT::v2i32) {
25396 assert((IsSigned || Subtarget.hasAVX512()) &&
25397 "Can only handle signed conversion without AVX512");
25398 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25399 if (Src.getValueType() == MVT::v2f64) {
25400 MVT ResVT = MVT::v4i32;
25401 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
25402 if (!IsSigned && !Subtarget.hasVLX()) {
25403 // Widen to 512-bits.
25404 ResVT = MVT::v8i32;
25405 Opc = ISD::FP_TO_UINT;
25406 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
25407 DAG.getUNDEF(MVT::v8f64),
25408 Src, DAG.getIntPtrConstant(0, dl));
25410 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
25411 bool WidenType = getTypeAction(*DAG.getContext(),
25412 MVT::v2i32) == TypeWidenVector;
25413 ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
25414 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
25415 DAG.getIntPtrConstant(0, dl));
25416 Results.push_back(Res);
25419 if (SrcVT == MVT::v2f32) {
25420 SDValue Idx = DAG.getIntPtrConstant(0, dl);
25421 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25422 DAG.getUNDEF(MVT::v2f32));
25423 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
25424 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
25425 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25426 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
25427 Results.push_back(Res);
25431 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
25432 // so early out here.
25436 if (Subtarget.hasDQI() && VT == MVT::i64 &&
25437 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
25438 assert(!Subtarget.is64Bit() && "i64 should be legal");
25439 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
25440 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
25441 // TODO: Use 128-bit vectors for f64 case?
25442 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
25443 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
25444 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
25446 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
25447 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
25448 DAG.getConstantFP(0.0, dl, VecInVT), Src,
25450 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
25451 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
25452 Results.push_back(Res);
25456 std::pair<SDValue,SDValue> Vals =
25457 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
25458 SDValue FIST = Vals.first, StackSlot = Vals.second;
25459 if (FIST.getNode()) {
25460 // Return a load from the stack slot.
25461 if (StackSlot.getNode())
25463 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
25465 Results.push_back(FIST);
25469 case ISD::SINT_TO_FP: {
25470 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
25471 SDValue Src = N->getOperand(0);
25472 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
25474 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
25477 case ISD::UINT_TO_FP: {
25478 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25479 EVT VT = N->getValueType(0);
25480 if (VT != MVT::v2f32)
25482 SDValue Src = N->getOperand(0);
25483 EVT SrcVT = Src.getValueType();
25484 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
25485 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
25488 if (SrcVT != MVT::v2i32)
25490 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
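    // The sequence below uses the classic exponent-bias trick: OR the zero
    // extended value into the mantissa of 2^52 and subtract 2^52, which yields
    // the integer exactly. Scalar sketch (illustrative only; memcpy from
    // <cstring> is used for the bit pattern):
    //
    //   double u32_to_f64(uint32_t X) {
    //     uint64_t Bits = 0x4330000000000000ULL | X; // bit pattern of 2^52 + X
    //     double D;
    //     memcpy(&D, &Bits, sizeof(D));
    //     return D - 4503599627370496.0;             // subtract 2^52
    //   }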
25492 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
25493 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
25494 DAG.getBitcast(MVT::v2i64, VBias));
25495 Or = DAG.getBitcast(MVT::v2f64, Or);
25496 // TODO: Are there any fast-math-flags to propagate here?
25497 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
25498 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
25501 case ISD::FP_ROUND: {
25502 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
25504 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
25505 Results.push_back(V);
25508 case ISD::FP_EXTEND: {
25509 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25510 // No other ValueType for FP_EXTEND should reach this point.
25511 assert(N->getValueType(0) == MVT::v2f32 &&
25512 "Do not know how to legalize this Node");
25515 case ISD::INTRINSIC_W_CHAIN: {
25516 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25518 default : llvm_unreachable("Do not know how to custom type "
25519 "legalize this intrinsic operation!");
25520 case Intrinsic::x86_rdtsc:
25521 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25523 case Intrinsic::x86_rdtscp:
25524 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25526 case Intrinsic::x86_rdpmc:
25527 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25529 case Intrinsic::x86_xgetbv:
25530 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25533 case ISD::INTRINSIC_WO_CHAIN: {
25534 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25535 Results.push_back(V);
25538 case ISD::READCYCLECOUNTER: {
25539 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25542 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25543 EVT T = N->getValueType(0);
25544 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25545 bool Regs64bit = T == MVT::i128;
25546 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
25547 SDValue cpInL, cpInH;
25548 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25549 DAG.getConstant(0, dl, HalfT));
25550 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25551 DAG.getConstant(1, dl, HalfT));
25552 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25553 Regs64bit ? X86::RAX : X86::EAX,
25555 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25556 Regs64bit ? X86::RDX : X86::EDX,
25557 cpInH, cpInL.getValue(1));
25558 SDValue swapInL, swapInH;
25559 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25560 DAG.getConstant(0, dl, HalfT));
25561 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25562 DAG.getConstant(1, dl, HalfT));
25564 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25565 swapInH, cpInH.getValue(1));
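    // At this point the expected value sits in EDX:EAX (RDX:RAX for the
    // 16-byte form) and the replacement value is being routed to ECX:EBX
    // (RCX:RBX), matching the operand convention of CMPXCHG8B/CMPXCHG16B.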
25566     // If the current function needs the base pointer, RBX,
25567     // we shouldn't use cmpxchg directly.
25568     // Indeed, the lowering of that instruction will clobber
25569     // that register, and since RBX will be a reserved register,
25570     // the register allocator will not make sure its value is
25571     // properly saved and restored around this live range.
25572 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25574 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25575 unsigned BasePtr = TRI->getBaseRegister();
25576 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25577 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25578 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25579 // ISel prefers the LCMPXCHG64 variant.
25580 // If that assert breaks, that means it is not the case anymore,
25581 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25582 // not just EBX. This is a matter of accepting i64 input for that
25583       // pseudo, and restoring into the register of the right width
25584       // in the expand pseudo. Everything else should just work.
25585 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25586 "Saving only half of the RBX");
25587 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25588 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25589 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25590 Regs64bit ? X86::RBX : X86::EBX,
25591 HalfT, swapInH.getValue(1));
25592 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25594 /*Glue*/ RBXSave.getValue(2)};
25595 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25598 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25599 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25600 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25601 swapInH.getValue(1));
25602 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25603 swapInL.getValue(1)};
25604 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25606 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25607 Regs64bit ? X86::RAX : X86::EAX,
25608 HalfT, Result.getValue(1));
25609 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25610 Regs64bit ? X86::RDX : X86::EDX,
25611 HalfT, cpOutL.getValue(2));
25612 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25614 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25615 MVT::i32, cpOutH.getValue(2));
25616 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25617 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25619 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25620 Results.push_back(Success);
25621 Results.push_back(EFLAGS.getValue(1));
25624 case ISD::ATOMIC_SWAP:
25625 case ISD::ATOMIC_LOAD_ADD:
25626 case ISD::ATOMIC_LOAD_SUB:
25627 case ISD::ATOMIC_LOAD_AND:
25628 case ISD::ATOMIC_LOAD_OR:
25629 case ISD::ATOMIC_LOAD_XOR:
25630 case ISD::ATOMIC_LOAD_NAND:
25631 case ISD::ATOMIC_LOAD_MIN:
25632 case ISD::ATOMIC_LOAD_MAX:
25633 case ISD::ATOMIC_LOAD_UMIN:
25634 case ISD::ATOMIC_LOAD_UMAX:
25635 case ISD::ATOMIC_LOAD: {
25636 // Delegate to generic TypeLegalization. Situations we can really handle
25637 // should have already been dealt with by AtomicExpandPass.cpp.
25640 case ISD::BITCAST: {
25641 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25642 EVT DstVT = N->getValueType(0);
25643 EVT SrcVT = N->getOperand(0).getValueType();
25645     // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
25646 // we can split using the k-register rather than memory.
25647 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25648 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25650 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25651 Lo = DAG.getBitcast(MVT::i32, Lo);
25652 Hi = DAG.getBitcast(MVT::i32, Hi);
25653 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25654 Results.push_back(Res);
25658 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25659 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
25660 SrcVT.isVector() && isTypeLegal(SrcVT)) {
25662 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25663 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
25664 Lo = DAG.getBitcast(CastVT, Lo);
25665 Hi = DAG.getBitcast(CastVT, Hi);
25666 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
25667 Results.push_back(Res);
25671 if (SrcVT != MVT::f64 ||
25672 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25675 unsigned NumElts = DstVT.getVectorNumElements();
25676 EVT SVT = DstVT.getVectorElementType();
25677 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25678 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25679 MVT::v2f64, N->getOperand(0));
25680 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25682 if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
25683 // If we are legalizing vectors by widening, we already have the desired
25684 // legal vector type, just return it.
25685 Results.push_back(ToVecInt);
25689 SmallVector<SDValue, 8> Elts;
25690 for (unsigned i = 0, e = NumElts; i != e; ++i)
25691 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25692 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25694 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25697 case ISD::MGATHER: {
25698 EVT VT = N->getValueType(0);
25699 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25700 auto *Gather = cast<MaskedGatherSDNode>(N);
25701 SDValue Index = Gather->getIndex();
25702 if (Index.getValueType() != MVT::v2i64)
25704 SDValue Mask = Gather->getMask();
25705 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25706 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25707 Gather->getValue(),
25708 DAG.getUNDEF(MVT::v2f32));
25709 if (!Subtarget.hasVLX()) {
25710 // We need to widen the mask, but the instruction will only use 2
25711 // of its elements. So we can use undef.
25712 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25713 DAG.getUNDEF(MVT::v2i1));
25714 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25716 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25717 Index, Gather->getScale() };
25718 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25719 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25720 Gather->getMemoryVT(), Gather->getMemOperand());
25721 Results.push_back(Res);
25722 Results.push_back(Res.getValue(2));
25725 if (VT == MVT::v2i32) {
25726 auto *Gather = cast<MaskedGatherSDNode>(N);
25727 SDValue Index = Gather->getIndex();
25728 SDValue Mask = Gather->getMask();
25729 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25730 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25731 Gather->getValue(),
25732 DAG.getUNDEF(MVT::v2i32));
25733 // If the index is v2i64 we can use it directly.
25734 if (Index.getValueType() == MVT::v2i64 &&
25735 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25736 if (!Subtarget.hasVLX()) {
25737 // We need to widen the mask, but the instruction will only use 2
25738 // of its elements. So we can use undef.
25739 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25740 DAG.getUNDEF(MVT::v2i1));
25741 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25743 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25744 Index, Gather->getScale() };
25745 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25746 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25747 Gather->getMemoryVT(), Gather->getMemOperand());
25748 SDValue Chain = Res.getValue(2);
25749 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25750 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25751 DAG.getIntPtrConstant(0, dl));
25752 Results.push_back(Res);
25753 Results.push_back(Chain);
25756 EVT IndexVT = Index.getValueType();
25757 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25758 IndexVT.getScalarType(), 4);
25759 // Otherwise we need to custom widen everything to avoid promotion.
25760 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25761 DAG.getUNDEF(IndexVT));
25762 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25763 DAG.getConstant(0, dl, MVT::v2i1));
25764 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25765 Index, Gather->getScale() };
25766 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25767 Gather->getMemoryVT(), dl, Ops,
25768 Gather->getMemOperand());
25769 SDValue Chain = Res.getValue(1);
25770 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25771 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25772 DAG.getIntPtrConstant(0, dl));
25773 Results.push_back(Res);
25774 Results.push_back(Chain);
25782 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25783 switch ((X86ISD::NodeType)Opcode) {
25784 case X86ISD::FIRST_NUMBER: break;
25785 case X86ISD::BSF: return "X86ISD::BSF";
25786 case X86ISD::BSR: return "X86ISD::BSR";
25787 case X86ISD::SHLD: return "X86ISD::SHLD";
25788 case X86ISD::SHRD: return "X86ISD::SHRD";
25789 case X86ISD::FAND: return "X86ISD::FAND";
25790 case X86ISD::FANDN: return "X86ISD::FANDN";
25791 case X86ISD::FOR: return "X86ISD::FOR";
25792 case X86ISD::FXOR: return "X86ISD::FXOR";
25793 case X86ISD::FILD: return "X86ISD::FILD";
25794 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25795 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25796 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25797 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25798 case X86ISD::FLD: return "X86ISD::FLD";
25799 case X86ISD::FST: return "X86ISD::FST";
25800 case X86ISD::CALL: return "X86ISD::CALL";
25801 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25802 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25803 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25804 case X86ISD::BT: return "X86ISD::BT";
25805 case X86ISD::CMP: return "X86ISD::CMP";
25806 case X86ISD::COMI: return "X86ISD::COMI";
25807 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25808 case X86ISD::CMPM: return "X86ISD::CMPM";
25809 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25810 case X86ISD::SETCC: return "X86ISD::SETCC";
25811 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25812 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25813 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25814 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25815 case X86ISD::CMOV: return "X86ISD::CMOV";
25816 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25817 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25818 case X86ISD::IRET: return "X86ISD::IRET";
25819 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25820 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25821 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25822 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25823 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25824 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25825 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25826 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25827 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25828 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25829 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25830 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25831 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25832 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25833 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25834 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25835 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25836 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25837 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25838 case X86ISD::HADD: return "X86ISD::HADD";
25839 case X86ISD::HSUB: return "X86ISD::HSUB";
25840 case X86ISD::FHADD: return "X86ISD::FHADD";
25841 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25842 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25843 case X86ISD::FMAX: return "X86ISD::FMAX";
25844 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25845 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25846 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25847 case X86ISD::FMIN: return "X86ISD::FMIN";
25848 case X86ISD::FMINS: return "X86ISD::FMINS";
25849 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25850 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25851 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25852 case X86ISD::FMINC: return "X86ISD::FMINC";
25853 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25854 case X86ISD::FRCP: return "X86ISD::FRCP";
25855 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25856 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25857 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25858 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25859 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25860 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25861 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25862 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25863 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25864 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25865 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25866 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25867 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25868 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25869 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25870 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25871 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25872 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25873 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25874 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25875 case X86ISD::LADD: return "X86ISD::LADD";
25876 case X86ISD::LSUB: return "X86ISD::LSUB";
25877 case X86ISD::LOR: return "X86ISD::LOR";
25878 case X86ISD::LXOR: return "X86ISD::LXOR";
25879 case X86ISD::LAND: return "X86ISD::LAND";
25880 case X86ISD::LINC: return "X86ISD::LINC";
25881 case X86ISD::LDEC: return "X86ISD::LDEC";
25882 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25883 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25884 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25885 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25886 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25887 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25888 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25889 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25890 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25891 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25892 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25893 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25894 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25895 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25896 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25897 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25898 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25899 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25900 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25901 case X86ISD::VSHL: return "X86ISD::VSHL";
25902 case X86ISD::VSRL: return "X86ISD::VSRL";
25903 case X86ISD::VSRA: return "X86ISD::VSRA";
25904 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25905 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25906 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25907 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25908 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25909 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25910 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25911 case X86ISD::CMPP: return "X86ISD::CMPP";
25912 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25913 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25914 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25915 case X86ISD::ADD: return "X86ISD::ADD";
25916 case X86ISD::SUB: return "X86ISD::SUB";
25917 case X86ISD::ADC: return "X86ISD::ADC";
25918 case X86ISD::SBB: return "X86ISD::SBB";
25919 case X86ISD::SMUL: return "X86ISD::SMUL";
25920 case X86ISD::UMUL: return "X86ISD::UMUL";
25921 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25922 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25923 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25924 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25925 case X86ISD::INC: return "X86ISD::INC";
25926 case X86ISD::DEC: return "X86ISD::DEC";
25927 case X86ISD::OR: return "X86ISD::OR";
25928 case X86ISD::XOR: return "X86ISD::XOR";
25929 case X86ISD::AND: return "X86ISD::AND";
25930 case X86ISD::BEXTR: return "X86ISD::BEXTR";
25931 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25932 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25933 case X86ISD::PTEST: return "X86ISD::PTEST";
25934 case X86ISD::TESTP: return "X86ISD::TESTP";
25935 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25936 case X86ISD::KTEST: return "X86ISD::KTEST";
25937 case X86ISD::KADD: return "X86ISD::KADD";
25938 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25939 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25940 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25941 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25942 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25943 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25944 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25945 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25946 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25947 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25948 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25949 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25950 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25951 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25952 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25953 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25954 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25955 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25956 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25957 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25958 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25959 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25960 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25961 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25962 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25963 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25964 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25965 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25966 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25967 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25968 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25969 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25970 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25971 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25972 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25973 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25974 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25975 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25976 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25977 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25978 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25979 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25980 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25981 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25982 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25983 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25984 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25985 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25986 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25987 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25988 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25989 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25990 case X86ISD::SAHF: return "X86ISD::SAHF";
25991 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25992 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25993 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25994 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25995 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25996 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25997 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25998 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25999 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
26000 case X86ISD::FMSUB: return "X86ISD::FMSUB";
26001 case X86ISD::FNMADD: return "X86ISD::FNMADD";
26002 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
26003 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
26004 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
26005 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
26006 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
26007 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
26008 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
26009 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
26010 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
26011 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
26012 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
26013 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
26014 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
26015 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
26016 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
26017 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
26018 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
26019 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
26020 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
26021 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
26022 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
26023 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
26024 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
26025 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
26026 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
26027 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
26028 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
26029 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
26030 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
26031 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
26032 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
26033 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
26034 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
26035 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
26036 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
26037 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
26038 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
26039 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
26040 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
26041 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
26042 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
26043 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
26044 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
26045 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
26046 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
26047 case X86ISD::XTEST: return "X86ISD::XTEST";
26048 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
26049 case X86ISD::EXPAND: return "X86ISD::EXPAND";
26050 case X86ISD::SELECT: return "X86ISD::SELECT";
26051 case X86ISD::SELECTS: return "X86ISD::SELECTS";
26052 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
26053 case X86ISD::RCP14: return "X86ISD::RCP14";
26054 case X86ISD::RCP14S: return "X86ISD::RCP14S";
26055 case X86ISD::RCP28: return "X86ISD::RCP28";
26056 case X86ISD::RCP28S: return "X86ISD::RCP28S";
26057 case X86ISD::EXP2: return "X86ISD::EXP2";
26058 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
26059 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
26060 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
26061 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
26062 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
26063 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
26064 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
26065 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
26066 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
26067 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
26068 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
26069 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
26070 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
26071 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
26072 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
26073 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
26074 case X86ISD::SCALEF: return "X86ISD::SCALEF";
26075 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
26076 case X86ISD::ADDS: return "X86ISD::ADDS";
26077 case X86ISD::SUBS: return "X86ISD::SUBS";
26078 case X86ISD::AVG: return "X86ISD::AVG";
26079 case X86ISD::MULHRS: return "X86ISD::MULHRS";
26080 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
26081 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
26082 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
26083 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
26084 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
26085 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
26086 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
26087 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
26088 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
26089 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
26090 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
26091 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
26092 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
26093 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
26094 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
26095 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
26096 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
26097 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
26098 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
26099 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
26100 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
26101 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
26102 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
26103 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
26104 case X86ISD::LWPINS: return "X86ISD::LWPINS";
26105 case X86ISD::MGATHER: return "X86ISD::MGATHER";
26106 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
26107 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
26108 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
26109 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
26110 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
26111 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
26112 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
26113 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
26114 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
26115 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
26116 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
26117 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
26118 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
26119 }
26120 return nullptr;
26121 }
26123 /// Return true if the addressing mode represented by AM is legal for this
26124 /// target, for a load/store of the specified type.
26125 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
26126 const AddrMode &AM, Type *Ty,
26128 Instruction *I) const {
26129 // X86 supports extremely general addressing modes.
26130 CodeModel::Model M = getTargetMachine().getCodeModel();
26132 // X86 allows a sign-extended 32-bit immediate field as a displacement.
26133 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
26134 return false;
26136 if (AM.BaseGV) {
26137 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
26139 // If a reference to this global requires an extra load, we can't fold it.
26140 if (isGlobalStubReference(GVFlags))
26141 return false;
26143 // If BaseGV requires a register for the PIC base, we cannot also have a
26144 // BaseReg specified.
26145 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
26146 return false;
26148 // If lower 4G is not available, then we must use rip-relative addressing.
26149 if ((M != CodeModel::Small || isPositionIndependent()) &&
26150 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
26151 return false;
26152 }
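// The remaining constraint is the scale. The SIB byte encodes scales of 1, 2,
// 4 and 8 directly; 3, 5 and 9 are only reachable by reusing the index
// register as the base (base + 2/4/8*index), so they can only be accepted
// below when no base register is already in use.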
26154 switch (AM.Scale) {
26155 case 0:
26156 case 1:
26157 case 2:
26158 case 4:
26159 case 8:
26160 // These scales always work.
26161 break;
26162 case 3:
26163 case 5:
26164 case 9:
26165 // These scales are formed with basereg+scalereg. Only accept if there is
26166 // no basereg yet.
26167 if (AM.HasBaseReg)
26168 return false;
26169 break;
26170 default: // Other stuff never works.
26171 return false;
26172 }
26174 return true;
26175 }
26177 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
26178 unsigned Bits = Ty->getScalarSizeInBits();
26180 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
26181 // particularly cheaper than those without.
26182 if (Bits == 8)
26183 return false;
26185 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
26186 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
26187 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
26188 return false;
26190 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
26191 // shifts just as cheap as scalar ones.
26192 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
26193 return false;
26195 // AVX512BW has shifts such as vpsllvw.
26196 if (Subtarget.hasBWI() && Bits == 16)
26197 return false;
26199 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
26200 // fully general vector.
26201 return true;
26202 }
26204 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
26205 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26207 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
26208 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
26209 return NumBits1 > NumBits2;
26212 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
26213 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26216 if (!isTypeLegal(EVT::getEVT(Ty1)))
26219 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
26221 // Assuming the caller doesn't have a zeroext or signext return parameter,
26222 // truncation all the way down to i1 is valid.
26226 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
26227 return isInt<32>(Imm);
26230 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
26231 // Can also use sub to handle negated immediates.
26232 return isInt<32>(Imm);
26235 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
26236 if (!VT1.isInteger() || !VT2.isInteger())
26238 unsigned NumBits1 = VT1.getSizeInBits();
26239 unsigned NumBits2 = VT2.getSizeInBits();
26240 return NumBits1 > NumBits2;
26243 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
26244 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26245 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
26248 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
26249 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26250 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
26253 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
26254 EVT VT1 = Val.getValueType();
26255 if (isZExtFree(VT1, VT2))
26258 if (Val.getOpcode() != ISD::LOAD)
26261 if (!VT1.isSimple() || !VT1.isInteger() ||
26262 !VT2.isSimple() || !VT2.isInteger())
26265 switch (VT1.getSimpleVT().SimpleTy) {
26270 // X86 has 8, 16, and 32-bit zero-extending loads.
26277 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26278 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26280 // There is no extending load for vXi1.
26281 if (SrcVT.getScalarType() == MVT::i1)
26288 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26289 if (!Subtarget.hasAnyFMA())
26292 VT = VT.getScalarType();
26294 if (!VT.isSimple())
26297 switch (VT.getSimpleVT().SimpleTy) {
26308 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26309 // i16 instructions are longer (0x66 prefix) and potentially slower.
26310 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
26313 /// Targets can use this to indicate that they only support *some*
26314 /// VECTOR_SHUFFLE operations, those with specific masks.
26315 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26316 /// are assumed to be legal.
26317 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26318 if (!VT.isSimple())
26321 // Not for i1 vectors
26322 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26325 // Very little shuffling can be done for 64-bit vectors right now.
26326 if (VT.getSimpleVT().getSizeInBits() == 64)
26329 // We only care that the types being shuffled are legal. The lowering can
26330 // handle any possible shuffle mask that results.
26331 return isTypeLegal(VT.getSimpleVT());
26335 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
26337 // Don't convert an 'and' into a shuffle that we don't directly support.
26338 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
26339 if (!Subtarget.hasAVX2())
26340 if (VT == MVT::v32i8 || VT == MVT::v16i16)
26343 // Just delegate to the generic legality, clear masks aren't special.
26344 return isShuffleMaskLegal(Mask, VT);
26347 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26348 // If the subtarget is using retpolines, we must not generate jump tables.
26349 if (Subtarget.useRetpoline())
26352 // Otherwise, fallback on the generic logic.
26353 return TargetLowering::areJTsAllowed(Fn);
26356 //===----------------------------------------------------------------------===//
26357 // X86 Scheduler Hooks
26358 //===----------------------------------------------------------------------===//
26360 /// Utility function to emit xbegin specifying the start of an RTM region.
26361 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26362 const TargetInstrInfo *TII) {
26363 DebugLoc DL = MI.getDebugLoc();
26365 const BasicBlock *BB = MBB->getBasicBlock();
26366 MachineFunction::iterator I = ++MBB->getIterator();
26368 // For the v = xbegin(), we generate
26377 // eax = # XABORT_DEF
26381 // v = phi(s0/mainBB, s1/fallBB)
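// (s0 is the -1 produced on the successful-start path, s1 is the abort status
// that the hardware leaves in EAX when the transaction aborts.)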
26383 MachineBasicBlock *thisMBB = MBB;
26384 MachineFunction *MF = MBB->getParent();
26385 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26386 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26387 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26388 MF->insert(I, mainMBB);
26389 MF->insert(I, fallMBB);
26390 MF->insert(I, sinkMBB);
26392 // Transfer the remainder of BB and its successor edges to sinkMBB.
26393 sinkMBB->splice(sinkMBB->begin(), MBB,
26394 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26395 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26397 MachineRegisterInfo &MRI = MF->getRegInfo();
26398 unsigned DstReg = MI.getOperand(0).getReg();
26399 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26400 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26401 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26405 // # fallthrough to mainMBB
26406 // # abort branches to fallMBB
26407 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26408 thisMBB->addSuccessor(mainMBB);
26409 thisMBB->addSuccessor(fallMBB);
26412 // mainDstReg := -1
26413 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26414 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26415 mainMBB->addSuccessor(sinkMBB);
26418 // ; pseudo instruction to model hardware's definition from XABORT
26419 // EAX := XABORT_DEF
26420 // fallDstReg := EAX
26421 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26422 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26424 fallMBB->addSuccessor(sinkMBB);
26427 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26428 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26429 .addReg(mainDstReg).addMBB(mainMBB)
26430 .addReg(fallDstReg).addMBB(fallMBB);
26432 MI.eraseFromParent();
26436 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26437 const X86Subtarget &Subtarget) {
26438 DebugLoc dl = MI.getDebugLoc();
26439 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26441 // insert input VAL into EAX
26442 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26443 .addReg(MI.getOperand(0).getReg());
26444 // insert zero to ECX
26445 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26447 // insert zero to EDX
26448 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26450 // insert WRPKRU instruction
26451 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26453 MI.eraseFromParent(); // The pseudo is gone now.
26457 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26458 const X86Subtarget &Subtarget) {
26459 DebugLoc dl = MI.getDebugLoc();
26460 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26462 // insert zero to ECX
26463 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26465 // insert RDPKRU instruction
26466 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26467 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26470 MI.eraseFromParent(); // The pseudo is gone now.
26474 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26475 const X86Subtarget &Subtarget,
26477 DebugLoc dl = MI.getDebugLoc();
26478 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26479 // Address into RAX/EAX, other two args into ECX, EDX.
26480 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26481 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26482 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26483 for (int i = 0; i < X86::AddrNumOperands; ++i)
26484 MIB.add(MI.getOperand(i));
26486 unsigned ValOps = X86::AddrNumOperands;
26487 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26488 .addReg(MI.getOperand(ValOps).getReg());
26489 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26490 .addReg(MI.getOperand(ValOps + 1).getReg());
26492 // The instruction doesn't actually take any operands though.
26493 BuildMI(*BB, MI, dl, TII->get(Opc));
26495 MI.eraseFromParent(); // The pseudo is gone now.
26499 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26500 const X86Subtarget &Subtarget) {
26501 DebugLoc dl = MI->getDebugLoc();
26502 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26503 // Address into RAX/EAX
26504 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26505 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26506 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26507 for (int i = 0; i < X86::AddrNumOperands; ++i)
26508 MIB.add(MI->getOperand(i));
26510 // The instruction doesn't actually take any operands though.
26511 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26513 MI->eraseFromParent(); // The pseudo is gone now.
26519 MachineBasicBlock *
26520 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26521 MachineBasicBlock *MBB) const {
26522 // Emit va_arg instruction on X86-64.
26524 // Operands to this pseudo-instruction:
26525 // 0 ) Output : destination address (reg)
26526 // 1-5) Input : va_list address (addr, i64mem)
26527 // 6 ) ArgSize : Size (in bytes) of vararg type
26528 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26529 // 8 ) Align : Alignment of type
26530 // 9 ) EFLAGS (implicit-def)
26532 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26533 static_assert(X86::AddrNumOperands == 5,
26534 "VAARG_64 assumes 5 address operands");
26536 unsigned DestReg = MI.getOperand(0).getReg();
26537 MachineOperand &Base = MI.getOperand(1);
26538 MachineOperand &Scale = MI.getOperand(2);
26539 MachineOperand &Index = MI.getOperand(3);
26540 MachineOperand &Disp = MI.getOperand(4);
26541 MachineOperand &Segment = MI.getOperand(5);
26542 unsigned ArgSize = MI.getOperand(6).getImm();
26543 unsigned ArgMode = MI.getOperand(7).getImm();
26544 unsigned Align = MI.getOperand(8).getImm();
26546 // Memory Reference
26547 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26548 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26549 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26551 // Machine Information
26552 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26553 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26554 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26555 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26556 DebugLoc DL = MI.getDebugLoc();
26558 // struct va_list {
26561 // i64 overflow_area (address)
26562 // i64 reg_save_area (address)
26564 // sizeof(va_list) = 24
26565 // alignment(va_list) = 8
26567 unsigned TotalNumIntRegs = 6;
26568 unsigned TotalNumXMMRegs = 8;
26569 bool UseGPOffset = (ArgMode == 1);
26570 bool UseFPOffset = (ArgMode == 2);
26571 unsigned MaxOffset = TotalNumIntRegs * 8 +
26572 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
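// i.e. MaxOffset is 48 when pulling GP arguments and 48 + 128 = 176 when
// pulling FP arguments, matching the layout of the register save area
// (6 x 8-byte GPR slots followed by 8 x 16-byte XMM slots).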
26574 /* Align ArgSize to a multiple of 8 */
26575 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26576 bool NeedsAlign = (Align > 8);
26578 MachineBasicBlock *thisMBB = MBB;
26579 MachineBasicBlock *overflowMBB;
26580 MachineBasicBlock *offsetMBB;
26581 MachineBasicBlock *endMBB;
26583 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26584 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26585 unsigned OffsetReg = 0;
26587 if (!UseGPOffset && !UseFPOffset) {
26588 // If we only pull from the overflow region, we don't create a branch.
26589 // We don't need to alter control flow.
26590 OffsetDestReg = 0; // unused
26591 OverflowDestReg = DestReg;
26593 offsetMBB = nullptr;
26594 overflowMBB = thisMBB;
26597 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26598 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26599 // If not, pull from overflow_area. (branch to overflowMBB)
26604 // offsetMBB overflowMBB
26609 // Registers for the PHI in endMBB
26610 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26611 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26613 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26614 MachineFunction *MF = MBB->getParent();
26615 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26616 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26617 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26619 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26621 // Insert the new basic blocks
26622 MF->insert(MBBIter, offsetMBB);
26623 MF->insert(MBBIter, overflowMBB);
26624 MF->insert(MBBIter, endMBB);
26626 // Transfer the remainder of MBB and its successor edges to endMBB.
26627 endMBB->splice(endMBB->begin(), thisMBB,
26628 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26629 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26631 // Make offsetMBB and overflowMBB successors of thisMBB
26632 thisMBB->addSuccessor(offsetMBB);
26633 thisMBB->addSuccessor(overflowMBB);
26635 // endMBB is a successor of both offsetMBB and overflowMBB
26636 offsetMBB->addSuccessor(endMBB);
26637 overflowMBB->addSuccessor(endMBB);
26639 // Load the offset value into a register
26640 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26641 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26645 .addDisp(Disp, UseFPOffset ? 4 : 0)
26647 .setMemRefs(MMOBegin, MMOEnd);
26649 // Check if there is enough room left to pull this argument.
26650 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26651 .addReg(OffsetReg)
26652 .addImm(MaxOffset + 8 - ArgSizeA8);
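// The comparison is equivalent to "offset + ArgSizeA8 > MaxOffset": offsets
// advance in multiples of 8, so adding 8 turns the strict inequality into the
// >= test performed by the COND_AE branch below.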
26654 // Branch to "overflowMBB" if offset >= max
26655 // Fall through to "offsetMBB" otherwise
26656 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26657 .addMBB(overflowMBB);
26660 // In offsetMBB, emit code to use the reg_save_area.
26662 assert(OffsetReg != 0);
26664 // Read the reg_save_area address.
26665 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26666 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26672 .setMemRefs(MMOBegin, MMOEnd);
26674 // Zero-extend the offset
26675 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26676 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26679 .addImm(X86::sub_32bit);
26681 // Add the offset to the reg_save_area to get the final address.
26682 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26683 .addReg(OffsetReg64)
26684 .addReg(RegSaveReg);
26686 // Compute the offset for the next argument
26687 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26688 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26690 .addImm(UseFPOffset ? 16 : 8);
26692 // Store it back into the va_list.
26693 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26697 .addDisp(Disp, UseFPOffset ? 4 : 0)
26699 .addReg(NextOffsetReg)
26700 .setMemRefs(MMOBegin, MMOEnd);
26703 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26704 .addMBB(endMBB);
26705 }
26708 // Emit code to use overflow area
26711 // Load the overflow_area address into a register.
26712 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26713 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26719 .setMemRefs(MMOBegin, MMOEnd);
26721 // If we need to align it, do so. Otherwise, just copy the address
26722 // to OverflowDestReg.
26724 // Align the overflow address
26725 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26726 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26728 // aligned_addr = (addr + (align-1)) & ~(align-1)
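// For example, with Align = 16 an overflow address of 0x1004 becomes
// (0x1004 + 15) & ~15 = 0x1010, i.e. the next 16-byte boundary.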
26729 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26730 .addReg(OverflowAddrReg)
26733 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26735 .addImm(~(uint64_t)(Align-1));
26737 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26738 .addReg(OverflowAddrReg);
26741 // Compute the next overflow address after this argument.
26742 // (the overflow address should be kept 8-byte aligned)
26743 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26744 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26745 .addReg(OverflowDestReg)
26746 .addImm(ArgSizeA8);
26748 // Store the new overflow address.
26749 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26755 .addReg(NextAddrReg)
26756 .setMemRefs(MMOBegin, MMOEnd);
26758 // If we branched, emit the PHI to the front of endMBB.
26760 BuildMI(*endMBB, endMBB->begin(), DL,
26761 TII->get(X86::PHI), DestReg)
26762 .addReg(OffsetDestReg).addMBB(offsetMBB)
26763 .addReg(OverflowDestReg).addMBB(overflowMBB);
26766 // Erase the pseudo instruction
26767 MI.eraseFromParent();
26772 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26773 MachineInstr &MI, MachineBasicBlock *MBB) const {
26774 // Emit code to save XMM registers to the stack. The ABI says that the
26775 // number of registers to save is given in %al, so it's theoretically
26776 // possible to do an indirect jump trick to avoid saving all of them,
26777 // however this code takes a simpler approach and just executes all
26778 // of the stores if %al is non-zero. It's less code, and it's probably
26779 // easier on the hardware branch predictor, and stores aren't all that
26780 // expensive anyway.
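// (Under the SysV x86-64 calling convention, %al carries an upper bound on the
// number of vector registers actually used by the variadic call, so testing it
// for zero below is the only case worth special-casing.)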
26782 // Create the new basic blocks. One block contains all the XMM stores,
26783 // and one block is the final destination regardless of whether any
26784 // stores were performed.
26785 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26786 MachineFunction *F = MBB->getParent();
26787 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26788 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26789 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26790 F->insert(MBBIter, XMMSaveMBB);
26791 F->insert(MBBIter, EndMBB);
26793 // Transfer the remainder of MBB and its successor edges to EndMBB.
26794 EndMBB->splice(EndMBB->begin(), MBB,
26795 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26796 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26798 // The original block will now fall through to the XMM save block.
26799 MBB->addSuccessor(XMMSaveMBB);
26800 // The XMMSaveMBB will fall through to the end block.
26801 XMMSaveMBB->addSuccessor(EndMBB);
26803 // Now add the instructions.
26804 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26805 DebugLoc DL = MI.getDebugLoc();
26807 unsigned CountReg = MI.getOperand(0).getReg();
26808 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26809 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26811 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26812 // If %al is 0, branch around the XMM save block.
26813 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26814 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26815 MBB->addSuccessor(EndMBB);
26818 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26819 // that was just emitted, but clearly shouldn't be "saved".
26820 assert((MI.getNumOperands() <= 3 ||
26821 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26822 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26823 "Expected last argument to be EFLAGS");
26824 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26825 // In the XMM save block, save all the XMM argument registers.
26826 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26827 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26828 MachineMemOperand *MMO = F->getMachineMemOperand(
26829 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26830 MachineMemOperand::MOStore,
26831 /*Size=*/16, /*Align=*/16);
26832 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26833 .addFrameIndex(RegSaveFrameIndex)
26834 .addImm(/*Scale=*/1)
26835 .addReg(/*IndexReg=*/0)
26836 .addImm(/*Disp=*/Offset)
26837 .addReg(/*Segment=*/0)
26838 .addReg(MI.getOperand(i).getReg())
26839 .addMemOperand(MMO);
26842 MI.eraseFromParent(); // The pseudo instruction is gone now.
26847 // The EFLAGS operand of SelectItr might be missing a kill marker
26848 // because there were multiple uses of EFLAGS, and ISel didn't know
26849 // which to mark. Figure out whether SelectItr should have had a
26850 // kill marker, and set it if it should. Returns the correct kill
26852 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26853 MachineBasicBlock* BB,
26854 const TargetRegisterInfo* TRI) {
26855 // Scan forward through BB for a use/def of EFLAGS.
26856 MachineBasicBlock::iterator miI(std::next(SelectItr));
26857 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26858 const MachineInstr& mi = *miI;
26859 if (mi.readsRegister(X86::EFLAGS))
26861 if (mi.definesRegister(X86::EFLAGS))
26862 break; // Should have kill-flag - update below.
26865 // If we hit the end of the block, check whether EFLAGS is live into a
26867 if (miI == BB->end()) {
26868 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26869 sEnd = BB->succ_end();
26870 sItr != sEnd; ++sItr) {
26871 MachineBasicBlock* succ = *sItr;
26872 if (succ->isLiveIn(X86::EFLAGS))
26877 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26878 // out. SelectMI should have a kill flag on EFLAGS.
26879 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26883 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26884 // together with other CMOV pseudo-opcodes into a single basic-block with
26885 // conditional jump around it.
26886 static bool isCMOVPseudo(MachineInstr &MI) {
26887 switch (MI.getOpcode()) {
26888 case X86::CMOV_FR32:
26889 case X86::CMOV_FR64:
26890 case X86::CMOV_GR8:
26891 case X86::CMOV_GR16:
26892 case X86::CMOV_GR32:
26893 case X86::CMOV_RFP32:
26894 case X86::CMOV_RFP64:
26895 case X86::CMOV_RFP80:
26896 case X86::CMOV_V2F64:
26897 case X86::CMOV_V2I64:
26898 case X86::CMOV_V4F32:
26899 case X86::CMOV_V4F64:
26900 case X86::CMOV_V4I64:
26901 case X86::CMOV_V16F32:
26902 case X86::CMOV_V8F32:
26903 case X86::CMOV_V8F64:
26904 case X86::CMOV_V8I64:
26905 case X86::CMOV_V8I1:
26906 case X86::CMOV_V16I1:
26907 case X86::CMOV_V32I1:
26908 case X86::CMOV_V64I1:
26916 // Helper function, which inserts PHI functions into SinkMBB:
26917 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26918 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
26919 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
26920 // the last PHI function inserted.
26921 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26922 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26923 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26924 MachineBasicBlock *SinkMBB) {
26925 MachineFunction *MF = TrueMBB->getParent();
26926 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26927 DebugLoc DL = MIItBegin->getDebugLoc();
26929 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26930 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26932 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26934 // As we are creating the PHIs, we have to be careful if there is more than
26935 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26936 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26937 // That also means that PHI construction must work forward from earlier to
26938 // later, and that the code must maintain a mapping from earlier PHI's
26939 // destination registers, and the registers that went into the PHI.
26940 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26941 MachineInstrBuilder MIB;
26943 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26944 unsigned DestReg = MIIt->getOperand(0).getReg();
26945 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26946 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26948 // If this CMOV we are generating is the opposite condition from
26949 // the jump we generated, then we have to swap the operands for the
26950 // PHI that is going to be generated.
26951 if (MIIt->getOperand(3).getImm() == OppCC)
26952 std::swap(Op1Reg, Op2Reg);
26954 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26955 Op1Reg = RegRewriteTable[Op1Reg].first;
26957 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26958 Op2Reg = RegRewriteTable[Op2Reg].second;
26960 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26966 // Add this PHI to the rewrite table.
26967 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
26973 // Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2).
26974 MachineBasicBlock *
26975 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26976 MachineInstr &SecondCascadedCMOV,
26977 MachineBasicBlock *ThisMBB) const {
26978 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26979 DebugLoc DL = FirstCMOV.getDebugLoc();
26981 // We lower cascaded CMOVs such as
26983 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26985 // to two successive branches.
26987 // Without this, we would add a PHI between the two jumps, which ends up
26988 // creating a few copies all around. For instance, for
26990 // (sitofp (zext (fcmp une)))
26992 // we would generate:
26994 // ucomiss %xmm1, %xmm0
26995 // movss <1.0f>, %xmm0
26996 // movaps %xmm0, %xmm1
26998 // xorps %xmm1, %xmm1
27001 // movaps %xmm1, %xmm0
27005 // because this custom-inserter would have generated:
27017 // A: X = ...; Y = ...
27019 // C: Z = PHI [X, A], [Y, B]
27021 // E: PHI [X, C], [Z, D]
27023 // If we lower both CMOVs in a single step, we can instead generate:
27035 // A: X = ...; Y = ...
27037 // E: PHI [X, A], [X, C], [Y, D]
27039 // Which, in our sitofp/fcmp example, gives us something like:
27041 // ucomiss %xmm1, %xmm0
27042 // movss <1.0f>, %xmm0
27045 // xorps %xmm0, %xmm0
27050 // We lower cascaded CMOV into two successive branches to the same block.
27051 // EFLAGS is used by both, so mark it as live in the second.
27052 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27053 MachineFunction *F = ThisMBB->getParent();
27054 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27055 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27056 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27058 MachineFunction::iterator It = ++ThisMBB->getIterator();
27059 F->insert(It, FirstInsertedMBB);
27060 F->insert(It, SecondInsertedMBB);
27061 F->insert(It, SinkMBB);
27063 // For a cascaded CMOV, we lower it to two successive branches to
27064 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
27065 // the FirstInsertedMBB.
27066 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
27068 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27069 // live into the sink and copy blocks.
27070 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27071 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
27072 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
27073 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
27074 SinkMBB->addLiveIn(X86::EFLAGS);
27077 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27078 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27079 std::next(MachineBasicBlock::iterator(FirstCMOV)),
27081 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27083 // Fallthrough block for ThisMBB.
27084 ThisMBB->addSuccessor(FirstInsertedMBB);
27085 // The true block target of the first branch is always SinkMBB.
27086 ThisMBB->addSuccessor(SinkMBB);
27087 // Fallthrough block for FirstInsertedMBB.
27088 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
27089 // The true block for the branch of FirstInsertedMBB.
27090 FirstInsertedMBB->addSuccessor(SinkMBB);
27091 // This is fallthrough.
27092 SecondInsertedMBB->addSuccessor(SinkMBB);
27094 // Create the conditional branch instructions.
27095 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
27096 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
27097 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27099 X86::CondCode SecondCC =
27100 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
27101 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
27102 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
27105 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
27106 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
27107 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
27108 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
27109 MachineInstrBuilder MIB =
27110 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
27111 .addReg(Op1Reg)
27112 .addMBB(SecondInsertedMBB)
27113 .addReg(Op2Reg)
27114 .addMBB(ThisMBB);
27116 // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
27117 // (the True operand of the SELECT_CC/CMOV nodes).
27118 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
27119 // Copy the PHI result to the register defined by the second CMOV.
27120 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
27121 TII->get(TargetOpcode::COPY),
27122 SecondCascadedCMOV.getOperand(0).getReg())
27123 .addReg(FirstCMOV.getOperand(0).getReg());
27125 // Now remove the CMOVs.
27126 FirstCMOV.eraseFromParent();
27127 SecondCascadedCMOV.eraseFromParent();
27132 MachineBasicBlock *
27133 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
27134 MachineBasicBlock *ThisMBB) const {
27135 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27136 DebugLoc DL = MI.getDebugLoc();
27138 // To "insert" a SELECT_CC instruction, we actually have to insert the
27139 // diamond control-flow pattern. The incoming instruction knows the
27140 // destination vreg to set, the condition code register to branch on, the
27141 // true/false values to select between and a branch opcode to use.
27146 // cmpTY ccX, r1, r2
27148 // fallthrough --> FalseMBB
27150 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
27151 // as described above, by inserting a BB, and then making a PHI at the join
27152 // point to select the true and false operands of the CMOV in the PHI.
27154 // The code also handles two different cases of multiple CMOV opcodes
27158 // In this case, there are multiple CMOVs in a row, all which are based on
27159 // the same condition setting (or the exact opposite condition setting).
27160 // In this case we can lower all the CMOVs using a single inserted BB, and
27161 // then make a number of PHIs at the join point to model the CMOVs. The only
27162 // trickiness here, is that in a case like:
27164 // t2 = CMOV cond1 t1, f1
27165 // t3 = CMOV cond1 t2, f2
27167 // when rewriting this into PHIs, we have to perform some renaming on the
27168 // temps since you cannot have a PHI operand refer to a PHI result earlier
27169 // in the same block. The "simple" but wrong lowering would be:
27171 // t2 = PHI t1(BB1), f1(BB2)
27172 // t3 = PHI t2(BB1), f2(BB2)
27174 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
27175 // renaming is to note that on the path through BB1, t2 is really just a
27176 // copy of t1, and do that renaming, properly generating:
27178 // t2 = PHI t1(BB1), f1(BB2)
27179 // t3 = PHI t1(BB1), f2(BB2)
27182 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
27183 // function - EmitLoweredCascadedSelect.
27185 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
27186 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27187 MachineInstr *LastCMOV = &MI;
27188 MachineBasicBlock::iterator NextMIIt =
27189 std::next(MachineBasicBlock::iterator(MI));
27191 // Check for case 1, where there are multiple CMOVs with the same condition
27192 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
27193 // number of jumps the most.
27195 if (isCMOVPseudo(MI)) {
27196 // See if we have a string of CMOVS with the same condition.
27197 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27198 (NextMIIt->getOperand(3).getImm() == CC ||
27199 NextMIIt->getOperand(3).getImm() == OppCC)) {
27200 LastCMOV = &*NextMIIt;
27205 // This checks for case 2, but only do this if we didn't already find
27206 // case 1, as indicated by LastCMOV == MI.
27207 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27208 NextMIIt->getOpcode() == MI.getOpcode() &&
27209 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27210 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27211 NextMIIt->getOperand(1).isKill()) {
27212 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27215 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27216 MachineFunction *F = ThisMBB->getParent();
27217 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27218 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27220 MachineFunction::iterator It = ++ThisMBB->getIterator();
27221 F->insert(It, FalseMBB);
27222 F->insert(It, SinkMBB);
27224 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27225 // live into the sink and copy blocks.
27226 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27227 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27228 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27229 FalseMBB->addLiveIn(X86::EFLAGS);
27230 SinkMBB->addLiveIn(X86::EFLAGS);
27233 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27234 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27235 std::next(MachineBasicBlock::iterator(LastCMOV)),
27237 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27239 // Fallthrough block for ThisMBB.
27240 ThisMBB->addSuccessor(FalseMBB);
27241 // The true block target of the first (or only) branch is always a SinkMBB.
27242 ThisMBB->addSuccessor(SinkMBB);
27243 // Fallthrough block for FalseMBB.
27244 FalseMBB->addSuccessor(SinkMBB);
27246 // Create the conditional branch instruction.
27247 unsigned Opc = X86::GetCondBranchFromCond(CC);
27248 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27251 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27253 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27254 MachineBasicBlock::iterator MIItEnd =
27255 std::next(MachineBasicBlock::iterator(LastCMOV));
27256 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27258 // Now remove the CMOV(s).
27259 ThisMBB->erase(MIItBegin, MIItEnd);
27264 MachineBasicBlock *
27265 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27266 MachineBasicBlock *BB) const {
27267 // Combine the following atomic floating-point modification pattern:
27268 // a.store(reg OP a.load(acquire), release)
27269 // Transform them into:
27270 // OPss (%gpr), %xmm
27271 // movss %xmm, (%gpr)
27272 // Or sd equivalent for 64-bit operations.
27274 switch (MI.getOpcode()) {
27275 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
27276 case X86::RELEASE_FADD32mr:
27277 FOp = X86::ADDSSrm;
27278 MOp = X86::MOVSSmr;
27280 case X86::RELEASE_FADD64mr:
27281 FOp = X86::ADDSDrm;
27282 MOp = X86::MOVSDmr;
27285 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27286 DebugLoc DL = MI.getDebugLoc();
27287 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27288 unsigned ValOpIdx = X86::AddrNumOperands;
27289 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27290 MachineInstrBuilder MIB =
27291 BuildMI(*BB, MI, DL, TII->get(FOp),
27292 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27294 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27295 MachineOperand &Operand = MI.getOperand(i);
27296 // Clear any kill flags on register operands as we'll create a second
27297 // instruction using the same address operands.
27298 if (Operand.isReg())
27299 Operand.setIsKill(false);
27302 MachineInstr *FOpMI = MIB;
27303 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27304 for (int i = 0; i < X86::AddrNumOperands; ++i)
27305 MIB.add(MI.getOperand(i));
27306 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27307 MI.eraseFromParent(); // The pseudo instruction is gone now.
27311 MachineBasicBlock *
27312 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27313 MachineBasicBlock *BB) const {
27314 MachineFunction *MF = BB->getParent();
27315 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27316 DebugLoc DL = MI.getDebugLoc();
27317 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27319 assert(MF->shouldSplitStack());
27321 const bool Is64Bit = Subtarget.is64Bit();
27322 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27324 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27325 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
27328 //  ... [Till the alloca]
27329 //  If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
27332 //  Allocate by subtracting from RSP
27333 //  Jump to continueMBB
//
// mallocMBB:
27336 //  Allocate by call to runtime
//
// continueMBB:
27340 //  [rest of original BB]
27343 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27344 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27345 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27347 MachineRegisterInfo &MRI = MF->getRegInfo();
27348 const TargetRegisterClass *AddrRegClass =
27349 getRegClassFor(getPointerTy(MF->getDataLayout()));
27351 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27352 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27353 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27354 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27355 sizeVReg = MI.getOperand(1).getReg(),
27357 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27359 MachineFunction::iterator MBBIter = ++BB->getIterator();
27361 MF->insert(MBBIter, bumpMBB);
27362 MF->insert(MBBIter, mallocMBB);
27363 MF->insert(MBBIter, continueMBB);
27365 continueMBB->splice(continueMBB->begin(), BB,
27366 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27367 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27369 // Add code to the main basic block to check if the stack limit has been hit,
27370 // and if so, jump to mallocMBB otherwise to bumpMBB.
27371 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27372 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27373 .addReg(tmpSPVReg).addReg(sizeVReg);
27374 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27375 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27376 .addReg(SPLimitVReg);
27377 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27379 // bumpMBB simply decreases the stack pointer, since we know the current
27380 // stacklet has enough space.
27381 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27382 .addReg(SPLimitVReg);
27383 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27384 .addReg(SPLimitVReg);
27385 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27387 // Calls into a routine in libgcc to allocate more space from the heap.
27388 const uint32_t *RegMask =
27389 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
27391 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
27393 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27394 .addExternalSymbol("__morestack_allocate_stack_space")
27395 .addRegMask(RegMask)
27396 .addReg(X86::RDI, RegState::Implicit)
27397 .addReg(X86::RAX, RegState::ImplicitDefine);
27398 } else if (Is64Bit) {
27399 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
27401 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27402 .addExternalSymbol("__morestack_allocate_stack_space")
27403 .addRegMask(RegMask)
27404 .addReg(X86::EDI, RegState::Implicit)
27405 .addReg(X86::EAX, RegState::ImplicitDefine);
27407 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27409 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27410 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27411 .addExternalSymbol("__morestack_allocate_stack_space")
27412 .addRegMask(RegMask)
27413 .addReg(X86::EAX, RegState::ImplicitDefine);
27417 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27420 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27421 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27422 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27424 // Set up the CFG correctly.
27425 BB->addSuccessor(bumpMBB);
27426 BB->addSuccessor(mallocMBB);
27427 mallocMBB->addSuccessor(continueMBB);
27428 bumpMBB->addSuccessor(continueMBB);
27430 // Take care of the PHI nodes.
27431 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27432 MI.getOperand(0).getReg())
27433 .addReg(mallocPtrVReg)
27435 .addReg(bumpSPPtrVReg)
27438 // Delete the original pseudo instruction.
27439 MI.eraseFromParent();
27442 return continueMBB;
27445 MachineBasicBlock *
27446 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27447 MachineBasicBlock *BB) const {
27448 MachineFunction *MF = BB->getParent();
27449 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27450 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27451 DebugLoc DL = MI.getDebugLoc();
27453 assert(!isAsynchronousEHPersonality(
27454 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27455 "SEH does not use catchret!");
27457 // Only 32-bit EH needs to worry about manually restoring stack pointers.
27458 if (!Subtarget.is32Bit())
27461 // C++ EH creates a new target block to hold the restore code, and wires up
27462 // the new block to the return destination with a normal JMP_4.
27463 MachineBasicBlock *RestoreMBB =
27464 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27465 assert(BB->succ_size() == 1);
27466 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27467 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27468 BB->addSuccessor(RestoreMBB);
27469 MI.getOperand(0).setMBB(RestoreMBB);
27471 auto RestoreMBBI = RestoreMBB->begin();
27472 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27473 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27477 MachineBasicBlock *
27478 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27479 MachineBasicBlock *BB) const {
27480 MachineFunction *MF = BB->getParent();
27481 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27482 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27483 // Only 32-bit SEH requires special handling for catchpad.
27484 if (IsSEH && Subtarget.is32Bit()) {
27485 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27486 DebugLoc DL = MI.getDebugLoc();
27487 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27489 MI.eraseFromParent();
27493 MachineBasicBlock *
27494 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27495 MachineBasicBlock *BB) const {
27496 // So, here we replace TLSADDR with the sequence:
27497 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27498 // We need this because TLSADDR is lowered into a call
27499 // inside MC; therefore, without the two markers, shrink-wrapping
27500 // may push the prologue/epilogue past them.
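// A sketch of the resulting MI sequence (assuming the usual call-frame
// pseudos returned by getCallFrameSetupOpcode/getCallFrameDestroyOpcode):
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLS_addr32/64 ...
//   ADJCALLSTACKUP 0, 0
// so that the call materialized later by MC sits between the two markers.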
27501 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27502 DebugLoc DL = MI.getDebugLoc();
27503 MachineFunction &MF = *BB->getParent();
27505 // Emit CALLSEQ_START right before the instruction.
27506 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27507 MachineInstrBuilder CallseqStart =
27508 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27509 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27511 // Emit CALLSEQ_END right after the instruction.
27512 // We don't call erase from parent because we want to keep the
27513 // original instruction around.
27514 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27515 MachineInstrBuilder CallseqEnd =
27516 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27517 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27522 MachineBasicBlock *
27523 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27524 MachineBasicBlock *BB) const {
27525 // This is pretty easy. We're taking the value that we received from
27526 // our load from the relocation, sticking it in either RDI (x86-64)
27527 // or EAX and doing an indirect call. The return value will then
27528 // be in the normal return register.
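// On x86-64, for example, the emitted code is roughly (sketch):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the result then sitting in the usual return register.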
27529 MachineFunction *F = BB->getParent();
27530 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27531 DebugLoc DL = MI.getDebugLoc();
27533 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27534 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27536 // Get a register mask for the lowered call.
27537 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27538 // proper register mask.
27539 const uint32_t *RegMask =
27540 Subtarget.is64Bit() ?
27541 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27542 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27543 if (Subtarget.is64Bit()) {
27544 MachineInstrBuilder MIB =
27545 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27549 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27550 MI.getOperand(3).getTargetFlags())
27552 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27553 addDirectMem(MIB, X86::RDI);
27554 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27555 } else if (!isPositionIndependent()) {
27556 MachineInstrBuilder MIB =
27557 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27561 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27562 MI.getOperand(3).getTargetFlags())
27564 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27565 addDirectMem(MIB, X86::EAX);
27566 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27568 MachineInstrBuilder MIB =
27569 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27570 .addReg(TII->getGlobalBaseReg(F))
27573 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27574 MI.getOperand(3).getTargetFlags())
27576 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27577 addDirectMem(MIB, X86::EAX);
27578 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27581 MI.eraseFromParent(); // The pseudo instruction is gone now.
27585 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27587 case X86::RETPOLINE_CALL32:
27588 return X86::CALLpcrel32;
27589 case X86::RETPOLINE_CALL64:
27590 return X86::CALL64pcrel32;
27591 case X86::RETPOLINE_TCRETURN32:
27592 return X86::TCRETURNdi;
27593 case X86::RETPOLINE_TCRETURN64:
27594 return X86::TCRETURNdi64;
27596 llvm_unreachable("not retpoline opcode");
27599 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27601 if (Subtarget.useRetpolineExternalThunk()) {
27602 // When using an external thunk for retpolines, we pick names that match the
27603 // names GCC happens to use as well. This helps simplify the implementation
27604 // of the thunks for kernels where they have no easy ability to create
27605 // aliases and are doing non-trivial configuration of the thunk's body. For
27606 // example, the Linux kernel will do boot-time hot patching of the thunk
27607 // bodies and cannot easily export aliases of these to loaded modules.
27609 // Note that at any point in the future, we may need to change the semantics
27610 // of how we implement retpolines and at that time will likely change the
27611 // name of the called thunk. Essentially, there is no hard guarantee that
27612 // LLVM will generate calls to specific thunks; we merely make a best-effort
27613 // attempt to help out kernels and other systems where duplicating the
27614 // thunks is costly.
27617 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27618 return "__x86_indirect_thunk_eax";
27620 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27621 return "__x86_indirect_thunk_ecx";
27623 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27624 return "__x86_indirect_thunk_edx";
27626 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27627 return "__x86_indirect_thunk_edi";
27629 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27630 return "__x86_indirect_thunk_r11";
27632 llvm_unreachable("unexpected reg for retpoline");
27635 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27638 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27639 return "__llvm_retpoline_eax";
27641 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27642 return "__llvm_retpoline_ecx";
27644 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27645 return "__llvm_retpoline_edx";
27647 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27648 return "__llvm_retpoline_edi";
27650 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27651 return "__llvm_retpoline_r11";
27653 llvm_unreachable("unexpected reg for retpoline");
27656 MachineBasicBlock *
27657 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27658 MachineBasicBlock *BB) const {
27659 // Copy the virtual register into the R11 physical register and
27660 // call the retpoline thunk.
27661 DebugLoc DL = MI.getDebugLoc();
27662 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27663 unsigned CalleeVReg = MI.getOperand(0).getReg();
27664 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27666 // Find an available scratch register to hold the callee. On 64-bit, we can
27667 // just use R11, but we scan for uses anyway to ensure we don't generate
27668 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27669 // already a register use operand to the call to hold the callee. If none
27670 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27671 // register and ESI is the base pointer to realigned stack frames with VLAs.
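// For example (sketch), a 32-bit RETPOLINE_CALL32 whose callee lives in %vreg
// and whose call already uses EAX could be rewritten as:
//   %ecx = COPY %vreg
//   CALLpcrel32 &__llvm_retpoline_ecx, implicit killed %ecx, ...
// assuming the LLVM-internal (non-external-thunk) naming handled above.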
27672 SmallVector<unsigned, 3> AvailableRegs;
27673 if (Subtarget.is64Bit())
27674 AvailableRegs.push_back(X86::R11);
27676 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27678 // Zero out any registers that are already used.
27679 for (const auto &MO : MI.operands()) {
27680 if (MO.isReg() && MO.isUse())
27681 for (unsigned &Reg : AvailableRegs)
27682 if (Reg == MO.getReg())
27686 // Choose the first remaining non-zero available register.
27687 unsigned AvailableReg = 0;
27688 for (unsigned MaybeReg : AvailableRegs) {
27690 AvailableReg = MaybeReg;
27695 report_fatal_error("calling convention incompatible with retpoline, no "
27696 "available registers");
27698 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27700 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27701 .addReg(CalleeVReg);
27702 MI.getOperand(0).ChangeToES(Symbol);
27703 MI.setDesc(TII->get(Opc));
27704 MachineInstrBuilder(*BB->getParent(), &MI)
27705 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27709 /// SetJmp implies future control flow change upon calling the corresponding function.
27711 /// Instead of using the 'return' instruction, the long jump fixes the stack and
27712 /// performs an indirect branch. To do so it uses the registers that were stored
27713 /// in the jump buffer (when calling SetJmp).
27714 /// In case the shadow stack is enabled we need to fix it as well, because some
27715 /// return addresses will be skipped.
27716 /// The function will save the SSP for future fixing in the function
27717 /// emitLongJmpShadowStackFix.
27718 /// \sa emitLongJmpShadowStackFix
27719 /// \param [in] MI The temporary Machine Instruction for the builtin.
27720 /// \param [in] MBB The Machine Basic Block that will be modified.
27721 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
27722 MachineBasicBlock *MBB) const {
27723 DebugLoc DL = MI.getDebugLoc();
27724 MachineFunction *MF = MBB->getParent();
27725 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27726 MachineRegisterInfo &MRI = MF->getRegInfo();
27727 MachineInstrBuilder MIB;
27729 // Memory Reference.
27730 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27731 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27733 // Initialize a register with zero.
27734 MVT PVT = getPointerTy(MF->getDataLayout());
27735 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27736 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27737 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27738 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
27740 .addReg(ZReg, RegState::Undef)
27741 .addReg(ZReg, RegState::Undef);
27743 // Read the current SSP Register value to the zeroed register.
27744 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27745 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27746 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27748 // Write the SSP register value to offset 3 in the input memory buffer.
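// For reference, the jump buffer layout assumed by this lowering (sketch):
// slot 0 = frame pointer, slot 1 = resume label, slot 2 = stack pointer,
// slot 3 = shadow stack pointer (written here).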
27749 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27750 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
27751 const int64_t SSPOffset = 3 * PVT.getStoreSize();
27752 const unsigned MemOpndSlot = 1;
27753 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27754 if (i == X86::AddrDisp)
27755 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
27757 MIB.add(MI.getOperand(MemOpndSlot + i));
27759 MIB.addReg(SSPCopyReg);
27760 MIB.setMemRefs(MMOBegin, MMOEnd);
27763 MachineBasicBlock *
27764 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27765 MachineBasicBlock *MBB) const {
27766 DebugLoc DL = MI.getDebugLoc();
27767 MachineFunction *MF = MBB->getParent();
27768 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27769 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27770 MachineRegisterInfo &MRI = MF->getRegInfo();
27772 const BasicBlock *BB = MBB->getBasicBlock();
27773 MachineFunction::iterator I = ++MBB->getIterator();
27775 // Memory Reference
27776 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27777 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27780 unsigned MemOpndSlot = 0;
27782 unsigned CurOp = 0;
27784 DstReg = MI.getOperand(CurOp++).getReg();
27785 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27786 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27788 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27789 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27791 MemOpndSlot = CurOp;
27793 MVT PVT = getPointerTy(MF->getDataLayout());
27794 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27795 "Invalid Pointer Size!");
27797 // For v = setjmp(buf), we generate
//
// thisMBB:
27800 //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27801 //  SjLjSetup restoreMBB
//
// mainMBB:
//  v_main = 0
//
// sinkMBB:
27807 //  v = phi(main, restore)
//
// restoreMBB:
27810 //  if base pointer being used, load it from frame
//  v_restore = 1
27813 MachineBasicBlock *thisMBB = MBB;
27814 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27815 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27816 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27817 MF->insert(I, mainMBB);
27818 MF->insert(I, sinkMBB);
27819 MF->push_back(restoreMBB);
27820 restoreMBB->setHasAddressTaken();
27822 MachineInstrBuilder MIB;
27824 // Transfer the remainder of BB and its successor edges to sinkMBB.
27825 sinkMBB->splice(sinkMBB->begin(), MBB,
27826 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27827 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27830 unsigned PtrStoreOpc = 0;
27831 unsigned LabelReg = 0;
27832 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27833 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27834 !isPositionIndependent();
27836 // Prepare IP either in reg or imm.
27837 if (!UseImmLabel) {
27838 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27839 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27840 LabelReg = MRI.createVirtualRegister(PtrRC);
27841 if (Subtarget.is64Bit()) {
27842 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27846 .addMBB(restoreMBB)
27849 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27850 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27851 .addReg(XII->getGlobalBaseReg(MF))
27854 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27858 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27860 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27861 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27862 if (i == X86::AddrDisp)
27863 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27865 MIB.add(MI.getOperand(MemOpndSlot + i));
27868 MIB.addReg(LabelReg);
27870 MIB.addMBB(restoreMBB);
27871 MIB.setMemRefs(MMOBegin, MMOEnd);
27873 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
27874 emitSetJmpShadowStackFix(MI, thisMBB);
27878 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27879 .addMBB(restoreMBB);
27881 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27882 MIB.addRegMask(RegInfo->getNoPreservedMask());
27883 thisMBB->addSuccessor(mainMBB);
27884 thisMBB->addSuccessor(restoreMBB);
27888 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27889 mainMBB->addSuccessor(sinkMBB);
27892 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27893 TII->get(X86::PHI), DstReg)
27894 .addReg(mainDstReg).addMBB(mainMBB)
27895 .addReg(restoreDstReg).addMBB(restoreMBB);
27898 if (RegInfo->hasBasePointer(*MF)) {
27899 const bool Uses64BitFramePtr =
27900 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27901 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27902 X86FI->setRestoreBasePointer(MF);
27903 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27904 unsigned BasePtr = RegInfo->getBaseRegister();
27905 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27906 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27907 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27908 .setMIFlag(MachineInstr::FrameSetup);
27910 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27911 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27912 restoreMBB->addSuccessor(sinkMBB);
27914 MI.eraseFromParent();
27918 /// Fix the shadow stack using the previously saved SSP pointer.
27919 /// \sa emitSetJmpShadowStackFix
27920 /// \param [in] MI The temporary Machine Instruction for the builtin.
27921 /// \param [in] MBB The Machine Basic Block that will be modified.
27922 /// \return The sink MBB that will perform the future indirect branch.
27923 MachineBasicBlock *
27924 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
27925 MachineBasicBlock *MBB) const {
27926 DebugLoc DL = MI.getDebugLoc();
27927 MachineFunction *MF = MBB->getParent();
27928 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27929 MachineRegisterInfo &MRI = MF->getRegInfo();
27931 // Memory Reference
27932 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27933 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27935 MVT PVT = getPointerTy(MF->getDataLayout());
27936 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
// checkSspMBB:
27939 // xor vreg1, vreg1
// rdssp vreg1
27941 // test vreg1, vreg1
27942 // je sinkMBB # Jump if Shadow Stack is not supported
// fallMBB:
27944 // mov buf+24/12(%rip), vreg2
27945 // sub vreg1, vreg2
27946 // jbe sinkMBB # No need to fix the Shadow Stack
// fixShadowMBB:
// shr 3/2, vreg2
27949 // incssp vreg2 # fix the SSP according to the lower 8 bits
// shr 8, vreg2
// je sinkMBB
27952 // fixShadowLoopPrepareMBB:
// shl vreg2
// mov 128, vreg3
27955 // fixShadowLoopMBB:
// incssp vreg3
// dec vreg2
27958 // jne fixShadowLoopMBB # Iterate until you finish fixing
27959 // # the Shadow Stack
// sinkMBB:
27962 MachineFunction::iterator I = ++MBB->getIterator();
27963 const BasicBlock *BB = MBB->getBasicBlock();
27965 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
27966 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
27967 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
27968 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
27969 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
27970 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27971 MF->insert(I, checkSspMBB);
27972 MF->insert(I, fallMBB);
27973 MF->insert(I, fixShadowMBB);
27974 MF->insert(I, fixShadowLoopPrepareMBB);
27975 MF->insert(I, fixShadowLoopMBB);
27976 MF->insert(I, sinkMBB);
27978 // Transfer the remainder of BB and its successor edges to sinkMBB.
27979 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
27981 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27983 MBB->addSuccessor(checkSspMBB);
27985 // Initialize a register with zero.
27986 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27987 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27988 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
27990 .addReg(ZReg, RegState::Undef)
27991 .addReg(ZReg, RegState::Undef);
27993 // Read the current SSP Register value to the zeroed register.
27994 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27995 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27996 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27998 // Check whether the SSP register value is zero and, if so, jump directly to the sink.
28000 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
28001 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
28002 .addReg(SSPCopyReg)
28003 .addReg(SSPCopyReg);
28004 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28005 checkSspMBB->addSuccessor(sinkMBB);
28006 checkSspMBB->addSuccessor(fallMBB);
28008 // Reload the previously saved SSP register value.
28009 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
28010 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28011 const int64_t SPPOffset = 3 * PVT.getStoreSize();
28012 MachineInstrBuilder MIB =
28013 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
28014 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28015 if (i == X86::AddrDisp)
28016 MIB.addDisp(MI.getOperand(i), SPPOffset);
28018 MIB.add(MI.getOperand(i));
28020 MIB.setMemRefs(MMOBegin, MMOEnd);
28022 // Subtract the current SSP from the previous SSP.
28023 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
28024 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
28025 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
28026 .addReg(PrevSSPReg)
28027 .addReg(SSPCopyReg);
28029 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
28030 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
28031 fallMBB->addSuccessor(sinkMBB);
28032 fallMBB->addSuccessor(fixShadowMBB);
28034 // Shift right by 2 (32-bit) or 3 (64-bit) because incssp multiplies its argument by 4 or 8 respectively.
28035 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
28036 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
28037 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
28038 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
28042 // Increase the SSP using only the lower 8 bits of the delta.
28043 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
28044 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
28046 // Reset the lower 8 bits.
28047 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
28048 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
28049 .addReg(SspFirstShrReg)
28052 // Jump if the result of the shift is zero.
28053 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28054 fixShadowMBB->addSuccessor(sinkMBB);
28055 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
28057 // Do a single shift left.
28058 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
28059 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
28060 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
28061 .addReg(SspSecondShrReg);
28063 // Save the value 128 to a register (will be used next with incssp).
28064 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
28065 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
28066 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
28068 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
28070 // Since incssp only looks at the lower 8 bits, we might need to do several
28071 // iterations of incssp until we finish fixing the shadow stack.
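// Worked example (illustrative): for a delta of 0x230 slots, the incssp above
// already consumed the low 8 bits (0x30). The remaining high part (0x2) was
// shifted left once to form a counter of 4, so the loop below executes
// incssp(128) four times and covers the outstanding 0x200 slots.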
28072 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
28073 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
28074 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
28075 .addReg(SspAfterShlReg)
28076 .addMBB(fixShadowLoopPrepareMBB)
28078 .addMBB(fixShadowLoopMBB);
28080 // Every iteration we increase the SSP by 128.
28081 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
28083 // Every iteration we decrement the counter by 1.
28084 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
28085 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
28087 // Jump if the counter is not zero yet.
28088 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
28089 fixShadowLoopMBB->addSuccessor(sinkMBB);
28090 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
28095 MachineBasicBlock *
28096 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
28097 MachineBasicBlock *MBB) const {
28098 DebugLoc DL = MI.getDebugLoc();
28099 MachineFunction *MF = MBB->getParent();
28100 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28101 MachineRegisterInfo &MRI = MF->getRegInfo();
28103 // Memory Reference
28104 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28105 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28107 MVT PVT = getPointerTy(MF->getDataLayout());
28108 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
28109 "Invalid Pointer Size!");
28111 const TargetRegisterClass *RC =
28112 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28113 unsigned Tmp = MRI.createVirtualRegister(RC);
28114 // Since FP is only updated here but NOT referenced, it's treated as GPR.
28115 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28116 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
28117 unsigned SP = RegInfo->getStackRegister();
28119 MachineInstrBuilder MIB;
28121 const int64_t LabelOffset = 1 * PVT.getStoreSize();
28122 const int64_t SPOffset = 2 * PVT.getStoreSize();
28124 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28125 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
28127 MachineBasicBlock *thisMBB = MBB;
28129 // When CET and the shadow stack are enabled, we need to fix the Shadow Stack.
28130 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
28131 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
28135 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
28136 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
28137 MIB.add(MI.getOperand(i));
28138 MIB.setMemRefs(MMOBegin, MMOEnd);
28141 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
28142 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28143 if (i == X86::AddrDisp)
28144 MIB.addDisp(MI.getOperand(i), LabelOffset);
28146 MIB.add(MI.getOperand(i));
28148 MIB.setMemRefs(MMOBegin, MMOEnd);
28151 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
28152 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28153 if (i == X86::AddrDisp)
28154 MIB.addDisp(MI.getOperand(i), SPOffset);
28156 MIB.add(MI.getOperand(i));
28158 MIB.setMemRefs(MMOBegin, MMOEnd);
28161 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
28163 MI.eraseFromParent();
28167 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
28168 MachineBasicBlock *MBB,
28169 MachineBasicBlock *DispatchBB,
28171 DebugLoc DL = MI.getDebugLoc();
28172 MachineFunction *MF = MBB->getParent();
28173 MachineRegisterInfo *MRI = &MF->getRegInfo();
28174 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28176 MVT PVT = getPointerTy(MF->getDataLayout());
28177 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
28182 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
28183 !isPositionIndependent();
28186 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
28188 const TargetRegisterClass *TRC =
28189 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28190 VR = MRI->createVirtualRegister(TRC);
28191 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
28193 if (Subtarget.is64Bit())
28194 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
28198 .addMBB(DispatchBB)
28201 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
28202 .addReg(0) /* TII->getGlobalBaseReg(MF) */
28205 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
28209 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
28210 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
28212 MIB.addMBB(DispatchBB);
28217 MachineBasicBlock *
28218 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
28219 MachineBasicBlock *BB) const {
28220 DebugLoc DL = MI.getDebugLoc();
28221 MachineFunction *MF = BB->getParent();
28222 MachineFrameInfo &MFI = MF->getFrameInfo();
28223 MachineRegisterInfo *MRI = &MF->getRegInfo();
28224 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28225 int FI = MFI.getFunctionContextIndex();
28227 // Get a mapping of the call site numbers to all of the landing pads they're
28228 // associated with.
28229 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
28230 unsigned MaxCSNum = 0;
28231 for (auto &MBB : *MF) {
28232 if (!MBB.isEHPad())
28235 MCSymbol *Sym = nullptr;
28236 for (const auto &MI : MBB) {
28237 if (MI.isDebugInstr())
28240 assert(MI.isEHLabel() && "expected EH_LABEL");
28241 Sym = MI.getOperand(0).getMCSymbol();
28245 if (!MF->hasCallSiteLandingPad(Sym))
28248 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
28249 CallSiteNumToLPad[CSI].push_back(&MBB);
28250 MaxCSNum = std::max(MaxCSNum, CSI);
28254 // Get an ordered list of the machine basic blocks for the jump table.
28255 std::vector<MachineBasicBlock *> LPadList;
28256 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
28257 LPadList.reserve(CallSiteNumToLPad.size());
28259 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
28260 for (auto &LP : CallSiteNumToLPad[CSI]) {
28261 LPadList.push_back(LP);
28262 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
28266 assert(!LPadList.empty() &&
28267 "No landing pad destinations for the dispatch jump table!");
28269 // Create the MBBs for the dispatch code.
28271 // Shove the dispatch's address into the return slot in the function context.
28272 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
28273 DispatchBB->setIsEHPad(true);
28275 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
28276 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
28277 DispatchBB->addSuccessor(TrapBB);
28279 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
28280 DispatchBB->addSuccessor(DispContBB);
28283 MF->push_back(DispatchBB);
28284 MF->push_back(DispContBB);
28285 MF->push_back(TrapBB);
28287 // Insert code into the entry block that creates and registers the function context.
28289 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
28291 // Create the jump table and associated information
28292 unsigned JTE = getJumpTableEncoding();
28293 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
28294 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
28296 const X86RegisterInfo &RI = TII->getRegisterInfo();
28297 // Add a register mask with no preserved registers. This results in all
28298 // registers being marked as clobbered.
28299 if (RI.hasBasePointer(*MF)) {
28300 const bool FPIs64Bit =
28301 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
28302 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
28303 MFI->setRestoreBasePointer(MF);
28305 unsigned FP = RI.getFrameRegister(*MF);
28306 unsigned BP = RI.getBaseRegister();
28307 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
28308 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
28309 MFI->getRestoreBasePointerOffset())
28310 .addRegMask(RI.getNoPreservedMask());
28312 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
28313 .addRegMask(RI.getNoPreservedMask());
28316 // IReg is used as an index in a memory operand and therefore can't be SP
28317 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
28318 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
28319 Subtarget.is64Bit() ? 8 : 4);
28320 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
28322 .addImm(LPadList.size());
28323 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
28325 if (Subtarget.is64Bit()) {
28326 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28327 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
28329 // leaq .LJTI0_0(%rip), BReg
28330 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
28334 .addJumpTableIndex(MJTI)
28336 // movzx IReg64, IReg
28337 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
28340 .addImm(X86::sub_32bit);
28343 case MachineJumpTableInfo::EK_BlockAddress:
28344 // jmpq *(BReg,IReg64,8)
28345 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
28352 case MachineJumpTableInfo::EK_LabelDifference32: {
28353 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
28354 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
28355 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28357 // movl (BReg,IReg64,4), OReg
28358 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
28364 // movsx OReg64, OReg
28365 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
28366 // addq BReg, OReg64, TReg
28367 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
28371 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
28375 llvm_unreachable("Unexpected jump table encoding");
28378 // jmpl *.LJTI0_0(,IReg,4)
28379 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
28383 .addJumpTableIndex(MJTI)
28387 // Add the jump table entries as successors to the MBB.
28388 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
28389 for (auto &LP : LPadList)
28390 if (SeenMBBs.insert(LP).second)
28391 DispContBB->addSuccessor(LP);
28393 // N.B. the order the invoke BBs are processed in doesn't matter here.
28394 SmallVector<MachineBasicBlock *, 64> MBBLPads;
28395 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
28396 for (MachineBasicBlock *MBB : InvokeBBs) {
28397 // Remove the landing pad successor from the invoke block and replace it
28398 // with the new dispatch block.
28399 // Keep a copy of Successors since it's modified inside the loop.
28400 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
28402 // FIXME: Avoid quadratic complexity.
28403 for (auto MBBS : Successors) {
28404 if (MBBS->isEHPad()) {
28405 MBB->removeSuccessor(MBBS);
28406 MBBLPads.push_back(MBBS);
28410 MBB->addSuccessor(DispatchBB);
28412 // Find the invoke call and mark all of the callee-saved registers as
28413 // 'implicit defined' so that they're spilled. This prevents code from
28414 // moving instructions to before the EH block, where they will never be executed.
28416 for (auto &II : reverse(*MBB)) {
28420 DenseMap<unsigned, bool> DefRegs;
28421 for (auto &MOp : II.operands())
28423 DefRegs[MOp.getReg()] = true;
28425 MachineInstrBuilder MIB(*MF, &II);
28426 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
28427 unsigned Reg = SavedRegs[RI];
28429 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
28436 // Mark all former landing pads as non-landing pads. The dispatch is the only
28437 // landing pad now.
28438 for (auto &LP : MBBLPads)
28439 LP->setIsEHPad(false);
28441 // The instruction is gone now.
28442 MI.eraseFromParent();
28446 MachineBasicBlock *
28447 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28448 MachineBasicBlock *BB) const {
28449 MachineFunction *MF = BB->getParent();
28450 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28451 DebugLoc DL = MI.getDebugLoc();
28453 switch (MI.getOpcode()) {
28454 default: llvm_unreachable("Unexpected instr type to insert");
28455 case X86::TLS_addr32:
28456 case X86::TLS_addr64:
28457 case X86::TLS_base_addr32:
28458 case X86::TLS_base_addr64:
28459 return EmitLoweredTLSAddr(MI, BB);
28460 case X86::RETPOLINE_CALL32:
28461 case X86::RETPOLINE_CALL64:
28462 case X86::RETPOLINE_TCRETURN32:
28463 case X86::RETPOLINE_TCRETURN64:
28464 return EmitLoweredRetpoline(MI, BB);
28465 case X86::CATCHRET:
28466 return EmitLoweredCatchRet(MI, BB);
28467 case X86::CATCHPAD:
28468 return EmitLoweredCatchPad(MI, BB);
28469 case X86::SEG_ALLOCA_32:
28470 case X86::SEG_ALLOCA_64:
28471 return EmitLoweredSegAlloca(MI, BB);
28472 case X86::TLSCall_32:
28473 case X86::TLSCall_64:
28474 return EmitLoweredTLSCall(MI, BB);
28475 case X86::CMOV_FR32:
28476 case X86::CMOV_FR64:
28477 case X86::CMOV_FR128:
28478 case X86::CMOV_GR8:
28479 case X86::CMOV_GR16:
28480 case X86::CMOV_GR32:
28481 case X86::CMOV_RFP32:
28482 case X86::CMOV_RFP64:
28483 case X86::CMOV_RFP80:
28484 case X86::CMOV_V2F64:
28485 case X86::CMOV_V2I64:
28486 case X86::CMOV_V4F32:
28487 case X86::CMOV_V4F64:
28488 case X86::CMOV_V4I64:
28489 case X86::CMOV_V16F32:
28490 case X86::CMOV_V8F32:
28491 case X86::CMOV_V8F64:
28492 case X86::CMOV_V8I64:
28493 case X86::CMOV_V8I1:
28494 case X86::CMOV_V16I1:
28495 case X86::CMOV_V32I1:
28496 case X86::CMOV_V64I1:
28497 return EmitLoweredSelect(MI, BB);
28499 case X86::RDFLAGS32:
28500 case X86::RDFLAGS64: {
28502 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28503 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28504 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28505 // Permit reads of the EFLAGS and DF registers without them being defined.
28506 // This intrinsic exists to read external processor state in flags, such as
28507 // the trap flag, interrupt flag, and direction flag, none of which are
28508 // modeled by the backend.
28509 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
28510 "Unexpected register in operand!");
28511 Push->getOperand(2).setIsUndef();
28512 assert(Push->getOperand(3).getReg() == X86::DF &&
28513 "Unexpected register in operand!");
28514 Push->getOperand(3).setIsUndef();
28515 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28517 MI.eraseFromParent(); // The pseudo is gone now.
28521 case X86::WRFLAGS32:
28522 case X86::WRFLAGS64: {
28524 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28526 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28527 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28528 BuildMI(*BB, MI, DL, TII->get(PopF));
28530 MI.eraseFromParent(); // The pseudo is gone now.
28534 case X86::RELEASE_FADD32mr:
28535 case X86::RELEASE_FADD64mr:
28536 return EmitLoweredAtomicFP(MI, BB);
28538 case X86::FP32_TO_INT16_IN_MEM:
28539 case X86::FP32_TO_INT32_IN_MEM:
28540 case X86::FP32_TO_INT64_IN_MEM:
28541 case X86::FP64_TO_INT16_IN_MEM:
28542 case X86::FP64_TO_INT32_IN_MEM:
28543 case X86::FP64_TO_INT64_IN_MEM:
28544 case X86::FP80_TO_INT16_IN_MEM:
28545 case X86::FP80_TO_INT32_IN_MEM:
28546 case X86::FP80_TO_INT64_IN_MEM: {
28547 // Change the floating point control register to use "round towards zero"
28548 // mode when truncating to an integer value.
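// Roughly (sketch): fnstcw spills the current control word to a stack slot,
// the old value is loaded and kept in a vreg, a round-toward-zero control
// word is written and activated with fldcw, the truncating fistp-style store
// (IST_Fp*) executes, and a final fldcw restores the original control word.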
28549 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28550 addFrameReference(BuildMI(*BB, MI, DL,
28551 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28553 // Load the old value of the high byte of the control word...
28555 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28556 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28559 // Set the high part to be round to zero...
28560 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28563 // Reload the modified control word now...
28564 addFrameReference(BuildMI(*BB, MI, DL,
28565 TII->get(X86::FLDCW16m)), CWFrameIdx);
28567 // Restore the memory image of control word to original value
28568 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28571 // Get the X86 opcode to use.
28573 switch (MI.getOpcode()) {
28574 default: llvm_unreachable("illegal opcode!");
28575 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28576 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28577 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28578 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28579 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28580 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28581 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28582 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28583 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
28586 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28587 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28588 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28590 // Reload the original control word now.
28591 addFrameReference(BuildMI(*BB, MI, DL,
28592 TII->get(X86::FLDCW16m)), CWFrameIdx);
28594 MI.eraseFromParent(); // The pseudo instruction is gone now.
28597 // Thread synchronization.
28599 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28600 case X86::MONITORX:
28601 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28605 return emitClzero(&MI, BB, Subtarget);
28609 return emitWRPKRU(MI, BB, Subtarget);
28611 return emitRDPKRU(MI, BB, Subtarget);
28614 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28616 case X86::VASTART_SAVE_XMM_REGS:
28617 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28619 case X86::VAARG_64:
28620 return EmitVAARG64WithCustomInserter(MI, BB);
28622 case X86::EH_SjLj_SetJmp32:
28623 case X86::EH_SjLj_SetJmp64:
28624 return emitEHSjLjSetJmp(MI, BB);
28626 case X86::EH_SjLj_LongJmp32:
28627 case X86::EH_SjLj_LongJmp64:
28628 return emitEHSjLjLongJmp(MI, BB);
28630 case X86::Int_eh_sjlj_setup_dispatch:
28631 return EmitSjLjDispatchBlock(MI, BB);
28633 case TargetOpcode::STATEPOINT:
28634 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28635 // this point in the process. We diverge later.
28636 return emitPatchPoint(MI, BB);
28638 case TargetOpcode::STACKMAP:
28639 case TargetOpcode::PATCHPOINT:
28640 return emitPatchPoint(MI, BB);
28642 case TargetOpcode::PATCHABLE_EVENT_CALL:
28643 return emitXRayCustomEvent(MI, BB);
28645 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
28646 return emitXRayTypedEvent(MI, BB);
28648 case X86::LCMPXCHG8B: {
28649 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28650 // In addition to the four E[ABCD] registers implied by the encoding, CMPXCHG8B
28651 // requires a memory operand. If the current architecture happens to be
28652 // i686 and the current function needs a base pointer
28653 // (which is ESI on i686), the register allocator would not be able to
28654 // allocate registers for an address of the form X(%reg, %reg, Y):
28655 // there would never be enough unreserved registers during regalloc
28656 // (without the base pointer the only option would be X(%edi, %esi, Y)).
28657 // We give the register allocator a hand by precomputing the address in
28658 // a new vreg using LEA.
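// Sketch: given a memory operand like 4(%esi,%ecx,2), we first emit
//   leal 4(%esi,%ecx,2), %vreg
// and rewrite CMPXCHG8B to use the plain (%vreg) address, so E[ABCD] and the
// base pointer no longer need to coexist within one addressing mode.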
28660 // If it is not i686 or there is no base pointer - nothing to do here.
28661 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28664 // Even though this code does not necessarily need the base pointer to
28665 // be ESI, we check for that. The reason: if this assert fails, something
28666 // has changed in the compiler's base pointer handling, and it most
28667 // probably has to be addressed here as well.
28668 assert(TRI->getBaseRegister() == X86::ESI &&
28669 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28670 "base pointer in mind");
28672 MachineRegisterInfo &MRI = MF->getRegInfo();
28673 MVT SPTy = getPointerTy(MF->getDataLayout());
28674 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28675 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28677 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28678 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28679 // does not use an index register.
28680 if (AM.IndexReg == X86::NoRegister)
28683 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28684 // four operand definitions that are E[ABCD] registers. We skip them and
28685 // then insert the LEA.
28686 MachineBasicBlock::iterator MBBI(MI);
28687 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28688 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28691 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
28693 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28697 case X86::LCMPXCHG16B:
28699 case X86::LCMPXCHG8B_SAVE_EBX:
28700 case X86::LCMPXCHG16B_SAVE_RBX: {
28702 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28703 if (!BB->isLiveIn(BasePtr))
28704 BB->addLiveIn(BasePtr);
28710 //===----------------------------------------------------------------------===//
28711 // X86 Optimization Hooks
28712 //===----------------------------------------------------------------------===//
28715 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28716 const APInt &Demanded,
28717 TargetLoweringOpt &TLO) const {
28718 // Only optimize Ands to prevent shrinking a constant that could be
28719 // matched by movzx.
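// Worked example (illustrative): with Mask = 0x7F and Demanded = 0x3F, the
// shrunk mask has 6 active bits, which rounds up to a width of 8 and a
// candidate mask of 0xFF. The only added bit relative to 0x7F is not
// demanded, so the AND can safely use 0xFF instead and be matched by movzx.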
28720 if (Op.getOpcode() != ISD::AND)
28723 EVT VT = Op.getValueType();
28729 unsigned Size = VT.getSizeInBits();
28731 // Make sure the RHS really is a constant.
28732 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28736 const APInt &Mask = C->getAPIntValue();
28738 // Clear all non-demanded bits initially.
28739 APInt ShrunkMask = Mask & Demanded;
28741 // Find the width of the shrunk mask.
28742 unsigned Width = ShrunkMask.getActiveBits();
28744 // If the mask is all 0s there's nothing to do here.
28748 // Find the next power of 2 width, rounding up to a byte.
28749 Width = PowerOf2Ceil(std::max(Width, 8U));
28750 // Truncate the width to size to handle illegal types.
28751 Width = std::min(Width, Size);
28753 // Calculate a possible zero extend mask for this constant.
28754 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28756 // If we aren't changing the mask, just return true to keep it and prevent
28757 // the caller from optimizing.
28758 if (ZeroExtendMask == Mask)
28761 // Make sure the new mask can be represented by a combination of mask bits
28762 // and non-demanded bits.
28763 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28766 // Replace the constant with the zero extend mask.
28768 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28769 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28770 return TLO.CombineTo(Op, NewOp);
28773 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28775 const APInt &DemandedElts,
28776 const SelectionDAG &DAG,
28777 unsigned Depth) const {
28778 unsigned BitWidth = Known.getBitWidth();
28779 unsigned Opc = Op.getOpcode();
28780 EVT VT = Op.getValueType();
28781 assert((Opc >= ISD::BUILTIN_OP_END ||
28782 Opc == ISD::INTRINSIC_WO_CHAIN ||
28783 Opc == ISD::INTRINSIC_W_CHAIN ||
28784 Opc == ISD::INTRINSIC_VOID) &&
28785 "Should use MaskedValueIsZero if you don't know whether Op"
28786 " is a target node!");
28791 case X86ISD::SETCC:
28792 Known.Zero.setBitsFrom(1);
28794 case X86ISD::MOVMSK: {
28795 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28796 Known.Zero.setBitsFrom(NumLoBits);
28799 case X86ISD::PEXTRB:
28800 case X86ISD::PEXTRW: {
28801 SDValue Src = Op.getOperand(0);
28802 EVT SrcVT = Src.getValueType();
28803 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28804 Op.getConstantOperandVal(1));
28805 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28806 Known = Known.zextOrTrunc(BitWidth);
28807 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28810 case X86ISD::VSHLI:
28811 case X86ISD::VSRLI: {
28812 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28813 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28814 Known.setAllZero();
28818 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28819 unsigned ShAmt = ShiftImm->getZExtValue();
28820 if (Opc == X86ISD::VSHLI) {
28821 Known.Zero <<= ShAmt;
28822 Known.One <<= ShAmt;
28823 // Low bits are known zero.
28824 Known.Zero.setLowBits(ShAmt);
28826 Known.Zero.lshrInPlace(ShAmt);
28827 Known.One.lshrInPlace(ShAmt);
28828 // High bits are known zero.
28829 Known.Zero.setHighBits(ShAmt);
28834 case X86ISD::PACKUS: {
28835 // PACKUS is just a truncation if the upper half is zero.
28836 // TODO: Add DemandedElts support.
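// e.g. (sketch) for a PACKUS of two v8i16 inputs into v16i8: when the top
// byte of every input element is known zero, each output byte is just the
// truncated input element, so the result's known bits are the bits common to
// both operands after truncation.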
28838 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
28839 DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
28840 Known.One &= Known2.One;
28841 Known.Zero &= Known2.Zero;
28842 if (Known.countMinLeadingZeros() < BitWidth)
28844 Known = Known.trunc(BitWidth);
28847 case X86ISD::VZEXT: {
28848 // TODO: Add DemandedElts support.
28849 SDValue N0 = Op.getOperand(0);
28850 unsigned NumElts = VT.getVectorNumElements();
28852 EVT SrcVT = N0.getValueType();
28853 unsigned InNumElts = SrcVT.getVectorNumElements();
28854 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28855 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28857 Known = KnownBits(InBitWidth);
28858 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28859 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28860 Known = Known.zext(BitWidth);
28861 Known.Zero.setBitsFrom(InBitWidth);
28864 case X86ISD::CMOV: {
28865 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28866 // If we don't know any bits, early out.
28867 if (Known.isUnknown())
28870 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28872 // Only known if known in both the LHS and RHS.
28873 Known.One &= Known2.One;
28874 Known.Zero &= Known2.Zero;
28877 case X86ISD::UDIVREM8_ZEXT_HREG:
28878 // TODO: Support more than just the zero extended bits?
28879 if (Op.getResNo() != 1)
28881 // The remainder is zero extended.
28882 Known.Zero.setBitsFrom(8);
28886 // Handle target shuffles.
28887 // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
28888 if (isTargetShuffle(Opc)) {
28890 SmallVector<int, 64> Mask;
28891 SmallVector<SDValue, 2> Ops;
28892 if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
28894 unsigned NumOps = Ops.size();
28895 unsigned NumElts = VT.getVectorNumElements();
28896 if (Mask.size() == NumElts) {
28897 SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
28898 Known.Zero.setAllBits(); Known.One.setAllBits();
28899 for (unsigned i = 0; i != NumElts; ++i) {
28900 if (!DemandedElts[i])
28903 if (M == SM_SentinelUndef) {
28904 // For UNDEF elements, we don't know anything about the common state
28905 // of the shuffle result.
28908 } else if (M == SM_SentinelZero) {
28909 Known.One.clearAllBits();
28912 assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
28913 "Shuffle index out of range");
28915 unsigned OpIdx = (unsigned)M / NumElts;
28916 unsigned EltIdx = (unsigned)M % NumElts;
28917 if (Ops[OpIdx].getValueType() != VT) {
28918 // TODO - handle target shuffle ops with different value types.
28922 DemandedOps[OpIdx].setBit(EltIdx);
28924 // Known bits are the values that are shared by every demanded element.
28925 for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
28926 if (!DemandedOps[i])
28929 DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
28930 Known.One &= Known2.One;
28931 Known.Zero &= Known2.Zero;
28938 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
28939 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
28940 unsigned Depth) const {
28941 unsigned VTBits = Op.getScalarValueSizeInBits();
28942 unsigned Opcode = Op.getOpcode();
28944 case X86ISD::SETCC_CARRY:
28945 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
28948 case X86ISD::VSEXT: {
28949 // TODO: Add DemandedElts support.
28950 SDValue Src = Op.getOperand(0);
28951 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28952 Tmp += VTBits - Src.getScalarValueSizeInBits();
28956 case X86ISD::VTRUNC: {
28957 // TODO: Add DemandedElts support.
28958 SDValue Src = Op.getOperand(0);
28959 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
28960 assert(VTBits < NumSrcBits && "Illegal truncation input type");
28961 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28962 if (Tmp > (NumSrcBits - VTBits))
28963 return Tmp - (NumSrcBits - VTBits);
28967 case X86ISD::PACKSS: {
28968 // PACKSS is just a truncation if the sign bits extend to the packed size.
28969 // TODO: Add DemandedElts support.
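// Worked example: packing v4i32 -> v8i16 has SrcBits == 32 and VTBits == 16,
// so if both inputs have at least 20 sign bits the packed result keeps
// 20 - (32 - 16) == 4 sign bits.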
28970 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
28971 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
28972 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
28973 unsigned Tmp = std::min(Tmp0, Tmp1);
28974 if (Tmp > (SrcBits - VTBits))
28975 return Tmp - (SrcBits - VTBits);
28979 case X86ISD::VSHLI: {
28980 SDValue Src = Op.getOperand(0);
28981 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28982 if (ShiftVal.uge(VTBits))
28983 return VTBits; // Shifted all bits out --> zero.
28984 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28985 if (ShiftVal.uge(Tmp))
28986 return 1; // Shifted all sign bits out --> unknown.
28987 return Tmp - ShiftVal.getZExtValue();
28990 case X86ISD::VSRAI: {
28991 SDValue Src = Op.getOperand(0);
28992 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28993 if (ShiftVal.uge(VTBits - 1))
28994 return VTBits; // Sign splat.
28995 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28996 ShiftVal += Tmp;
28997 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
29000 case X86ISD::PCMPGT:
29001 case X86ISD::PCMPEQ:
29003 case X86ISD::VPCOM:
29004 case X86ISD::VPCOMU:
29005 // Vector compares return zero/all-bits result values.
29008 case X86ISD::CMOV: {
29009 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
29010 if (Tmp0 == 1) return 1; // Early out.
29011 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
29012 return std::min(Tmp0, Tmp1);
29014 case X86ISD::SDIVREM8_SEXT_HREG:
29015 // TODO: Support more than just the sign extended bits?
29016 if (Op.getResNo() != 1)
29018 // The remainder is sign extended.
29026 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
29027 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
29028 return N->getOperand(0);
29032 /// Returns true (and the GlobalValue and the offset) if the node is a
29033 /// GlobalAddress + offset.
29034 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
29035 const GlobalValue* &GA,
29036 int64_t &Offset) const {
29037 if (N->getOpcode() == X86ISD::Wrapper) {
29038 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
29039 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
29040 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
29044 return TargetLowering::isGAPlusOffset(N, GA, Offset);
29047 // Attempt to match a combined shuffle mask against supported unary shuffle
29049 // TODO: Investigate sharing more of this with shuffle lowering.
29050 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29051 bool AllowFloatDomain, bool AllowIntDomain,
29052 SDValue &V1, const SDLoc &DL,
29054 const X86Subtarget &Subtarget,
29055 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
29056 unsigned NumMaskElts = Mask.size();
29057 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
29059 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
29060 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
29061 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
29062 Shuffle = X86ISD::VZEXT_MOVL;
29063 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
29067 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
29068 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
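// e.g. with 16-bit mask elements, a mask of <0, Z, 1, Z, 2, Z, 3, Z> matches
// at Scale == 2 and is lowered as a zero-extension of the low half of the
// v8i16 source to v4i32.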
29069 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
29070 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
29071 unsigned MaxScale = 64 / MaskEltSize;
29072 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
29074 unsigned NumDstElts = NumMaskElts / Scale;
29075 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
29076 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
29077 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
29080 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
29081 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
29082 MVT::getIntegerVT(MaskEltSize);
29083 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
29085 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
29086 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
29087 Shuffle = unsigned(X86ISD::VZEXT);
29089 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
29091 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
29092 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
29098 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
29099 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
29100 isUndefOrEqual(Mask[0], 0) &&
29101 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
29102 Shuffle = X86ISD::VZEXT_MOVL;
29103 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
29107 // Check if we have SSE3 which will let us use MOVDDUP etc. The
29108 // instructions are no slower than UNPCKLPD but have the option to
29109 // fold the input operand into even an unaligned memory load.
29110 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
29111 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
29112 Shuffle = X86ISD::MOVDDUP;
29113 SrcVT = DstVT = MVT::v2f64;
29116 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29117 Shuffle = X86ISD::MOVSLDUP;
29118 SrcVT = DstVT = MVT::v4f32;
29121 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
29122 Shuffle = X86ISD::MOVSHDUP;
29123 SrcVT = DstVT = MVT::v4f32;
29128 if (MaskVT.is256BitVector() && AllowFloatDomain) {
29129 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
29130 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
29131 Shuffle = X86ISD::MOVDDUP;
29132 SrcVT = DstVT = MVT::v4f64;
29135 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29136 Shuffle = X86ISD::MOVSLDUP;
29137 SrcVT = DstVT = MVT::v8f32;
29140 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
29141 Shuffle = X86ISD::MOVSHDUP;
29142 SrcVT = DstVT = MVT::v8f32;
29147 if (MaskVT.is512BitVector() && AllowFloatDomain) {
29148 assert(Subtarget.hasAVX512() &&
29149 "AVX512 required for 512-bit vector shuffles");
29150 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
29151 Shuffle = X86ISD::MOVDDUP;
29152 SrcVT = DstVT = MVT::v8f64;
29155 if (isTargetShuffleEquivalent(
29156 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
29157 Shuffle = X86ISD::MOVSLDUP;
29158 SrcVT = DstVT = MVT::v16f32;
29161 if (isTargetShuffleEquivalent(
29162 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
29163 Shuffle = X86ISD::MOVSHDUP;
29164 SrcVT = DstVT = MVT::v16f32;
29169 // Attempt to match against broadcast-from-vector.
29170 if (Subtarget.hasAVX2()) {
29171 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
29172 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
29173 SrcVT = DstVT = MaskVT;
29174 Shuffle = X86ISD::VBROADCAST;
29182 // Attempt to match a combined shuffle mask against supported unary immediate
29183 // permute instructions.
29184 // TODO: Investigate sharing more of this with shuffle lowering.
29185 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29186 const APInt &Zeroable,
29187 bool AllowFloatDomain,
29188 bool AllowIntDomain,
29189 const X86Subtarget &Subtarget,
29190 unsigned &Shuffle, MVT &ShuffleVT,
29191 unsigned &PermuteImm) {
29192 unsigned NumMaskElts = Mask.size();
29193 unsigned InputSizeInBits = MaskVT.getSizeInBits();
29194 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
29195 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
29197 bool ContainsZeros =
29198 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29200 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
29201 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
29202 // Check for lane crossing permutes.
29203 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
29204 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
29205 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
29206 Shuffle = X86ISD::VPERMI;
29207 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
29208 PermuteImm = getV4X86ShuffleImm(Mask);
29211 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
29212 SmallVector<int, 4> RepeatedMask;
29213 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
29214 Shuffle = X86ISD::VPERMI;
29215 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
29216 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
29220 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
29221 // VPERMILPD can permute with a non-repeating shuffle.
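// Bit i of the immediate selects the low/high element within the lane for
// result element i, e.g. a v4f64 mask of <1, 0, 3, 2> encodes as 0b0101.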
29222 Shuffle = X86ISD::VPERMILPI;
29223 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
29225 for (int i = 0, e = Mask.size(); i != e; ++i) {
29227 if (M == SM_SentinelUndef)
29229 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
29230 PermuteImm |= (M & 1) << i;
29236 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
29237 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
29238 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
29239 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
29240 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
29241 SmallVector<int, 4> RepeatedMask;
29242 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29243 // Narrow the repeated mask to create 32-bit element permutes.
29244 SmallVector<int, 4> WordMask = RepeatedMask;
29245 if (MaskScalarSizeInBits == 64)
29246 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
29248 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
29249 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
29250 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
29251 PermuteImm = getV4X86ShuffleImm(WordMask);
29256 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
29257 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
29258 SmallVector<int, 4> RepeatedMask;
29259 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
29260 ArrayRef<int> LoMask(Mask.data() + 0, 4);
29261 ArrayRef<int> HiMask(Mask.data() + 4, 4);
29263 // PSHUFLW: permute lower 4 elements only.
29264 if (isUndefOrInRange(LoMask, 0, 4) &&
29265 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
29266 Shuffle = X86ISD::PSHUFLW;
29267 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29268 PermuteImm = getV4X86ShuffleImm(LoMask);
29272 // PSHUFHW: permute upper 4 elements only.
29273 if (isUndefOrInRange(HiMask, 4, 8) &&
29274 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
29275 // Offset the HiMask so that we can create the shuffle immediate.
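// e.g. a HiMask of <5, 4, 7, 6> is rebased to <1, 0, 3, 2> before being
// encoded by getV4X86ShuffleImm.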
29276 int OffsetHiMask[4];
29277 for (int i = 0; i != 4; ++i)
29278 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
29280 Shuffle = X86ISD::PSHUFHW;
29281 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
29282 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
29288 // Attempt to match against byte/bit shifts.
29289 // FIXME: Add 512-bit support.
29290 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29291 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29292 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
29293 MaskScalarSizeInBits, Mask,
29294 0, Zeroable, Subtarget);
29295 if (0 < ShiftAmt) {
29296 PermuteImm = (unsigned)ShiftAmt;
29304 // Attempt to match a combined unary shuffle mask against supported binary
29305 // shuffle instructions.
29306 // TODO: Investigate sharing more of this with shuffle lowering.
29307 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
29308 bool AllowFloatDomain, bool AllowIntDomain,
29309 SDValue &V1, SDValue &V2, const SDLoc &DL,
29311 const X86Subtarget &Subtarget,
29312 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
29314 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29316 if (MaskVT.is128BitVector()) {
29317 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
29319 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
29320 Shuffle = X86ISD::MOVLHPS;
29321 SrcVT = DstVT = MVT::v4f32;
29324 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
29326 Shuffle = X86ISD::MOVHLPS;
29327 SrcVT = DstVT = MVT::v4f32;
29330 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
29331 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29333 Shuffle = X86ISD::MOVSD;
29334 SrcVT = DstVT = MaskVT;
29337 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
29338 (AllowFloatDomain || !Subtarget.hasSSE41())) {
29339 Shuffle = X86ISD::MOVSS;
29340 SrcVT = DstVT = MaskVT;
29345 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
29346 // TODO add support for 256/512-bit types.
29347 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
29348 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
29355 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
29356 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
29357 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29358 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
29359 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
29360 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
29361 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
29363 SrcVT = DstVT = MaskVT;
29364 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
29365 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
29373 static bool matchBinaryPermuteVectorShuffle(
29374 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
29375 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
29376 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
29377 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
29378 unsigned NumMaskElts = Mask.size();
29379 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
29381 // Attempt to match against PALIGNR byte rotate.
29382 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29383 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
29384 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
29385 if (0 < ByteRotation) {
29386 Shuffle = X86ISD::PALIGNR;
29387 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
29388 PermuteImm = ByteRotation;
29393 // Attempt to combine to X86ISD::BLENDI.
29394 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
29395 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
29396 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
29397 uint64_t BlendMask = 0;
29398 bool ForceV1Zero = false, ForceV2Zero = false;
29399 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
29400 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
29402 if (MaskVT == MVT::v16i16) {
29403 // We can only use v16i16 PBLENDW if the lanes are repeated.
29404 SmallVector<int, 8> RepeatedMask;
29405 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
29407 assert(RepeatedMask.size() == 8 &&
29408 "Repeated mask size doesn't match!");
29410 for (int i = 0; i < 8; ++i)
29411 if (RepeatedMask[i] >= 8)
29412 PermuteImm |= 1 << i;
29413 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29414 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29415 Shuffle = X86ISD::BLENDI;
29416 ShuffleVT = MaskVT;
29420 // Determine a type compatible with X86ISD::BLENDI.
29421 ShuffleVT = MaskVT;
29422 if (Subtarget.hasAVX2()) {
29423 if (ShuffleVT == MVT::v4i64)
29424 ShuffleVT = MVT::v8i32;
29425 else if (ShuffleVT == MVT::v2i64)
29426 ShuffleVT = MVT::v4i32;
29428 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
29429 ShuffleVT = MVT::v8i16;
29430 else if (ShuffleVT == MVT::v4i64)
29431 ShuffleVT = MVT::v4f64;
29432 else if (ShuffleVT == MVT::v8i32)
29433 ShuffleVT = MVT::v8f32;
29436 if (!ShuffleVT.isFloatingPoint()) {
29437 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
29439 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
29440 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
29441 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
29444 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
29445 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
29446 PermuteImm = (unsigned)BlendMask;
29447 Shuffle = X86ISD::BLENDI;
29453 // Attempt to combine to INSERTPS.
29454 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
29455 MaskVT.is128BitVector()) {
29456 if (Zeroable.getBoolValue() &&
29457 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
29458 Shuffle = X86ISD::INSERTPS;
29459 ShuffleVT = MVT::v4f32;
29464 // Attempt to combine to SHUFPD.
29465 if (AllowFloatDomain && EltSizeInBits == 64 &&
29466 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
29467 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29468 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29469 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
29470 Shuffle = X86ISD::SHUFP;
29471 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
29476 // Attempt to combine to SHUFPS.
29477 if (AllowFloatDomain && EltSizeInBits == 32 &&
29478 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
29479 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
29480 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
29481 SmallVector<int, 4> RepeatedMask;
29482 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
29483 // Match each half of the repeated mask to determine if it's just
29484 // referencing one of the vectors, is zeroable, or is entirely undef.
29485 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
29486 int M0 = RepeatedMask[Offset];
29487 int M1 = RepeatedMask[Offset + 1];
29489 if (isUndefInRange(RepeatedMask, Offset, 2)) {
29490 return DAG.getUNDEF(MaskVT);
29491 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
29492 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
29493 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
29494 return getZeroVector(MaskVT, Subtarget, DAG, DL);
29495 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
29496 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29497 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29499 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
29500 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29501 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29508 int ShufMask[4] = {-1, -1, -1, -1};
29509 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
29510 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
29515 Shuffle = X86ISD::SHUFP;
29516 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
29517 PermuteImm = getV4X86ShuffleImm(ShufMask);
29526 /// Combine an arbitrary chain of shuffles into a single instruction if
29527 /// possible.
29529 /// This is the leaf of the recursive combine below. When we have found some
29530 /// chain of single-use x86 shuffle instructions and accumulated the combined
29531 /// shuffle mask represented by them, this will try to pattern match that mask
29532 /// into either a single instruction if there is a special purpose instruction
29533 /// for this operation, or into a PSHUFB instruction which is a fully general
29534 /// instruction but should only be used to replace chains over a certain depth.
29535 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
29536 ArrayRef<int> BaseMask, int Depth,
29537 bool HasVariableMask, SelectionDAG &DAG,
29538 const X86Subtarget &Subtarget) {
29539 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
29540 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
29541 "Unexpected number of shuffle inputs!");
29543 // Find the inputs that enter the chain. Note that multiple uses are OK
29544 // here; we're not going to remove the operands we find.
29545 bool UnaryShuffle = (Inputs.size() == 1);
29546 SDValue V1 = peekThroughBitcasts(Inputs[0]);
29547 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
29548 : peekThroughBitcasts(Inputs[1]));
29550 MVT VT1 = V1.getSimpleValueType();
29551 MVT VT2 = V2.getSimpleValueType();
29552 MVT RootVT = Root.getSimpleValueType();
29553 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
29554 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
29555 "Vector size mismatch");
29560 unsigned NumBaseMaskElts = BaseMask.size();
29561 if (NumBaseMaskElts == 1) {
29562 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
29563 return DAG.getBitcast(RootVT, V1);
29566 unsigned RootSizeInBits = RootVT.getSizeInBits();
29567 unsigned NumRootElts = RootVT.getVectorNumElements();
29568 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
29569 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
29570 (RootVT.isFloatingPoint() && Depth >= 2) ||
29571 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
29573 // Don't combine if we are an AVX512/EVEX target and the mask element size
29574 // is different from the root element size - this would prevent writemasks
29575 // from being reused.
29576 // TODO - this currently prevents all lane shuffles from occurring.
29577 // TODO - check for writemasks usage instead of always preventing combining.
29578 // TODO - attempt to narrow Mask back to writemask size.
29579 bool IsEVEXShuffle =
29580 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
29582 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
29584 // Handle 128-bit lane shuffles of 256-bit vectors.
29585 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
29586 // we need to use the zeroing feature.
29587 // TODO - this should support binary shuffles.
29588 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
29589 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
29590 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
29591 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
29592 return SDValue(); // Nothing to do!
29593 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
29594 unsigned PermMask = 0;
29595 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
29596 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
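// e.g. a base mask of <1, 0> (swap the two 128-bit halves) encodes as 0x01,
// while a negative (undef/zero) index sets bit 3 (low half) or bit 7 (high
// half) to zero that lane.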
29598 Res = DAG.getBitcast(ShuffleVT, V1);
29599 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
29600 DAG.getUNDEF(ShuffleVT),
29601 DAG.getConstant(PermMask, DL, MVT::i8));
29602 return DAG.getBitcast(RootVT, Res);
29605 // For masks that have been widened to 128-bit elements or more,
29606 // narrow back down to 64-bit elements.
29607 SmallVector<int, 64> Mask;
29608 if (BaseMaskEltSizeInBits > 64) {
29609 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
29610 int MaskScale = BaseMaskEltSizeInBits / 64;
29611 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
29613 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
29616 unsigned NumMaskElts = Mask.size();
29617 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
29619 // Determine the effective mask value type.
29620 FloatDomain &= (32 <= MaskEltSizeInBits);
29621 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
29622 : MVT::getIntegerVT(MaskEltSizeInBits);
29623 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
29625 // Only allow legal mask types.
29626 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
29629 // Attempt to match the mask against known shuffle patterns.
29630 MVT ShuffleSrcVT, ShuffleVT;
29631 unsigned Shuffle, PermuteImm;
29633 // Which shuffle domains are permitted?
29634 // Permit domain crossing at higher combine depths.
29635 bool AllowFloatDomain = FloatDomain || (Depth > 3);
29636 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
29637 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
29639 // Determine zeroable mask elements.
29640 APInt Zeroable(NumMaskElts, 0);
29641 for (unsigned i = 0; i != NumMaskElts; ++i)
29642 if (isUndefOrZero(Mask[i]))
29643 Zeroable.setBit(i);
29645 if (UnaryShuffle) {
29646 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
29647 // directly if we don't shuffle the lower element and we shuffle the upper
29648 // (zero) elements within themselves.
29649 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
29650 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
29651 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
29652 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
29653 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
29654 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
29655 return DAG.getBitcast(RootVT, V1);
29659 SDValue NewV1 = V1; // Save operand in case early exit happens.
29660 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29661 NewV1, DL, DAG, Subtarget, Shuffle,
29662 ShuffleSrcVT, ShuffleVT) &&
29663 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29664 if (Depth == 1 && Root.getOpcode() == Shuffle)
29665 return SDValue(); // Nothing to do!
29666 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
29667 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
29668 return DAG.getBitcast(RootVT, Res);
29671 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
29672 AllowIntDomain, Subtarget, Shuffle,
29673 ShuffleVT, PermuteImm) &&
29674 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29675 if (Depth == 1 && Root.getOpcode() == Shuffle)
29676 return SDValue(); // Nothing to do!
29677 Res = DAG.getBitcast(ShuffleVT, V1);
29678 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
29679 DAG.getConstant(PermuteImm, DL, MVT::i8));
29680 return DAG.getBitcast(RootVT, Res);
29684 SDValue NewV1 = V1; // Save operands in case early exit happens.
29685 SDValue NewV2 = V2;
29686 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29687 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
29688 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
29689 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29690 if (Depth == 1 && Root.getOpcode() == Shuffle)
29691 return SDValue(); // Nothing to do!
29692 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
29693 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
29694 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
29695 return DAG.getBitcast(RootVT, Res);
29698 NewV1 = V1; // Save operands in case early exit happens.
29699 NewV2 = V2;
29700 if (matchBinaryPermuteVectorShuffle(
29701 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
29702 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
29703 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29704 if (Depth == 1 && Root.getOpcode() == Shuffle)
29705 return SDValue(); // Nothing to do!
29706 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
29707 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
29708 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
29709 DAG.getConstant(PermuteImm, DL, MVT::i8));
29710 return DAG.getBitcast(RootVT, Res);
29713 // Typically from here on, we need an integer version of MaskVT.
29714 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
29715 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
29717 // Annoyingly, SSE4A instructions don't map into the above match helpers.
29718 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
29719 uint64_t BitLen, BitIdx;
29720 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
29722 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
29723 return SDValue(); // Nothing to do!
29724 V1 = DAG.getBitcast(IntMaskVT, V1);
29725 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
29726 DAG.getConstant(BitLen, DL, MVT::i8),
29727 DAG.getConstant(BitIdx, DL, MVT::i8));
29728 return DAG.getBitcast(RootVT, Res);
29731 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
29732 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
29733 return SDValue(); // Nothing to do!
29734 V1 = DAG.getBitcast(IntMaskVT, V1);
29735 V2 = DAG.getBitcast(IntMaskVT, V2);
29736 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
29737 DAG.getConstant(BitLen, DL, MVT::i8),
29738 DAG.getConstant(BitIdx, DL, MVT::i8));
29739 return DAG.getBitcast(RootVT, Res);
29743 // Don't try to re-form single instruction chains under any circumstances now
29744 // that we've done encoding canonicalization for them.
29748 // Depth threshold above which we can efficiently use variable mask shuffles.
29749 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
29750 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
29752 bool MaskContainsZeros =
29753 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29755 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
29756 // If we have a single input lane-crossing shuffle then lower to VPERMV.
29757 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29758 ((Subtarget.hasAVX2() &&
29759 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29760 (Subtarget.hasAVX512() &&
29761 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29762 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29763 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29764 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29765 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29766 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29767 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29768 Res = DAG.getBitcast(MaskVT, V1);
29769 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
29770 return DAG.getBitcast(RootVT, Res);
29773 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
29774 // vector as the second source.
29775 if (UnaryShuffle && AllowVariableMask &&
29776 ((Subtarget.hasAVX512() &&
29777 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29778 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29779 (Subtarget.hasVLX() &&
29780 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29781 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29782 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29783 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29784 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29785 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29786 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
29787 for (unsigned i = 0; i != NumMaskElts; ++i)
29788 if (Mask[i] == SM_SentinelZero)
29789 Mask[i] = NumMaskElts + i;
29791 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29792 Res = DAG.getBitcast(MaskVT, V1);
29793 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
29794 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
29795 return DAG.getBitcast(RootVT, Res);
29798 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
29799 if (AllowVariableMask && !MaskContainsZeros &&
29800 ((Subtarget.hasAVX512() &&
29801 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29802 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29803 (Subtarget.hasVLX() &&
29804 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29805 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29806 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29807 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29808 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29809 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29810 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29811 V1 = DAG.getBitcast(MaskVT, V1);
29812 V2 = DAG.getBitcast(MaskVT, V2);
29813 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29814 return DAG.getBitcast(RootVT, Res);
29819 // See if we can combine a single input shuffle with zeros to a bit-mask,
29820 // which is much simpler than any shuffle.
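// e.g. a mask of <0, Z, 2, Z> only zeroes elements in place, so it can be
// lowered to an AND with the vector constant <-1, 0, -1, 0>.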
29821 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29822 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29823 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29824 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29825 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29826 APInt UndefElts(NumMaskElts, 0);
29827 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29828 for (unsigned i = 0; i != NumMaskElts; ++i) {
29830 if (M == SM_SentinelUndef) {
29831 UndefElts.setBit(i);
29834 if (M == SM_SentinelZero)
29836 EltBits[i] = AllOnes;
29838 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29839 Res = DAG.getBitcast(MaskVT, V1);
29840 unsigned AndOpcode =
29841 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29842 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29843 return DAG.getBitcast(RootVT, Res);
29846 // If we have a single input shuffle with different shuffle patterns in the
29847 // 128-bit lanes, use the variable mask to VPERMILPS.
29848 // TODO: Combine other mask types at higher depths.
29849 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29850 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29851 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29852 SmallVector<SDValue, 16> VPermIdx;
29853 for (int M : Mask) {
29854 SDValue Idx =
29855 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29856 VPermIdx.push_back(Idx);
29858 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29859 Res = DAG.getBitcast(MaskVT, V1);
29860 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29861 return DAG.getBitcast(RootVT, Res);
29864 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29865 // to VPERMIL2PD/VPERMIL2PS.
29866 if (AllowVariableMask && Subtarget.hasXOP() &&
29867 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29868 MaskVT == MVT::v8f32)) {
29869 // VPERMIL2 Operation.
29870 // Bits[3] - Match Bit.
29871 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29872 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
29873 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29874 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29875 SmallVector<int, 8> VPerm2Idx;
29876 unsigned M2ZImm = 0;
29877 for (int M : Mask) {
29878 if (M == SM_SentinelUndef) {
29879 VPerm2Idx.push_back(-1);
29882 if (M == SM_SentinelZero) {
29884 VPerm2Idx.push_back(8);
29887 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29888 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29889 VPerm2Idx.push_back(Index);
29891 V1 = DAG.getBitcast(MaskVT, V1);
29892 V2 = DAG.getBitcast(MaskVT, V2);
29893 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29894 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29895 DAG.getConstant(M2ZImm, DL, MVT::i8));
29896 return DAG.getBitcast(RootVT, Res);
29899 // If we have 3 or more shuffle instructions or a chain involving a variable
29900 // mask, we can replace them with a single PSHUFB instruction profitably.
29901 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29902 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29903 // more aggressive.
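// e.g. on a 128-bit vector, a v4i32 mask of <1, U, 3, Z> expands to the byte
// mask <4,5,6,7, U,U,U,U, 12,13,14,15, 255,255,255,255>; a byte value with
// the top bit set makes PSHUFB zero that result byte.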
29904 if (UnaryShuffle && AllowVariableMask &&
29905 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29906 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29907 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29908 SmallVector<SDValue, 16> PSHUFBMask;
29909 int NumBytes = RootVT.getSizeInBits() / 8;
29910 int Ratio = NumBytes / NumMaskElts;
29911 for (int i = 0; i < NumBytes; ++i) {
29912 int M = Mask[i / Ratio];
29913 if (M == SM_SentinelUndef) {
29914 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
29917 if (M == SM_SentinelZero) {
29918 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29921 M = Ratio * M + i % Ratio;
29922 assert((M / 16) == (i / 16) && "Lane crossing detected");
29923 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29925 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29926 Res = DAG.getBitcast(ByteVT, V1);
29927 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29928 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29929 return DAG.getBitcast(RootVT, Res);
29932 // With XOP, if we have a 128-bit binary input shuffle we can always combine
29933 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
29934 // slower than PSHUFB on targets that support both.
29935 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
29936 // VPPERM Mask Operation
29937 // Bits[4:0] - Byte Index (0 - 31)
29938 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
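// e.g. a selector byte of 0x12 reads byte 18 (byte 2 of the second source),
// while 0x80 (operation 4) forces the result byte to zero; hence the constant
// 128 used for SM_SentinelZero below.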
29939 SmallVector<SDValue, 16> VPPERMMask;
29941 int Ratio = NumBytes / NumMaskElts;
29942 for (int i = 0; i < NumBytes; ++i) {
29943 int M = Mask[i / Ratio];
29944 if (M == SM_SentinelUndef) {
29945 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
29948 if (M == SM_SentinelZero) {
29949 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
29952 M = Ratio * M + i % Ratio;
29953 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29955 MVT ByteVT = MVT::v16i8;
29956 V1 = DAG.getBitcast(ByteVT, V1);
29957 V2 = DAG.getBitcast(ByteVT, V2);
29958 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
29959 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
29960 return DAG.getBitcast(RootVT, Res);
29963 // Failed to find any combines.
29967 // Attempt to constant fold all of the constant source ops.
29968 // Returns the folded node if the entire shuffle is folded to a constant.
29969 // TODO: Extend this to merge multiple constant Ops and update the mask.
29970 static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
29971 ArrayRef<int> Mask, SDValue Root,
29972 bool HasVariableMask,
29974 const X86Subtarget &Subtarget) {
29975 MVT VT = Root.getSimpleValueType();
29977 unsigned SizeInBits = VT.getSizeInBits();
29978 unsigned NumMaskElts = Mask.size();
29979 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
29980 unsigned NumOps = Ops.size();
29982 // Extract constant bits from each source op.
29983 bool OneUseConstantOp = false;
29984 SmallVector<APInt, 16> UndefEltsOps(NumOps);
29985 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
29986 for (unsigned i = 0; i != NumOps; ++i) {
29987 SDValue SrcOp = Ops[i];
29988 OneUseConstantOp |= SrcOp.hasOneUse();
29989 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
29994 // Only fold if at least one of the constants is only used once or
29995 // the combined shuffle has included a variable mask shuffle; this
29996 // is to avoid constant pool bloat.
29997 if (!OneUseConstantOp && !HasVariableMask)
30000 // Shuffle the constant bits according to the mask.
30001 APInt UndefElts(NumMaskElts, 0);
30002 APInt ZeroElts(NumMaskElts, 0);
30003 APInt ConstantElts(NumMaskElts, 0);
30004 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
30005 APInt::getNullValue(MaskSizeInBits));
30006 for (unsigned i = 0; i != NumMaskElts; ++i) {
30008 if (M == SM_SentinelUndef) {
30009 UndefElts.setBit(i);
30011 } else if (M == SM_SentinelZero) {
30012 ZeroElts.setBit(i);
30015 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
30017 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
30018 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
30020 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
30021 if (SrcUndefElts[SrcMaskIdx]) {
30022 UndefElts.setBit(i);
30026 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
30027 APInt &Bits = SrcEltBits[SrcMaskIdx];
30029 ZeroElts.setBit(i);
30033 ConstantElts.setBit(i);
30034 ConstantBitData[i] = Bits;
30036 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
30038 // Create the constant data.
30039 MVT MaskSVT;
30040 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
30041 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
30042 else
30043 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
30045 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
30048 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
30049 return DAG.getBitcast(VT, CstOp);
30052 /// Fully generic combining of x86 shuffle instructions.
30054 /// This should be the last combine run over the x86 shuffle instructions. Once
30055 /// they have been fully optimized, this will recursively consider all chains
30056 /// of single-use shuffle instructions, build a generic model of the cumulative
30057 /// shuffle operation, and check for simpler instructions which implement this
30058 /// operation. We use this primarily for two purposes:
30060 /// 1) Collapse generic shuffles to specialized single instructions when
30061 /// equivalent. In most cases, this is just an encoding size win, but
30062 /// sometimes we will collapse multiple generic shuffles into a single
30063 /// special-purpose shuffle.
30064 /// 2) Look for sequences of shuffle instructions with 3 or more total
30065 /// instructions, and replace them with the slightly more expensive SSSE3
30066 /// PSHUFB instruction if available. We do this as the last combining step
30067 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
30068 /// a suitable short sequence of other instructions. The PSHUFB will either
30069 /// use a register or have to read from memory and so is slightly (but only
30070 /// slightly) more expensive than the other shuffle instructions.
30072 /// Because this is inherently a quadratic operation (for each shuffle in
30073 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
30074 /// This should never be an issue in practice as the shuffle lowering doesn't
30075 /// produce sequences of more than 8 instructions.
30077 /// FIXME: We will currently miss some cases where the redundant shuffling
30078 /// would simplify under the threshold for PSHUFB formation because of
30079 /// combine-ordering. To fix this, we should do the redundant instruction
30080 /// combining in this recursive walk.
30081 static SDValue combineX86ShufflesRecursively(
30082 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
30083 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
30084 bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
30085 // Bound the depth of our recursive combine because this is ultimately
30086 // quadratic in nature.
30087 const unsigned MaxRecursionDepth = 8;
30088 if (Depth > MaxRecursionDepth)
30091 // Directly rip through bitcasts to find the underlying operand.
30092 SDValue Op = SrcOps[SrcOpIndex];
30093 Op = peekThroughOneUseBitcasts(Op);
30095 MVT VT = Op.getSimpleValueType();
30096 if (!VT.isVector())
30097 return SDValue(); // Bail if we hit a non-vector.
30099 assert(Root.getSimpleValueType().isVector() &&
30100 "Shuffles operate on vector types!");
30101 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
30102 "Can only combine shuffles of the same vector register size.");
30104 // Extract target shuffle mask and resolve sentinels and inputs.
30105 SmallVector<int, 64> OpMask;
30106 SmallVector<SDValue, 2> OpInputs;
30107 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
30110 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
30111 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
30112 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
30114 // Add the inputs to the Ops list, avoiding duplicates.
30115 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
30117 int InputIdx0 = -1, InputIdx1 = -1;
30118 for (int i = 0, e = Ops.size(); i < e; ++i) {
30119 SDValue BC = peekThroughBitcasts(Ops[i]);
30120 if (Input0 && BC == peekThroughBitcasts(Input0))
30122 if (Input1 && BC == peekThroughBitcasts(Input1))
30126 if (Input0 && InputIdx0 < 0) {
30127 InputIdx0 = SrcOpIndex;
30128 Ops[SrcOpIndex] = Input0;
30130 if (Input1 && InputIdx1 < 0) {
30131 InputIdx1 = Ops.size();
30132 Ops.push_back(Input1);
30135 assert(((RootMask.size() > OpMask.size() &&
30136 RootMask.size() % OpMask.size() == 0) ||
30137 (OpMask.size() > RootMask.size() &&
30138 OpMask.size() % RootMask.size() == 0) ||
30139 OpMask.size() == RootMask.size()) &&
30140 "The smaller number of elements must divide the larger.");
30142 // This function can be performance-critical, so we rely on the power-of-2
30143 // knowledge that we have about the mask sizes to replace div/rem ops with
30144 // bit-masks and shifts.
30145 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
30146 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
30147 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
30148 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
30150 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
30151 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
30152 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
30153 assert((RootRatio == 1 || OpRatio == 1) &&
30154 "Must not have a ratio for both incoming and op masks!");
30156 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
30157 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
30158 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
30159 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
30160 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
30162 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
30164 // Merge this shuffle operation's mask into our accumulated mask. Note that
30165 // this shuffle's mask will be the first applied to the input, followed by the
30166 // root mask to get us all the way to the root value arrangement. The reason
30167 // for this order is that we are recursing up the operation chain.
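// e.g. if RootMask has 4 elements and OpMask has 16, then MaskWidth is 16,
// RootRatio is 4 and OpRatio is 1: each output index i with a non-negative
// root entry maps to RootMask[i >> 2] * 4 + (i & 3) before being looked up
// in OpMask.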
30168 for (unsigned i = 0; i < MaskWidth; ++i) {
30169 unsigned RootIdx = i >> RootRatioLog2;
30170 if (RootMask[RootIdx] < 0) {
30171 // This is a zero or undef lane, we're done.
30172 Mask[i] = RootMask[RootIdx];
30176 unsigned RootMaskedIdx =
30177 RootRatio == 1
30178 ? RootMask[RootIdx]
30179 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
30181 // Just insert the scaled root mask value if it references an input other
30182 // than the SrcOp we're currently inserting.
30183 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
30184 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
30185 Mask[i] = RootMaskedIdx;
30189 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
30190 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
30191 if (OpMask[OpIdx] < 0) {
30192 // The incoming lanes are zero or undef; it doesn't matter which ones we
30193 // are using.
30194 Mask[i] = OpMask[OpIdx];
30198 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
30199 unsigned OpMaskedIdx =
30200 OpRatio == 1
30201 ? OpMask[OpIdx]
30202 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
30204 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
30205 if (OpMask[OpIdx] < (int)OpMask.size()) {
30206 assert(0 <= InputIdx0 && "Unknown target shuffle input");
30207 OpMaskedIdx += InputIdx0 * MaskWidth;
30209 assert(0 <= InputIdx1 && "Unknown target shuffle input");
30210 OpMaskedIdx += InputIdx1 * MaskWidth;
30213 Mask[i] = OpMaskedIdx;
30216 // Handle the all undef/zero cases early.
30217 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
30218 return DAG.getUNDEF(Root.getValueType());
30220 // TODO - should we handle the mixed zero/undef case as well? Just returning
30221 // a zero mask will lose information on undef elements, possibly reducing
30222 // future combine possibilities.
30223 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
30224 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
30227 // Remove unused shuffle source ops.
30228 resolveTargetShuffleInputsAndMask(Ops, Mask);
30229 assert(!Ops.empty() && "Shuffle with no inputs detected");
30231 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
30233 // Update the list of shuffle nodes that have been combined so far.
30234 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
30236 CombinedNodes.push_back(Op.getNode());
30238 // See if we can recurse into each shuffle source op (if it's a target
30239 // shuffle). The source op should only be combined if it either has a
30240 // single use (i.e. current Op) or all its users have already been combined.
30241 // Don't recurse if we already have more source ops than we can combine in
30242 // the remaining recursion depth.
30243 if (Ops.size() < (MaxRecursionDepth - Depth)) {
30244 for (int i = 0, e = Ops.size(); i < e; ++i)
30245 if (Ops[i].getNode()->hasOneUse() ||
30246 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
30247 if (SDValue Res = combineX86ShufflesRecursively(
30248 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
30253 // Attempt to constant fold all of the constant source ops.
30254 if (SDValue Cst = combineX86ShufflesConstants(
30255 Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
30258 // We can only combine unary and binary shuffle mask cases.
30259 if (Ops.size() > 2)
30262 // Minor canonicalization of the accumulated shuffle mask to make it easier
30263 // to match below. All this does is detect masks with sequential pairs of
30264 // elements, and shrink them to the half-width mask. It does this in a loop
30265 // so it will reduce the size of the mask to the minimal width mask which
30266 // performs an equivalent shuffle.
30267 SmallVector<int, 64> WidenedMask;
30268 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
30269 Mask = std::move(WidenedMask);
30272 // Canonicalization of binary shuffle masks to improve pattern matching by
30273 // commuting the inputs.
30274 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
30275 ShuffleVectorSDNode::commuteMask(Mask);
30276 std::swap(Ops[0], Ops[1]);
30279 // Finally, try to combine into a single shuffle instruction.
30280 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
30284 /// Get the PSHUF-style mask from a PSHUF node.
30286 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
30287 /// PSHUF-style masks that can be reused with such instructions.
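/// For PSHUFHW the upper four lane indices are rebased to 0..3, so the
/// returned mask is always expressed as a 4-element shuffle.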
30288 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
30289 MVT VT = N.getSimpleValueType();
30290 SmallVector<int, 4> Mask;
30291 SmallVector<SDValue, 2> Ops;
30294 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
30298 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
30299 // matter. Check that the upper masks are repeats and remove them.
30300 if (VT.getSizeInBits() > 128) {
30301 int LaneElts = 128 / VT.getScalarSizeInBits();
30303 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
30304 for (int j = 0; j < LaneElts; ++j)
30305 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
30306 "Mask doesn't repeat in high 128-bit lanes!");
30308 Mask.resize(LaneElts);
30311 switch (N.getOpcode()) {
30312 case X86ISD::PSHUFD:
30314 case X86ISD::PSHUFLW:
30317 case X86ISD::PSHUFHW:
30318 Mask.erase(Mask.begin(), Mask.begin() + 4);
30319 for (int &M : Mask)
30323 llvm_unreachable("No valid shuffle instruction found!");
30327 /// Search for a combinable shuffle across a chain ending in pshufd.
30329 /// We walk up the chain and look for a combinable shuffle, skipping over
30330 /// shuffles that we could hoist this shuffle's transformation past without
30331 /// altering anything.
30333 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
30334 SelectionDAG &DAG) {
30335 assert(N.getOpcode() == X86ISD::PSHUFD &&
30336 "Called with something other than an x86 128-bit half shuffle!");
30339 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
30340 // of the shuffles in the chain so that we can form a fresh chain to replace
30342 SmallVector<SDValue, 8> Chain;
30343 SDValue V = N.getOperand(0);
30344 for (; V.hasOneUse(); V = V.getOperand(0)) {
30345 switch (V.getOpcode()) {
30347 return SDValue(); // Nothing combined!
30350 // Skip bitcasts as we always know the type for the target specific
30354 case X86ISD::PSHUFD:
30355 // Found another dword shuffle.
30358 case X86ISD::PSHUFLW:
30359 // Check that the low words (being shuffled) are the identity in the
30360 // dword shuffle, and the high words are self-contained.
30361 if (Mask[0] != 0 || Mask[1] != 1 ||
30362 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
30365 Chain.push_back(V);
30368 case X86ISD::PSHUFHW:
30369 // Check that the high words (being shuffled) are the identity in the
30370 // dword shuffle, and the low words are self-contained.
30371 if (Mask[2] != 2 || Mask[3] != 3 ||
30372 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
30375 Chain.push_back(V);
30378 case X86ISD::UNPCKL:
30379 case X86ISD::UNPCKH:
30380 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
30381 // shuffle into a preceding word shuffle.
30382 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
30383 V.getSimpleValueType().getVectorElementType() != MVT::i16)
30386 // Search for a half-shuffle which we can combine with.
30387 unsigned CombineOp =
30388 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
30389 if (V.getOperand(0) != V.getOperand(1) ||
30390 !V->isOnlyUserOf(V.getOperand(0).getNode()))
30392 Chain.push_back(V);
30393 V = V.getOperand(0);
30395 switch (V.getOpcode()) {
30397 return SDValue(); // Nothing to combine.
30399 case X86ISD::PSHUFLW:
30400 case X86ISD::PSHUFHW:
30401 if (V.getOpcode() == CombineOp)
30404 Chain.push_back(V);
30408 V = V.getOperand(0);
30412 } while (V.hasOneUse());
30415 // Break out of the loop if we break out of the switch.
30419 if (!V.hasOneUse())
30420 // We fell out of the loop without finding a viable combining instruction.
30423 // Merge this node's mask and our incoming mask.
30424 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30425 for (int &M : Mask)
30427 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
30428 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30430 // Rebuild the chain around this new shuffle.
30431 while (!Chain.empty()) {
30432 SDValue W = Chain.pop_back_val();
30434 if (V.getValueType() != W.getOperand(0).getValueType())
30435 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
30437 switch (W.getOpcode()) {
30439 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
30441 case X86ISD::UNPCKL:
30442 case X86ISD::UNPCKH:
30443 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
30446 case X86ISD::PSHUFD:
30447 case X86ISD::PSHUFLW:
30448 case X86ISD::PSHUFHW:
30449 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
30453 if (V.getValueType() != N.getValueType())
30454 V = DAG.getBitcast(N.getValueType(), V);
30456 // Return the new chain to replace N.
30460 /// Search for a combinable shuffle across a chain ending in pshuflw or
30463 /// We walk up the chain, skipping shuffles of the other half and looking
30464 /// through shuffles which switch halves trying to find a shuffle of the same
30465 /// pair of dwords.
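/// As a rough illustration, in a chain such as
///   (PSHUFLW M1 (PSHUFHW K (PSHUFLW M2 X)))
/// the intervening PSHUFHW only touches the other (high) half, so it is
/// skipped and M1 is folded into M2, leaving a single low-half shuffle.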
30466 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
30467 SelectionDAG &DAG,
30468 TargetLowering::DAGCombinerInfo &DCI) {
30469 assert(
30470 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
30471 "Called with something other than an x86 128-bit half shuffle!");
30473 unsigned CombineOpcode = N.getOpcode();
30475 // Walk up a single-use chain looking for a combinable shuffle.
30476 SDValue V = N.getOperand(0);
30477 for (; V.hasOneUse(); V = V.getOperand(0)) {
30478 switch (V.getOpcode()) {
30480 return false; // Nothing combined!
30483 // Skip bitcasts as we always know the type for the target specific
30484 // instructions.
30485 continue;
30487 case X86ISD::PSHUFLW:
30488 case X86ISD::PSHUFHW:
30489 if (V.getOpcode() == CombineOpcode)
30492 // Other-half shuffles are no-ops.
30495 // Break out of the loop if we break out of the switch.
30499 if (!V.hasOneUse())
30500 // We fell out of the loop without finding a viable combining instruction.
30503 // Combine away the bottom node as its shuffle will be accumulated into
30504 // a preceding shuffle.
30505 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30507 // Record the old value.
30510 // Merge this node's mask and our incoming mask (adjusted to account for all
30511 // the pshufd instructions encountered).
30512 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30513 for (int &M : Mask)
30515 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
30516 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30518 // Check that the shuffles didn't cancel each other out. If not, we need to
30519 // combine to the new one.
30521 // Replace the combinable shuffle with the combined one, updating all users
30522 // so that we re-evaluate the chain here.
30523 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
30528 /// Try to combine x86 target specific shuffles.
30529 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
30530 TargetLowering::DAGCombinerInfo &DCI,
30531 const X86Subtarget &Subtarget) {
30533 MVT VT = N.getSimpleValueType();
30534 SmallVector<int, 4> Mask;
30535 unsigned Opcode = N.getOpcode();
30537 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
30538 // single instruction.
30539 if (VT.getScalarSizeInBits() == 64 &&
30540 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
30541 Opcode == X86ISD::UNPCKL)) {
30542 auto BC0 = peekThroughBitcasts(N.getOperand(0));
30543 auto BC1 = peekThroughBitcasts(N.getOperand(1));
30544 EVT VT0 = BC0.getValueType();
30545 EVT VT1 = BC1.getValueType();
30546 unsigned Opcode0 = BC0.getOpcode();
30547 unsigned Opcode1 = BC1.getOpcode();
30548 if (Opcode0 == Opcode1 && VT0 == VT1 &&
30549 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
30550 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
30551 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
30553 if (Opcode == X86ISD::MOVSD) {
30554 Lo = BC1.getOperand(0);
30555 Hi = BC0.getOperand(1);
30557 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30558 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30560 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
30561 DCI.AddToWorklist(Horiz.getNode());
30562 return DAG.getBitcast(VT, Horiz);
30567 case X86ISD::VBROADCAST: {
30568 // If broadcasting from another shuffle, attempt to simplify it.
30569 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
30570 SDValue Src = N.getOperand(0);
30571 SDValue BC = peekThroughBitcasts(Src);
30572 EVT SrcVT = Src.getValueType();
30573 EVT BCVT = BC.getValueType();
30574 if (isTargetShuffle(BC.getOpcode()) &&
30575 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
30576 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
30577 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
30579 for (unsigned i = 0; i != Scale; ++i)
30580 DemandedMask[i] = i;
30581 if (SDValue Res = combineX86ShufflesRecursively(
30582 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
30583 /*HasVarMask*/ false, DAG, Subtarget))
30584 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
30585 DAG.getBitcast(SrcVT, Res));
30589 case X86ISD::PSHUFD:
30590 case X86ISD::PSHUFLW:
30591 case X86ISD::PSHUFHW:
30592 Mask = getPSHUFShuffleMask(N);
30593 assert(Mask.size() == 4);
30595 case X86ISD::UNPCKL: {
30596 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
30597 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
30598 // moves upper half elements into the lower half part. For example:
30600 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
30602 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
30604 // will be combined to:
30606 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
30608 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
30609 // happen due to advanced instructions.
30610 if (!VT.is128BitVector())
30613 auto Op0 = N.getOperand(0);
30614 auto Op1 = N.getOperand(1);
30615 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
30616 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
30618 unsigned NumElts = VT.getVectorNumElements();
30619 SmallVector<int, 8> ExpectedMask(NumElts, -1);
30620 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
30623 auto ShufOp = Op1.getOperand(0);
30624 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
30625 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
30629 case X86ISD::BLENDI: {
30630 SDValue V0 = N->getOperand(0);
30631 SDValue V1 = N->getOperand(1);
30632 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
30633 "Unexpected input vector types");
30635 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
30636 // operands and changing the mask to 1. This saves us a bunch of
30637 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
30638 // x86InstrInfo knows how to commute this back after instruction selection
30639 // if it would help register allocation.
30641 // TODO: If optimizing for size or a processor that doesn't suffer from
30642 // partial register update stalls, this should be transformed into a MOVSD
30643 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
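// As a sketch of the canonicalization below:
//   (v2f64 X86ISD::BLENDI V0, V1, 2) --> (v2f64 X86ISD::BLENDI V1, V0, 1)
// Both forms take element 0 from V0 and element 1 from V1; only the operand
// order and the immediate differ.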
30645 if (VT == MVT::v2f64)
30646 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
30647 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
30648 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
30649 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
30654 case X86ISD::MOVSD:
30655 case X86ISD::MOVSS: {
30656 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
30657 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
30658 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
30659 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
30660 if (isZero0 && isZero1)
30663 // We often lower to MOVSD/MOVSS from integer as well as native float
30664 // types; remove unnecessary domain-crossing bitcasts if we can to make it
30665 // easier to combine shuffles later on. We've already accounted for the
30666 // domain switching cost when we decided to lower with it.
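// For example (illustrative only), when VT is v2i64 but both inputs are
// really v2f64 values wrapped in bitcasts:
//   (v2i64 MOVSD (bitcast X:v2f64), (bitcast Y:v2f64))
//     --> (bitcast (v2f64 MOVSD X, Y))
// so the bitcasts no longer sit between the shuffle and its float operands.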
30667 bool isFloat = VT.isFloatingPoint();
30668 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
30669 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
30670 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
30671 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
30672 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
30673 V0 = DAG.getBitcast(NewVT, V0);
30674 V1 = DAG.getBitcast(NewVT, V1);
30675 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
30680 case X86ISD::INSERTPS: {
30681 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
30682 SDValue Op0 = N.getOperand(0);
30683 SDValue Op1 = N.getOperand(1);
30684 SDValue Op2 = N.getOperand(2);
30685 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
30686 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
30687 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
30688 unsigned ZeroMask = InsertPSMask & 0xF;
30690 // If we zero out all elements from Op0 then we don't need to reference it.
30691 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
30692 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
30693 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30695 // If we zero out the element from Op1 then we don't need to reference it.
30696 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
30697 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30698 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30700 // Attempt to merge insertps Op1 with an inner target shuffle node.
30701 SmallVector<int, 8> TargetMask1;
30702 SmallVector<SDValue, 2> Ops1;
30703 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
30704 int M = TargetMask1[SrcIdx];
30705 if (isUndefOrZero(M)) {
30706 // Zero/UNDEF insertion - zero out element and remove dependency.
30707 InsertPSMask |= (1u << DstIdx);
30708 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30709 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30711 // Update insertps mask srcidx and reference the source input directly.
30712 assert(0 <= M && M < 8 && "Shuffle index out of range");
30713 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
30714 Op1 = Ops1[M < 4 ? 0 : 1];
30715 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30716 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30719 // Attempt to merge insertps Op0 with an inner target shuffle node.
30720 SmallVector<int, 8> TargetMask0;
30721 SmallVector<SDValue, 2> Ops0;
30722 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
30725 bool Updated = false;
30726 bool UseInput00 = false;
30727 bool UseInput01 = false;
30728 for (int i = 0; i != 4; ++i) {
30729 int M = TargetMask0[i];
30730 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
30731 // No change if element is already zero or the inserted element.
30733 } else if (isUndefOrZero(M)) {
30734 // If the target mask is undef/zero then we must zero the element.
30735 InsertPSMask |= (1u << i);
30740 // The input vector element must be inline.
30741 if (M != i && M != (i + 4))
30744 // Determine which inputs of the target shuffle we're using.
30745 UseInput00 |= (0 <= M && M < 4);
30746 UseInput01 |= (4 <= M);
30749 // If we're not using both inputs of the target shuffle then use the
30750 // referenced input directly.
30751 if (UseInput00 && !UseInput01) {
30754 } else if (!UseInput00 && UseInput01) {
30760 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30761 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30769 // Nuke no-op shuffles that show up after combining.
30770 if (isNoopShuffleMask(Mask))
30771 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30773 // Look for simplifications involving one or two shuffle instructions.
30774 SDValue V = N.getOperand(0);
30775 switch (N.getOpcode()) {
30778 case X86ISD::PSHUFLW:
30779 case X86ISD::PSHUFHW:
30780 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
30782 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
30783 return SDValue(); // We combined away this shuffle, so we're done.
30785 // See if this reduces to a PSHUFD which is no more expensive and can
30786 // combine with more operations. Note that it has to at least flip the
30787 // dwords as otherwise it would have been removed as a no-op.
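// For example (sketch): a PSHUFLW with mask <2,3,0,1> exchanges the first
// low word pair with the second, i.e. it swaps the two low dwords, which is
// a PSHUFD with mask <1,0,2,3> on the v4i32 bitcast of the input.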
30788 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
30789 int DMask[] = {0, 1, 2, 3};
30790 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
30791 DMask[DOffset + 0] = DOffset + 1;
30792 DMask[DOffset + 1] = DOffset + 0;
30793 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30794 V = DAG.getBitcast(DVT, V);
30795 DCI.AddToWorklist(V.getNode());
30796 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
30797 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
30798 DCI.AddToWorklist(V.getNode());
30799 return DAG.getBitcast(VT, V);
30802 // Look for shuffle patterns which can be implemented as a single unpack.
30803 // FIXME: This doesn't handle the location of the PSHUFD generically, and
30804 // only works when we have a PSHUFD followed by two half-shuffles.
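// For example (sketch): if mapping the two half-shuffle word masks through
// the underlying PSHUFD's dword mask yields <0,0,1,1,2,2,3,3>, all three
// shuffles are equivalent to (UNPCKL Src, Src) on the PSHUFD's input;
// <4,4,5,5,6,6,7,7> similarly maps to UNPCKH.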
30805 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30806 (V.getOpcode() == X86ISD::PSHUFLW ||
30807 V.getOpcode() == X86ISD::PSHUFHW) &&
30808 V.getOpcode() != N.getOpcode() &&
30810 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30811 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
30812 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30813 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30814 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30815 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30817 for (int i = 0; i < 4; ++i) {
30818 WordMask[i + NOffset] = Mask[i] + NOffset;
30819 WordMask[i + VOffset] = VMask[i] + VOffset;
30821 // Map the word mask through the DWord mask.
30823 for (int i = 0; i < 8; ++i)
30824 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30825 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30826 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30827 // We can replace all three shuffles with an unpack.
30828 V = DAG.getBitcast(VT, D.getOperand(0));
30829 DCI.AddToWorklist(V.getNode());
30830 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30839 case X86ISD::PSHUFD:
30840 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30849 /// Checks if the shuffle mask takes subsequent elements
30850 /// alternately from two vectors.
30851 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
30852 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
30854 int ParitySrc[2] = {-1, -1};
30855 unsigned Size = Mask.size();
30856 for (unsigned i = 0; i != Size; ++i) {
30861 // Make sure we are using the matching element from the input.
30862 if ((M % Size) != i)
30865 // Make sure we use the same input for all elements of the same parity.
30866 int Src = M / Size;
30867 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
30869 ParitySrc[i % 2] = Src;
30872 // Make sure each input is used.
30873 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
30876 Op0Even = ParitySrc[0] == 0;
30880 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
30881 /// operation. If true is returned, the operands of the ADDSUB(SUBADD) operation
30882 /// are written to the parameters \p Opnd0 and \p Opnd1.
30884 /// We combine shuffle to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes
30885 /// so it is easier to generically match. We also insert dummy vector shuffle
30886 /// nodes for the operands which explicitly discard the lanes which are unused
30887 /// by this operation to try to flow through the rest of the combiner
30888 /// the fact that they're unused.
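/// As a rough example, for v4f32:
///   (shuffle (fsub A, B), (fadd A, B), <0, 5, 2, 7>)
/// takes the even lanes from the FSUB and the odd lanes from the FADD, which
/// is exactly ADDSUB(A, B).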
30889 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30890 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
30893 EVT VT = N->getValueType(0);
30894 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30895 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
30896 !VT.getSimpleVT().isFloatingPoint())
30899 // We only handle target-independent shuffles.
30900 // FIXME: It would be easy and harmless to use the target shuffle mask
30901 // extraction tool to support more.
30902 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30905 SDValue V1 = N->getOperand(0);
30906 SDValue V2 = N->getOperand(1);
30908 // Make sure we have an FADD and an FSUB.
30909 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
30910 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
30911 V1.getOpcode() == V2.getOpcode())
30914 // If there are other uses of these operations we can't fold them.
30915 if (!V1->hasOneUse() || !V2->hasOneUse())
30918 // Ensure that both operations have the same operands. Note that we can
30919 // commute the FADD operands.
30921 if (V1.getOpcode() == ISD::FSUB) {
30922 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
30923 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30924 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30927 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
30928 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
30929 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
30930 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
30934 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30936 if (!isAddSubOrSubAddMask(Mask, Op0Even))
30939 // It's a subadd if the vector in the even parity is an FADD.
30940 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
30941 : V2->getOpcode() == ISD::FADD;
30948 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
30949 static SDValue combineShuffleToFMAddSub(SDNode *N,
30950 const X86Subtarget &Subtarget,
30951 SelectionDAG &DAG) {
30952 // We only handle target-independent shuffles.
30953 // FIXME: It would be easy and harmless to use the target shuffle mask
30954 // extraction tool to support more.
30955 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30958 MVT VT = N->getSimpleValueType(0);
30959 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30960 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
30963 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
30964 SDValue Op0 = N->getOperand(0);
30965 SDValue Op1 = N->getOperand(1);
30966 SDValue FMAdd = Op0, FMSub = Op1;
30967 if (FMSub.getOpcode() != X86ISD::FMSUB)
30968 std::swap(FMAdd, FMSub);
30970 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
30971 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
30972 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
30973 FMAdd.getOperand(2) != FMSub.getOperand(2))
30976 // Check for correct shuffle mask.
30977 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30979 if (!isAddSubOrSubAddMask(Mask, Op0Even))
30982 // FMAddSub takes the zeroth operand from the FMSub node.
30984 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
30985 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
30986 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
30987 FMAdd.getOperand(2));
30990 /// Try to combine a shuffle into a target-specific add-sub or
30991 /// mul-add-sub node.
30992 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
30993 const X86Subtarget &Subtarget,
30994 SelectionDAG &DAG) {
30995 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
30998 SDValue Opnd0, Opnd1;
31000 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
31003 MVT VT = N->getSimpleValueType(0);
31006 // Try to generate X86ISD::FMADDSUB node here.
31008 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
31009 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31010 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
31016 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
31017 // the ADDSUB idiom has been successfully recognized. There are no known
31018 // X86 targets with 512-bit ADDSUB instructions!
31019 if (VT.is512BitVector())
31022 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
31025 // We are looking for a shuffle where both sources are concatenated with undef
31026 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
31027 // if we can express this as a single-source shuffle, that's preferable.
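// As a sketch, for v8i32 with v4i32 halves t1 and t2:
//   (shuffle (concat t1, undef), (concat t2, undef), <0,8,1,9,2,10,3,11>)
//     --> (shuffle (concat t1, t2), undef, <0,4,1,5,2,6,3,7>)
// which a single VPERMD can then handle.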
31028 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
31029 const X86Subtarget &Subtarget) {
31030 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
31033 EVT VT = N->getValueType(0);
31035 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
31036 if (!VT.is128BitVector() && !VT.is256BitVector())
31039 if (VT.getVectorElementType() != MVT::i32 &&
31040 VT.getVectorElementType() != MVT::i64 &&
31041 VT.getVectorElementType() != MVT::f32 &&
31042 VT.getVectorElementType() != MVT::f64)
31045 SDValue N0 = N->getOperand(0);
31046 SDValue N1 = N->getOperand(1);
31048 // Check that both sources are concats with undef.
31049 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
31050 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
31051 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
31052 !N1.getOperand(1).isUndef())
31055 // Construct the new shuffle mask. Elements from the first source retain their
31056 // index, but elements from the second source no longer need to skip an undef.
31057 SmallVector<int, 8> Mask;
31058 int NumElts = VT.getVectorNumElements();
31060 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31061 for (int Elt : SVOp->getMask())
31062 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
31065 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
31067 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
31070 /// Eliminate a redundant shuffle of a horizontal math op.
31071 static SDValue foldShuffleOfHorizOp(SDNode *N) {
31072 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
31075 SDValue HOp = N->getOperand(0);
31076 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
31077 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
31080 // 128-bit horizontal math instructions are defined to operate on adjacent
31081 // lanes of each operand as:
31082 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
31083 // ...similarly for v2f64 and v8i16.
31084 // TODO: 256-bit is not the same because...x86.
31085 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
31088 // When the operands of a horizontal math op are identical, the low half of
31089 // the result is the same as the high half. If the shuffle is also replicating
31090 // low and high halves, we don't need the shuffle.
31091 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
31092 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31093 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
31094 // but this should be tied to whatever horizontal op matching and shuffle
31095 // canonicalization are producing.
31096 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
31097 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
31098 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
31104 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
31105 TargetLowering::DAGCombinerInfo &DCI,
31106 const X86Subtarget &Subtarget) {
31108 EVT VT = N->getValueType(0);
31109 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31110 // If we have legalized the vector types, look for blends of FADD and FSUB
31111 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
31112 if (TLI.isTypeLegal(VT)) {
31113 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
31116 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
31120 // During Type Legalization, when promoting illegal vector types,
31121 // the backend might introduce new shuffle dag nodes and bitcasts.
31123 // This code performs the following transformation:
31124 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
31125 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
31127 // We do this only if both the bitcast and the BINOP dag nodes have
31128 // one use. Also, perform this transformation only if the new binary
31129 // operation is legal. This is to avoid introducing dag nodes that
31130 // potentially need to be further expanded (or custom lowered) into a
31131 // less optimal sequence of dag nodes.
31132 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
31133 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
31134 N->getOperand(0).getOpcode() == ISD::BITCAST &&
31135 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
31136 SDValue N0 = N->getOperand(0);
31137 SDValue N1 = N->getOperand(1);
31139 SDValue BC0 = N0.getOperand(0);
31140 EVT SVT = BC0.getValueType();
31141 unsigned Opcode = BC0.getOpcode();
31142 unsigned NumElts = VT.getVectorNumElements();
31144 if (BC0.hasOneUse() && SVT.isVector() &&
31145 SVT.getVectorNumElements() * 2 == NumElts &&
31146 TLI.isOperationLegal(Opcode, VT)) {
31147 bool CanFold = false;
31153 // isOperationLegal lies for integer ops on floating point types.
31154 CanFold = VT.isInteger();
31159 // isOperationLegal lies for floating point ops on integer types.
31160 CanFold = VT.isFloatingPoint();
31164 unsigned SVTNumElts = SVT.getVectorNumElements();
31165 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31166 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
31167 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
31168 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
31169 CanFold = SVOp->getMaskElt(i) < 0;
31172 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
31173 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
31174 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
31175 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
31180 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
31181 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
31182 // consecutive, non-overlapping, and in the right order.
31183 SmallVector<SDValue, 16> Elts;
31184 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
31185 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
31186 Elts.push_back(Elt);
31193 if (Elts.size() == VT.getVectorNumElements())
31195 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
31198 // For AVX2, we sometimes want to combine
31199 // (vector_shuffle <mask> (concat_vectors t1, undef)
31200 // (concat_vectors t2, undef))
31202 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
31203 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
31204 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
31207 if (isTargetShuffle(N->getOpcode())) {
31209 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
31212 // Try recursively combining arbitrary sequences of x86 shuffle
31213 // instructions into higher-order shuffles. We do this after combining
31214 // specific PSHUF instruction sequences into their minimal form so that we
31215 // can evaluate how many specialized shuffle instructions are involved in
31216 // a particular chain.
31217 if (SDValue Res = combineX86ShufflesRecursively(
31218 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31219 /*HasVarMask*/ false, DAG, Subtarget)) {
31220 DCI.CombineTo(N, Res);
31228 /// Check if a vector extract from a target-specific shuffle of a load can be
31229 /// folded into a single element load.
31230 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
31231 /// shuffles have been custom lowered so we need to handle those here.
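/// For example (rough sketch), for
///   (i32 (extract_vector_elt (X86ISD::PSHUFD (load p), <2,3,0,1>), 0))
/// lane 0 of the shuffle comes from element 2 of the load, so the extract can
/// ultimately be folded into a narrow scalar load of that element.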
31232 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
31233 TargetLowering::DAGCombinerInfo &DCI) {
31234 if (DCI.isBeforeLegalizeOps())
31237 SDValue InVec = N->getOperand(0);
31238 SDValue EltNo = N->getOperand(1);
31239 EVT EltVT = N->getValueType(0);
31241 if (!isa<ConstantSDNode>(EltNo))
31244 EVT OriginalVT = InVec.getValueType();
31246 // Peek through bitcasts, don't duplicate a load with other uses.
31247 InVec = peekThroughOneUseBitcasts(InVec);
31249 EVT CurrentVT = InVec.getValueType();
31250 if (!CurrentVT.isVector() ||
31251 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
31254 if (!isTargetShuffle(InVec.getOpcode()))
31257 // Don't duplicate a load with other uses.
31258 if (!InVec.hasOneUse())
31261 SmallVector<int, 16> ShuffleMask;
31262 SmallVector<SDValue, 2> ShuffleOps;
31264 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
31265 ShuffleOps, ShuffleMask, UnaryShuffle))
31268 // Select the input vector, guarding against an out-of-range extract index.
31269 unsigned NumElems = CurrentVT.getVectorNumElements();
31270 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
31271 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
31273 if (Idx == SM_SentinelZero)
31274 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
31275 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
31276 if (Idx == SM_SentinelUndef)
31277 return DAG.getUNDEF(EltVT);
31279 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
31280 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
31283 // If inputs to shuffle are the same for both ops, then allow 2 uses
31284 unsigned AllowedUses =
31285 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
31287 if (LdNode.getOpcode() == ISD::BITCAST) {
31288 // Don't duplicate a load with other uses.
31289 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
31292 AllowedUses = 1; // only allow 1 load use if we have a bitcast
31293 LdNode = LdNode.getOperand(0);
31296 if (!ISD::isNormalLoad(LdNode.getNode()))
31299 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
31301 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
31304 // If there's a bitcast before the shuffle, check if the load type and
31305 // alignment is valid.
31306 unsigned Align = LN0->getAlignment();
31307 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31308 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
31309 EltVT.getTypeForEVT(*DAG.getContext()));
31311 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
31314 // All checks match so transform back to vector_shuffle so that DAG combiner
31315 // can finish the job
31318 // Create shuffle node taking into account the case that it's a unary shuffle
31319 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
31320 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
31322 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
31323 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
31327 // Try to match patterns such as
31328 // (i16 bitcast (v16i1 x))
31330 // (i16 movmsk (16i8 sext (v16i1 x)))
31331 // before the illegal vector is scalarized on subtargets that don't have legal
31332 // vxi1 types.
31333 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
31334 const X86Subtarget &Subtarget) {
31335 EVT VT = BitCast.getValueType();
31336 SDValue N0 = BitCast.getOperand(0);
31337 EVT VecVT = N0->getValueType(0);
31339 if (!VT.isScalarInteger() || !VecVT.isSimple())
31342 // With AVX512 vxi1 types are legal and we prefer using k-regs.
31343 // MOVMSK is supported in SSE2 or later.
31344 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
31347 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
31348 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
31349 // v8i16 and v16i16.
31350 // For these two cases, we can shuffle the upper element bytes to a
31351 // consecutive sequence at the start of the vector and treat the results as
31352 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
31353 // for v16i16 this is not the case, because the shuffle is expensive, so we
31354 // avoid sign-extending to this type entirely.
31355 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
31356 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
31358 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
31359 switch (VecVT.getSimpleVT().SimpleTy) {
31363 SExtVT = MVT::v2i64;
31364 FPCastVT = MVT::v2f64;
31367 SExtVT = MVT::v4i32;
31368 FPCastVT = MVT::v4f32;
31369 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
31370 // sign-extend to a 256-bit operation to avoid truncation.
31371 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31372 N0->getOperand(0).getValueType().is256BitVector()) {
31373 SExtVT = MVT::v4i64;
31374 FPCastVT = MVT::v4f64;
31378 SExtVT = MVT::v8i16;
31379 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
31380 // sign-extend to a 256-bit operation to match the compare.
31381 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
31382 // 256-bit because the shuffle is cheaper than sign extending the result of
31384 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31385 (N0->getOperand(0).getValueType().is256BitVector() ||
31386 N0->getOperand(0).getValueType().is512BitVector())) {
31387 SExtVT = MVT::v8i32;
31388 FPCastVT = MVT::v8f32;
31392 SExtVT = MVT::v16i8;
31393 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
31394 // it is not profitable to sign-extend to 256-bit because this will
31395 // require an extra cross-lane shuffle which is more expensive than
31396 // truncating the result of the compare to 128-bits.
31399 SExtVT = MVT::v32i8;
31404 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
31406 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
31407 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31408 return DAG.getZExtOrTrunc(V, DL, VT);
31411 if (SExtVT == MVT::v8i16) {
31412 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
31413 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
31414 DAG.getUNDEF(MVT::v8i16));
31416 assert(SExtVT.getScalarType() != MVT::i16 &&
31417 "Vectors of i16 must be packed");
31418 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
31419 V = DAG.getBitcast(FPCastVT, V);
31420 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31421 return DAG.getZExtOrTrunc(V, DL, VT);
31424 // Convert a vXi1 constant build vector to the same width scalar integer.
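// For example, (v4i1 <1, 0, 1, 1>) becomes the i4 constant 0b1101, with
// element 0 mapped to bit 0.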
31425 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
31426 EVT SrcVT = Op.getValueType();
31427 assert(SrcVT.getVectorElementType() == MVT::i1 &&
31428 "Expected a vXi1 vector");
31429 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
31430 "Expected a constant build vector");
31432 APInt Imm(SrcVT.getVectorNumElements(), 0);
31433 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
31434 SDValue In = Op.getOperand(Idx);
31435 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
31438 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
31439 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
31442 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31443 TargetLowering::DAGCombinerInfo &DCI,
31444 const X86Subtarget &Subtarget) {
31445 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
31447 if (!DCI.isBeforeLegalizeOps())
31450 // Only do this if we have k-registers.
31451 if (!Subtarget.hasAVX512())
31454 EVT DstVT = N->getValueType(0);
31455 SDValue Op = N->getOperand(0);
31456 EVT SrcVT = Op.getValueType();
31458 if (!Op.hasOneUse())
31461 // Look for logic ops.
31462 if (Op.getOpcode() != ISD::AND &&
31463 Op.getOpcode() != ISD::OR &&
31464 Op.getOpcode() != ISD::XOR)
31467 // Make sure we have a bitcast between mask registers and a scalar type.
31468 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31469 DstVT.isScalarInteger()) &&
31470 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
31471 SrcVT.isScalarInteger()))
31474 SDValue LHS = Op.getOperand(0);
31475 SDValue RHS = Op.getOperand(1);
31477 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
31478 LHS.getOperand(0).getValueType() == DstVT)
31479 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
31480 DAG.getBitcast(DstVT, RHS));
31482 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
31483 RHS.getOperand(0).getValueType() == DstVT)
31484 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31485 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
31487 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
31488 // Most of these have to move a constant from the scalar domain anyway.
31489 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
31490 RHS = combinevXi1ConstantToInteger(RHS, DAG);
31491 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31492 DAG.getBitcast(DstVT, LHS), RHS);
31498 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
31499 const X86Subtarget &Subtarget) {
31501 unsigned NumElts = N.getNumOperands();
31503 auto *BV = cast<BuildVectorSDNode>(N);
31504 SDValue Splat = BV->getSplatValue();
31506 // Build MMX element from integer GPR or SSE float values.
31507 auto CreateMMXElement = [&](SDValue V) {
31509 return DAG.getUNDEF(MVT::x86mmx);
31510 if (V.getValueType().isFloatingPoint()) {
31511 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
31512 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31513 V = DAG.getBitcast(MVT::v2i64, V);
31514 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31516 V = DAG.getBitcast(MVT::i32, V);
31518 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31520 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31523 // Convert build vector ops to MMX data in the bottom elements.
31524 SmallVector<SDValue, 8> Ops;
31526 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
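// For instance (sketch), splatting a single i8: PUNPCKLBW duplicates the
// byte into a 16-bit lane, then PSHUFW repeats that 16-bit lane across the
// 64-bit MMX register.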
31528 if (Splat.isUndef())
31529 return DAG.getUNDEF(MVT::x86mmx);
31531 Splat = CreateMMXElement(Splat);
31533 if (Subtarget.hasSSE1()) {
31534 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
31536 Splat = DAG.getNode(
31537 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31538 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31541 // Use PSHUFW to repeat 16-bit elements.
31542 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
31543 return DAG.getNode(
31544 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31545 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31546 DAG.getConstant(ShufMask, DL, MVT::i8));
31548 Ops.append(NumElts, Splat);
31550 for (unsigned i = 0; i != NumElts; ++i)
31551 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31554 // Use tree of PUNPCKLs to build up general MMX vector.
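// For example (sketch), eight i8 elements are merged pairwise: PUNPCKLBW
// joins bytes into words, PUNPCKLWD joins words into dwords, and PUNPCKLDQ
// joins the remaining two dwords into the final 64-bit value.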
31555 while (Ops.size() > 1) {
31556 unsigned NumOps = Ops.size();
31557 unsigned IntrinOp =
31558 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31559 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31560 : Intrinsic::x86_mmx_punpcklbw));
31561 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31562 for (unsigned i = 0; i != NumOps; i += 2)
31563 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31564 Ops[i], Ops[i + 1]);
31565 Ops.resize(NumOps / 2);
31571 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
31572 TargetLowering::DAGCombinerInfo &DCI,
31573 const X86Subtarget &Subtarget) {
31574 SDValue N0 = N->getOperand(0);
31575 EVT VT = N->getValueType(0);
31576 EVT SrcVT = N0.getValueType();
31578 // Try to match patterns such as
31579 // (i16 bitcast (v16i1 x))
31581 // (i16 movmsk (16i8 sext (v16i1 x)))
31582 // before the setcc result is scalarized on subtargets that don't have legal
31583 // vxi1 types.
31584 if (DCI.isBeforeLegalize()) {
31585 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
31588 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31589 // type, widen both sides to avoid a trip through memory.
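// For example (sketch), an i2 -> v2i1 bitcast becomes roughly:
//   (v2i1 (extract_subvector (v8i1 (bitcast (i8 (any_extend X)))), 0))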
31590 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
31591 Subtarget.hasAVX512()) {
31593 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
31594 N0 = DAG.getBitcast(MVT::v8i1, N0);
31595 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
31596 DAG.getIntPtrConstant(0, dl));
31599 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31600 // type, widen both sides to avoid a trip through memory.
31601 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
31602 Subtarget.hasAVX512()) {
31604 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
31605 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
31607 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
31608 N0 = DAG.getBitcast(MVT::i8, N0);
31609 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
31613 // Since MMX types are special and don't usually play with other vector types,
31614 // it's better to handle them early to be sure we emit efficient code by
31615 // avoiding store-load conversions.
31616 if (VT == MVT::x86mmx) {
31617 // Detect MMX constant vectors.
31619 SmallVector<APInt, 1> EltBits;
31620 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
31622 // Handle zero-extension of i32 with MOVD.
31623 if (EltBits[0].countLeadingZeros() >= 32)
31624 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
31625 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
31626 // Else, bitcast to a double.
31627 // TODO - investigate supporting sext 32-bit immediates on x86_64.
31628 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
31629 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
31632 // Detect bitcasts to x86mmx low word.
31633 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31634 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
31635 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
31636 bool LowUndef = true, AllUndefOrZero = true;
31637 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
31638 SDValue Op = N0.getOperand(i);
31639 LowUndef &= Op.isUndef() || (i >= e/2);
31640 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
31642 if (AllUndefOrZero) {
31643 SDValue N00 = N0.getOperand(0);
31645 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
31646 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
31647 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
31651 // Detect bitcasts of 64-bit build vectors and convert to a
31652 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the
31653 // lowest element.
31654 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31655 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31656 SrcVT == MVT::v8i8))
31657 return createMMXBuildVector(N0, DAG, Subtarget);
31659 // Detect bitcasts from element or subvector extractions to x86mmx.
31660 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
31661 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
31662 isNullConstant(N0.getOperand(1))) {
31663 SDValue N00 = N0.getOperand(0);
31664 if (N00.getValueType().is128BitVector())
31665 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
31666 DAG.getBitcast(MVT::v2i64, N00));
31669 // Detect bitcasts from FP_TO_SINT to x86mmx.
31670 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
31672 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
31673 DAG.getUNDEF(MVT::v2i32));
31674 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
31675 DAG.getBitcast(MVT::v2i64, Res));
31679 // Try to remove a bitcast of a constant vXi1 vector. We have to legalize
31680 // most of these to scalar anyway.
31681 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
31682 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31683 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
31684 return combinevXi1ConstantToInteger(N0, DAG);
31687 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
31688 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
31689 isa<ConstantSDNode>(N0)) {
31690 auto *C = cast<ConstantSDNode>(N0);
31691 if (C->isAllOnesValue())
31692 return DAG.getConstant(1, SDLoc(N0), VT);
31693 if (C->isNullValue())
31694 return DAG.getConstant(0, SDLoc(N0), VT);
31697 // Try to remove bitcasts from input and output of mask arithmetic to
31698 // remove GPR<->K-register crossings.
31699 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
31702 // Convert a bitcasted integer logic operation that has one bitcasted
31703 // floating-point operand into a floating-point logic operation. This may
31704 // create a load of a constant, but that is cheaper than materializing the
31705 // constant in an integer register and transferring it to an SSE register or
31706 // transferring the SSE operand to integer register and back.
31708 switch (N0.getOpcode()) {
31709 case ISD::AND: FPOpcode = X86ISD::FAND; break;
31710 case ISD::OR: FPOpcode = X86ISD::FOR; break;
31711 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
31712 default: return SDValue();
31715 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
31716 (Subtarget.hasSSE2() && VT == MVT::f64)))
31719 SDValue LogicOp0 = N0.getOperand(0);
31720 SDValue LogicOp1 = N0.getOperand(1);
31723 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
31724 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
31725 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
31726 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
31727 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
31728 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
31730 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
31731 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
31732 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
31733 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
31734 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
31735 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
31741 // Match a binop + shuffle pyramid that represents a horizontal reduction over
31742 // the elements of a vector.
31743 // Returns the vector that is being reduced on, or SDValue() if a reduction
31744 // was not matched.
31745 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
31746 ArrayRef<ISD::NodeType> CandidateBinOps) {
31747 // The pattern must end in an extract from index 0.
31748 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
31749 !isNullConstant(Extract->getOperand(1)))
31752 SDValue Op = Extract->getOperand(0);
31753 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
31755 // Match against one of the candidate binary ops.
31756 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
31757 return Op.getOpcode() == unsigned(BinOp);
31761 // At each stage, we're looking for something that looks like:
31762 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
31763 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
31764 // i32 undef, i32 undef, i32 undef, i32 undef>
31765 // %a = binop <8 x i32> %op, %s
31766 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
31767 // we expect something like:
31768 // <4,5,6,7,u,u,u,u>
31769 // <2,3,u,u,u,u,u,u>
31770 // <1,u,u,u,u,u,u,u>
31771 unsigned CandidateBinOp = Op.getOpcode();
31772 for (unsigned i = 0; i < Stages; ++i) {
31773 if (Op.getOpcode() != CandidateBinOp)
31776 ShuffleVectorSDNode *Shuffle =
31777 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
31779 Op = Op.getOperand(1);
31781 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
31782 Op = Op.getOperand(0);
31785 // The first operand of the shuffle should be the same as the other operand
31786 // of the binop.
31787 if (!Shuffle || Shuffle->getOperand(0) != Op)
31790 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
31791 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
31792 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
31796 BinOp = CandidateBinOp;
31800 // Given a select, detect the following pattern:
31801 // 1: %2 = zext <N x i8> %0 to <N x i32>
31802 // 2: %3 = zext <N x i8> %1 to <N x i32>
31803 // 3: %4 = sub nsw <N x i32> %2, %3
31804 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
31805 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
31806 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
31807 // This is useful as it is the input into a SAD pattern.
31808 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
31810 // Check the condition of the select instruction is greater-than.
31811 SDValue SetCC = Select->getOperand(0);
31812 if (SetCC.getOpcode() != ISD::SETCC)
31814 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
31815 if (CC != ISD::SETGT && CC != ISD::SETLT)
31818 SDValue SelectOp1 = Select->getOperand(1);
31819 SDValue SelectOp2 = Select->getOperand(2);
31821 // The following instructions assume SelectOp1 is the subtraction operand
31822 // and SelectOp2 is the negation operand.
31823 // In the case of SETLT this is the other way around.
31824 if (CC == ISD::SETLT)
31825 std::swap(SelectOp1, SelectOp2);
31827 // The second operand of the select should be the negation of the first
31828 // operand, which is implemented as 0 - SelectOp1.
31829 if (!(SelectOp2.getOpcode() == ISD::SUB &&
31830 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
31831 SelectOp2.getOperand(1) == SelectOp1))
31834 // The first operand of SetCC is the first operand of the select, which is the
31835 // difference between the two input vectors.
31836 if (SetCC.getOperand(0) != SelectOp1)
31839 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
31841 if ((CC == ISD::SETLT) &&
31842 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
31843 SplatVal.isOneValue()) ||
31844 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
31847 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
31848 if ((CC == ISD::SETGT) &&
31849 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
31850 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
31853 // The first operand of the select is the difference between the two input
31854 // vectors.
31855 if (SelectOp1.getOpcode() != ISD::SUB)
31858 Op0 = SelectOp1.getOperand(0);
31859 Op1 = SelectOp1.getOperand(1);
31861 // Check if the operands of the sub are zero-extended from vectors of i8.
31862 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
31863 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
31864 Op1.getOpcode() != ISD::ZERO_EXTEND ||
31865 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
31871 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
31872 // to these zexts.
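// PSADBW computes the sum of absolute differences of the byte elements in
// each 64-bit lane, leaving one small integer partial sum per i64 lane of
// the result.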
31873 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
31874 const SDValue &Zext1, const SDLoc &DL,
31875 const X86Subtarget &Subtarget) {
31876 // Find the appropriate width for the PSADBW.
31877 EVT InVT = Zext0.getOperand(0).getValueType();
31878 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
31880 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
31881 // fill in the missing vector elements with 0.
31882 unsigned NumConcat = RegSize / InVT.getSizeInBits();
31883 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
31884 Ops[0] = Zext0.getOperand(0);
31885 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
31886 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31887 Ops[0] = Zext1.getOperand(0);
31888 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31890 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
31891 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
31892 ArrayRef<SDValue> Ops) {
31893 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
31894 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
31896 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
31897 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
31901 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
31902 // PHMINPOSUW.
31903 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
31904 const X86Subtarget &Subtarget) {
31905 // Bail without SSE41.
31906 if (!Subtarget.hasSSE41())
31909 EVT ExtractVT = Extract->getValueType(0);
31910 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
31913 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
31915 SDValue Src = matchBinOpReduction(
31916 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
31920 EVT SrcVT = Src.getValueType();
31921 EVT SrcSVT = SrcVT.getScalarType();
31922 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
31926 SDValue MinPos = Src;
31928 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
31929 while (SrcVT.getSizeInBits() > 128) {
31930 unsigned NumElts = SrcVT.getVectorNumElements();
31931 unsigned NumSubElts = NumElts / 2;
31932 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
31933 unsigned SubSizeInBits = SrcVT.getSizeInBits();
31934 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
31935 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
31936 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
31938 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
31939 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
31940 "Unexpected value type");
31942 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
31943 // to flip the value accordingly.
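// For example (sketch), SMAX becomes a UMIN over the values XORed with the
// signed-max pattern, and UMAX becomes a UMIN over the bitwise-NOT (XOR with
// all-ones); applying the same XOR afterwards recovers the original value.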
31945 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
31946 if (BinOp == ISD::SMAX)
31947 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
31948 else if (BinOp == ISD::SMIN)
31949 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
31950 else if (BinOp == ISD::UMAX)
31951 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
31954 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
31956 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
31957 // shuffling each upper element down and inserting zeros. This means that the
31958 // v16i8 UMIN will leave the upper element as zero, performing zero-extension
31959 // ready for the PHMINPOS.
31960 if (ExtractVT == MVT::i8) {
31961 SDValue Upper = DAG.getVectorShuffle(
31962 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
31963 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
31964 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
31967 // Perform the PHMINPOS on a v8i16 vector.
31968 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
31969 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
31970 MinPos = DAG.getBitcast(SrcVT, MinPos);
31973 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
31975 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
31976 DAG.getIntPtrConstant(0, DL));
31979 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
31980 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
31982 const X86Subtarget &Subtarget) {
31983 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
31984 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
31987 EVT ExtractVT = Extract->getValueType(0);
31988 unsigned BitWidth = ExtractVT.getSizeInBits();
31989 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
31990 ExtractVT != MVT::i8)
31993 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
31994 unsigned BinOp = 0;
31995 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
31999 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
32000 // which we can't support here for now.
32001 if (Match.getScalarValueSizeInBits() != BitWidth)
32004 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
32005 unsigned MatchSizeInBits = Match.getValueSizeInBits();
32006 if (!(MatchSizeInBits == 128 ||
32007 (MatchSizeInBits == 256 &&
32008 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
32011 // Don't bother performing this for 2-element vectors.
32012 if (Match.getValueType().getVectorNumElements() <= 2)
32015 // Check that we are extracting a reduction of all sign bits.
32016 if (DAG.ComputeNumSignBits(Match) != BitWidth)
32019 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
32021 if (64 == BitWidth || 32 == BitWidth)
32022 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
32023 MatchSizeInBits / BitWidth);
32025 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
32028 ISD::CondCode CondCode;
32029 if (BinOp == ISD::OR) {
32030 // any_of -> MOVMSK != 0
32031 CompareBits = APInt::getNullValue(32);
32032 CondCode = ISD::CondCode::SETNE;
32034 // all_of -> MOVMSK == ((1 << NumElts) - 1)
32035 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
32036 CondCode = ISD::CondCode::SETEQ;
32039 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
32041 unsigned ResWidth = std::max(BitWidth, 32u);
32042 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
32044 SDValue Zero = DAG.getConstant(0, DL, ResVT);
32045 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
32046 SDValue Res = DAG.getBitcast(MaskVT, Match);
32047 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
32048 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
32049 Ones, Zero, CondCode);
32050 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
32053 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
32054 const X86Subtarget &Subtarget) {
32055 // PSADBW is only supported on SSE2 and up.
32056 if (!Subtarget.hasSSE2())
32059 // Verify that the type we're extracting from is an integer type wider than i16.
32060 EVT VT = Extract->getOperand(0).getValueType();
32061 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
32064 unsigned RegSize = 128;
32065 if (Subtarget.useBWIRegs())
32067 else if (Subtarget.hasAVX())
32070 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
32071 // TODO: We should be able to handle larger vectors by splitting them before
32072 // feeding them into several SADs, and then reducing over those.
32073 if (RegSize / VT.getVectorNumElements() < 8)
32076 // Match shuffle + add pyramid.
32077 unsigned BinOp = 0;
32078 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
32080 // The operand is expected to be zero extended from i8
32081 // (verified in detectZextAbsDiff).
32082 // In order to convert to i64 and above, an additional any/zero/sign
32083 // extend is expected.
32084 // The zero extend from 32 bits has no mathematical effect on the result.
32085 // Also, the sign extend is effectively a zero extend
32086 // (it extends the sign bit, which is zero).
32087 // So it is correct to skip the sign/zero extend instruction.
32088 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
32089 Root.getOpcode() == ISD::ZERO_EXTEND ||
32090 Root.getOpcode() == ISD::ANY_EXTEND))
32091 Root = Root.getOperand(0);
32093 // If there was a match, we want Root to be a select that is the root of an
32094 // abs-diff pattern.
32095 if (!Root || (Root.getOpcode() != ISD::VSELECT))
32098 // Check whether we have an abs-diff pattern feeding into the select.
32099 SDValue Zext0, Zext1;
32100 if (!detectZextAbsDiff(Root, Zext0, Zext1))
32103 // Create the SAD instruction.
32105 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
32107 // If the original vector was wider than 8 elements, sum over the results
32108 // in the SAD vector.
32109 unsigned Stages = Log2_32(VT.getVectorNumElements());
32110 MVT SadVT = SAD.getSimpleValueType();
32112 unsigned SadElems = SadVT.getVectorNumElements();
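// PSADBW already reduces each group of 8 byte differences into a 64-bit partial sum,
// so only Stages - 3 further halving steps remain; each step shuffles the upper half
// of the remaining partial sums down and adds it onto the lower half.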
32114 for(unsigned i = Stages - 3; i > 0; --i) {
32115 SmallVector<int, 16> Mask(SadElems, -1);
32116 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
32117 Mask[j] = MaskEnd + j;
32120 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
32121 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
32125 MVT Type = Extract->getSimpleValueType(0);
32126 unsigned TypeSizeInBits = Type.getSizeInBits();
32127 // Return the lowest TypeSizeInBits bits.
32128 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
32129 SAD = DAG.getBitcast(ResVT, SAD);
32130 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
32131 Extract->getOperand(1));
32134 // Attempt to peek through a target shuffle and extract the scalar from the source.
32136 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
32137 TargetLowering::DAGCombinerInfo &DCI,
32138 const X86Subtarget &Subtarget) {
32139 if (DCI.isBeforeLegalizeOps())
32142 SDValue Src = N->getOperand(0);
32143 SDValue Idx = N->getOperand(1);
32145 EVT VT = N->getValueType(0);
32146 EVT SrcVT = Src.getValueType();
32147 EVT SrcSVT = SrcVT.getVectorElementType();
32148 unsigned NumSrcElts = SrcVT.getVectorNumElements();
32150 // Don't attempt this for boolean mask vectors or unknown extraction indices.
32151 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
32154 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
32155 if (X86ISD::VBROADCAST == Src.getOpcode() &&
32156 Src.getOperand(0).getValueType() == VT)
32157 return Src.getOperand(0);
32159 // Resolve the target shuffle inputs and mask.
32160 SmallVector<int, 16> Mask;
32161 SmallVector<SDValue, 2> Ops;
32162 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
32165 // Attempt to narrow/widen the shuffle mask to the correct size.
32166 if (Mask.size() != NumSrcElts) {
32167 if ((NumSrcElts % Mask.size()) == 0) {
32168 SmallVector<int, 16> ScaledMask;
32169 int Scale = NumSrcElts / Mask.size();
32170 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
32171 Mask = std::move(ScaledMask);
32172 } else if ((Mask.size() % NumSrcElts) == 0) {
32173 SmallVector<int, 16> WidenedMask;
32174 while (Mask.size() > NumSrcElts &&
32175 canWidenShuffleElements(Mask, WidenedMask))
32176 Mask = std::move(WidenedMask);
32177 // TODO - investigate support for wider shuffle masks with known upper
32178 // undef/zero elements for implicit zero-extension.
32182 // Check if narrowing/widening failed.
32183 if (Mask.size() != NumSrcElts)
32186 int SrcIdx = Mask[N->getConstantOperandVal(1)];
32189 // If the shuffle source element is undef/zero then we can just accept it.
32190 if (SrcIdx == SM_SentinelUndef)
32191 return DAG.getUNDEF(VT);
32193 if (SrcIdx == SM_SentinelZero)
32194 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
32195 : DAG.getConstant(0, dl, VT);
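// The combined shuffle may draw from multiple inputs: the quotient of SrcIdx by the
// mask size picks which input vector in Ops supplies the element, and the remainder
// is the element index within that input.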
32197 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
32198 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
32199 SrcIdx = SrcIdx % Mask.size();
32201 // We can only extract other elements from 128-bit vectors, and only in certain
32202 // circumstances, depending on the SSE level.
32203 // TODO: Investigate using extract_subvector for larger vectors.
32204 // TODO: Investigate float/double extraction if it will be just stored.
32205 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
32206 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
32207 assert(SrcSVT == VT && "Unexpected extraction type");
32208 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
32209 DAG.getIntPtrConstant(SrcIdx, dl));
32212 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
32213 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
32214 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
32215 "Unexpected extraction type");
32216 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
32217 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
32218 DAG.getIntPtrConstant(SrcIdx, dl));
32219 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
32225 /// Detect vector gather/scatter index generation and convert it from being a
32226 /// bunch of shuffles and extracts into a somewhat faster sequence.
32227 /// For i686, the best sequence is apparently storing the value and loading
32228 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
32229 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
32230 TargetLowering::DAGCombinerInfo &DCI,
32231 const X86Subtarget &Subtarget) {
32232 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
32235 // TODO - Remove this once we can handle the implicit zero-extension of
32236 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
32237 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
32238 // combineBasicSADPattern.
32239 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
32242 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
32245 SDValue InputVector = N->getOperand(0);
32246 SDValue EltIdx = N->getOperand(1);
32248 EVT SrcVT = InputVector.getValueType();
32249 EVT VT = N->getValueType(0);
32250 SDLoc dl(InputVector);
32252 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
32253 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32254 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
32255 SDValue MMXSrc = InputVector.getOperand(0);
32257 // The bitcast source is a direct mmx result.
32258 if (MMXSrc.getValueType() == MVT::x86mmx)
32259 return DAG.getBitcast(VT, InputVector);
32262 // Detect mmx to i32 conversion through a v2i32 elt extract.
32263 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32264 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
32265 SDValue MMXSrc = InputVector.getOperand(0);
32267 // The bitcast source is a direct mmx result.
32268 if (MMXSrc.getValueType() == MVT::x86mmx)
32269 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
32272 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
32273 isa<ConstantSDNode>(EltIdx) &&
32274 isa<ConstantSDNode>(InputVector.getOperand(0))) {
32275 uint64_t ExtractedElt = N->getConstantOperandVal(1);
32276 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
32277 const APInt &InputValue = InputC->getAPIntValue();
32278 uint64_t Res = InputValue[ExtractedElt];
32279 return DAG.getConstant(Res, dl, MVT::i1);
32282 // Check whether this extract is the root of a sum of absolute differences
32283 // pattern. This has to be done here because we really want it to happen
32284 // pre-legalization.
32285 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
32288 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
32289 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
32292 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
32293 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
32299 /// If a vector select has an operand that is -1 or 0, try to simplify the
32300 /// select to a bitwise logic operation.
32301 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
32303 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
32304 TargetLowering::DAGCombinerInfo &DCI,
32305 const X86Subtarget &Subtarget) {
32306 SDValue Cond = N->getOperand(0);
32307 SDValue LHS = N->getOperand(1);
32308 SDValue RHS = N->getOperand(2);
32309 EVT VT = LHS.getValueType();
32310 EVT CondVT = Cond.getValueType();
32312 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32314 if (N->getOpcode() != ISD::VSELECT)
32317 assert(CondVT.isVector() && "Vector select expects a vector selector!");
32319 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
32320 // Check if the first operand is all zeros and Cond type is vXi1.
32321 // This situation only applies to AVX512.
32322 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
32323 CondVT.getVectorElementType() == MVT::i1) {
32324 // Invert the cond to not(cond) : xor(op,allones)=not(op)
32325 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
32326 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
32327 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
32330 // To use the condition operand as a bitwise mask, it must have elements that
32331 // are the same size as the select elements. I.e., the condition operand must
32332 // have already been promoted from the IR select condition type <N x i1>.
32333 // Don't check if the types themselves are equal because that excludes
32334 // vector floating-point selects.
32335 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
32338 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
32339 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
32341 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
32343 if (!TValIsAllOnes && !FValIsAllZeros &&
32344 // Check if the selector will be produced by CMPP*/PCMP*.
32345 Cond.getOpcode() == ISD::SETCC &&
32346 // Check if SETCC has already been promoted.
32347 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
32349 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
32351 if (TValIsAllZeros || FValIsAllOnes) {
32352 SDValue CC = Cond.getOperand(2);
32353 ISD::CondCode NewCC =
32354 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
32355 Cond.getOperand(0).getValueType().isInteger());
32356 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
32358 std::swap(LHS, RHS);
32359 TValIsAllOnes = FValIsAllOnes;
32360 FValIsAllZeros = TValIsAllZeros;
32364 // Cond value must be 'sign splat' to be converted to a logical op.
32365 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
32368 // vselect Cond, 111..., 000... -> Cond
32369 if (TValIsAllOnes && FValIsAllZeros)
32370 return DAG.getBitcast(VT, Cond);
32372 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
32375 // vselect Cond, 111..., X -> or Cond, X
32376 if (TValIsAllOnes) {
32377 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
32378 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
32379 return DAG.getBitcast(VT, Or);
32382 // vselect Cond, X, 000... -> and Cond, X
32383 if (FValIsAllZeros) {
32384 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
32385 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
32386 return DAG.getBitcast(VT, And);
32389 // vselect Cond, 000..., X -> andn Cond, X
32390 if (TValIsAllZeros) {
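// Build the ANDNP in a vXi64 type so one pattern covers any element type; the
// result is bitcast back to VT below.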
32391 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
32392 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
32393 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
32394 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
32395 return DAG.getBitcast(VT, AndN);
32401 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
32402 SDValue Cond = N->getOperand(0);
32403 SDValue LHS = N->getOperand(1);
32404 SDValue RHS = N->getOperand(2);
32407 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
32408 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
32409 if (!TrueC || !FalseC)
32412 // Don't do this for crazy integer types.
32413 EVT VT = N->getValueType(0);
32414 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32417 // We're going to use the condition bit in math or logic ops. We could allow
32418 // this with a wider condition value (post-legalization it becomes an i8),
32419 // but if nothing is creating selects that late, it doesn't matter.
32420 if (Cond.getValueType() != MVT::i1)
32423 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
32424 // 3, 5, or 9 with i32/i64, so those get transformed too.
32425 // TODO: For constants that overflow or do not differ by power-of-2 or small
32426 // multiplier, convert to 'and' + 'add'.
32427 const APInt &TrueVal = TrueC->getAPIntValue();
32428 const APInt &FalseVal = FalseC->getAPIntValue();
32430 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
32434 APInt AbsDiff = Diff.abs();
32435 if (AbsDiff.isPowerOf2() ||
32436 ((VT == MVT::i32 || VT == MVT::i64) &&
32437 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
32439 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
32440 // of the condition can usually be folded into a compare predicate, but even
32441 // without that, the sequence should be cheaper than a CMOV alternative.
32442 if (TrueVal.slt(FalseVal)) {
32443 Cond = DAG.getNOT(DL, Cond, MVT::i1);
32444 std::swap(TrueC, FalseC);
32447 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
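// For example, select Cond, 7, 3 --> (zext(Cond) * 4) + 3.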
32448 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
32450 // Multiply condition by the difference if non-one.
32451 if (!AbsDiff.isOneValue())
32452 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
32454 // Add the base if non-zero.
32455 if (!FalseC->isNullValue())
32456 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
32464 /// If this is a *dynamic* select (non-constant condition) and we can match
32465 /// this node with one of the variable blend instructions, restructure the
32466 /// condition so that blends can use the high (sign) bit of each element.
32467 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
32468 TargetLowering::DAGCombinerInfo &DCI,
32469 const X86Subtarget &Subtarget) {
32470 SDValue Cond = N->getOperand(0);
32471 if (N->getOpcode() != ISD::VSELECT ||
32472 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
32475 // Don't optimize before the condition has been transformed to a legal type
32476 // and don't ever optimize vector selects that map to AVX512 mask-registers.
32477 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32478 if (BitWidth < 8 || BitWidth > 64)
32481 // We can only handle the cases where VSELECT is directly legal on the
32482 // subtarget. We custom lower VSELECT nodes with constant conditions and
32483 // this makes it hard to see whether a dynamic VSELECT will correctly
32484 // lower, so we both check the operation's status and explicitly handle the
32485 // cases where a *dynamic* blend will fail even though a constant-condition
32486 // blend could be custom lowered.
32487 // FIXME: We should find a better way to handle this class of problems.
32488 // Potentially, we should combine constant-condition vselect nodes
32489 // pre-legalization into shuffles and not mark as many types as custom lowered.
32491 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32492 EVT VT = N->getValueType(0);
32493 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32495 // FIXME: We don't support i16-element blends currently. We could and
32496 // should support them by making *all* the bits in the condition be set
32497 // rather than just the high bit and using an i8-element blend.
32498 if (VT.getVectorElementType() == MVT::i16)
32500 // Dynamic blending was only available from SSE4.1 onward.
32501 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32503 // Byte blends are only available in AVX2
32504 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32506 // There are no 512-bit blend instructions that use sign bits.
32507 if (VT.is512BitVector())
32510 // TODO: Add other opcodes eventually lowered into BLEND.
32511 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32513 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
32516 APInt DemandedMask(APInt::getSignMask(BitWidth));
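// Only the sign bit of each condition element is consumed by the blend, so ask
// SimplifyDemandedBits to simplify everything feeding the condition based on that.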
32518 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32519 !DCI.isBeforeLegalizeOps());
32520 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
32523 // If we changed the computation somewhere in the DAG, this change will
32524 // affect all users of Cond. Update all the nodes so that we do not use
32525 // the generic VSELECT anymore. Otherwise, we may perform wrong
32526 // optimizations as we messed with the actual expectation for the vector boolean values.
32528 for (SDNode *U : Cond->uses()) {
32529 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
32530 Cond, U->getOperand(1), U->getOperand(2));
32531 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32533 DCI.CommitTargetLoweringOpt(TLO);
32534 return SDValue(N, 0);
32537 /// Do target-specific dag combines on SELECT and VSELECT nodes.
32538 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
32539 TargetLowering::DAGCombinerInfo &DCI,
32540 const X86Subtarget &Subtarget) {
32542 SDValue Cond = N->getOperand(0);
32543 // Get the LHS/RHS of the select.
32544 SDValue LHS = N->getOperand(1);
32545 SDValue RHS = N->getOperand(2);
32546 EVT VT = LHS.getValueType();
32547 EVT CondVT = Cond.getValueType();
32548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32550 // Convert vselects with constant condition into shuffles.
32551 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
32552 DCI.isBeforeLegalizeOps()) {
32553 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
32554 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
32555 SDValue CondElt = Cond->getOperand(i);
32557 // Arbitrarily choose from the 2nd operand if the select condition element is undef.
32559 // TODO: Can we do better by matching patterns such as even/odd?
32560 if (CondElt.isUndef() || isNullConstant(CondElt))
32564 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
32567 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
32568 // instructions match the semantics of the common C idiom x<y?x:y but not
32569 // x<=y?x:y, because of how they handle negative zero (which can be
32570 // ignored in unsafe-math mode).
32571 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
32572 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
32573 VT != MVT::f80 && VT != MVT::f128 &&
32574 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
32575 (Subtarget.hasSSE2() ||
32576 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
32577 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32579 unsigned Opcode = 0;
32580 // Check for x CC y ? x : y.
32581 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32582 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32586 // Converting this to a min would handle NaNs incorrectly, and swapping
32587 // the operands would cause it to handle comparisons between positive
32588 // and negative zero incorrectly.
32589 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32590 if (!DAG.getTarget().Options.UnsafeFPMath &&
32591 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
32593 std::swap(LHS, RHS);
32595 Opcode = X86ISD::FMIN;
32598 // Converting this to a min would handle comparisons between positive
32599 // and negative zero incorrectly.
32600 if (!DAG.getTarget().Options.UnsafeFPMath &&
32601 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
32603 Opcode = X86ISD::FMIN;
32606 // Converting this to a min would handle both negative zeros and NaNs
32607 // incorrectly, but we can swap the operands to fix both.
32608 std::swap(LHS, RHS);
32613 Opcode = X86ISD::FMIN;
32617 // Converting this to a max would handle comparisons between positive
32618 // and negative zero incorrectly.
32619 if (!DAG.getTarget().Options.UnsafeFPMath &&
32620 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
32622 Opcode = X86ISD::FMAX;
32625 // Converting this to a max would handle NaNs incorrectly, and swapping
32626 // the operands would cause it to handle comparisons between positive
32627 // and negative zero incorrectly.
32628 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32629 if (!DAG.getTarget().Options.UnsafeFPMath &&
32630 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
32632 std::swap(LHS, RHS);
32634 Opcode = X86ISD::FMAX;
32637 // Converting this to a max would handle both negative zeros and NaNs
32638 // incorrectly, but we can swap the operands to fix both.
32639 std::swap(LHS, RHS);
32644 Opcode = X86ISD::FMAX;
32647 // Check for x CC y ? y : x -- a min/max with reversed arms.
32648 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32649 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32653 // Converting this to a min would handle comparisons between positive
32654 // and negative zero incorrectly, and swapping the operands would
32655 // cause it to handle NaNs incorrectly.
32656 if (!DAG.getTarget().Options.UnsafeFPMath &&
32657 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
32658 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32660 std::swap(LHS, RHS);
32662 Opcode = X86ISD::FMIN;
32665 // Converting this to a min would handle NaNs incorrectly.
32666 if (!DAG.getTarget().Options.UnsafeFPMath &&
32667 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32669 Opcode = X86ISD::FMIN;
32672 // Converting this to a min would handle both negative zeros and NaNs
32673 // incorrectly, but we can swap the operands to fix both.
32674 std::swap(LHS, RHS);
32679 Opcode = X86ISD::FMIN;
32683 // Converting this to a max would handle NaNs incorrectly.
32684 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32686 Opcode = X86ISD::FMAX;
32689 // Converting this to a max would handle comparisons between positive
32690 // and negative zero incorrectly, and swapping the operands would
32691 // cause it to handle NaNs incorrectly.
32692 if (!DAG.getTarget().Options.UnsafeFPMath &&
32693 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
32694 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32696 std::swap(LHS, RHS);
32698 Opcode = X86ISD::FMAX;
32701 // Converting this to a max would handle both negative zeros and NaNs
32702 // incorrectly, but we can swap the operands to fix both.
32703 std::swap(LHS, RHS);
32708 Opcode = X86ISD::FMAX;
32714 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
32717 // Some mask scalar intrinsics rely on checking if only one bit is set
32718 // and implement it in C code like this:
32719 // A[0] = (U & 1) ? A[0] : W[0];
32720 // This creates some redundant instructions that break pattern matching.
32721 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
32722 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
32723 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
32724 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32725 SDValue AndNode = Cond.getOperand(0);
32726 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
32727 isNullConstant(Cond.getOperand(1)) &&
32728 isOneConstant(AndNode.getOperand(1))) {
32729 // LHS and RHS swapped due to
32730 // setcc outputting 1 when AND resulted in 0 and vice versa.
32731 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
32732 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
32736 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32737 // lowering on KNL. In this case we convert it to
32738 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
32739 // The same applies to all vectors of i8 and i16 without BWI.
32740 // Make sure we extend these even before type legalization gets a chance to
32741 // split wide vectors.
32742 // Since SKX, these selects have a proper lowering.
32743 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32744 CondVT.getVectorElementType() == MVT::i1 &&
32745 VT.getVectorNumElements() > 4 &&
32746 (VT.getVectorElementType() == MVT::i8 ||
32747 VT.getVectorElementType() == MVT::i16)) {
32748 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32749 DCI.AddToWorklist(Cond.getNode());
32750 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32753 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32756 // Canonicalize max and min:
32757 // (x > y) ? x : y -> (x >= y) ? x : y
32758 // (x < y) ? x : y -> (x <= y) ? x : y
32759 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32760 // the need for an extra compare
32761 // against zero. e.g.
32762 // (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
32764 // testl %edi, %edi
32766 // cmovgl %edi, %eax
32770 // cmovsl %eax, %edi
32771 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32772 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32773 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32774 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32779 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32780 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32781 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32782 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32787 // Early exit check
32788 if (!TLI.isTypeLegal(VT))
32791 // Match VSELECTs into subs with unsigned saturation.
32792 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32793 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32794 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32795 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32796 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32798 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32799 // left side invert the predicate to simplify logic below.
32801 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32803 CC = ISD::getSetCCInverse(CC, true);
32804 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
32808 if (Other.getNode() && Other->getNumOperands() == 2 &&
32809 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32810 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32811 SDValue CondRHS = Cond->getOperand(1);
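// The builder below is invoked through SplitOpsAndApply, which splits 256-bit
// operations into 128-bit halves on targets without AVX2 integer ops.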
32813 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32814 ArrayRef<SDValue> Ops) {
32815 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32818 // Look for a general sub with unsigned saturation first.
32819 // x >= y ? x-y : 0 --> subus x, y
32820 // x > y ? x-y : 0 --> subus x, y
32821 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32822 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32823 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32826 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32827 if (isa<BuildVectorSDNode>(CondRHS)) {
32828 // If the RHS is a constant we have to reverse the const
32829 // canonicalization.
32830 // x > C-1 ? x+-C : 0 --> subus x, C
32831 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32832 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32834 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32835 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32836 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32837 DAG.getConstant(0, DL, VT), OpRHS);
32838 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32842 // Another special case: If C was a sign bit, the sub has been
32843 // canonicalized into a xor.
32844 // FIXME: Would it be better to use computeKnownBits to determine
32845 // whether it's safe to decanonicalize the xor?
32846 // x s< 0 ? x^C : 0 --> subus x, C
32847 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32848 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32849 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32850 OpRHSConst->getAPIntValue().isSignMask()) {
32851 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32852 // Note that we have to rebuild the RHS constant here to ensure we
32853 // don't rely on particular values of undef lanes.
32854 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32861 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32864 if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
32867 // Custom action for SELECT MMX
32868 if (VT == MVT::x86mmx) {
32869 LHS = DAG.getBitcast(MVT::i64, LHS);
32870 RHS = DAG.getBitcast(MVT::i64, RHS);
32871 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32872 return DAG.getBitcast(VT, newSelect);
32879 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
32881 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
32882 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32883 /// Note that this is only legal for some op/cc combinations.
32884 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32886 const X86Subtarget &Subtarget) {
32887 // This combine only operates on CMP-like nodes.
32888 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32889 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32892 // Can't replace the cmp if it has more uses than the one we're looking at.
32893 // FIXME: We would like to be able to handle this, but would need to make sure
32894 // all uses were updated.
32895 if (!Cmp.hasOneUse())
32898 // This only applies to variations of the common case:
32899 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32900 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32901 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32902 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32903 // Using the proper condcodes (see below), overflow is checked for.
32905 // FIXME: We can generalize both constraints:
32906 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32908 // if the result is compared.
32910 SDValue CmpLHS = Cmp.getOperand(0);
32911 SDValue CmpRHS = Cmp.getOperand(1);
32913 if (!CmpLHS.hasOneUse())
32916 unsigned Opc = CmpLHS.getOpcode();
32917 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32920 SDValue OpRHS = CmpLHS.getOperand(2);
32921 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
32925 APInt Addend = OpRHSC->getAPIntValue();
32926 if (Opc == ISD::ATOMIC_LOAD_SUB)
32929 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
32933 APInt Comparison = CmpRHSC->getAPIntValue();
32935 // If the addend is the negation of the comparison value, then we can do
32936 // a full comparison by emitting the atomic arithmetic as a locked sub.
32937 if (Comparison == -Addend) {
32938 // The CC is fine, but we need to rewrite the LHS of the comparison as an atomic sub.
32940 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
32941 auto AtomicSub = DAG.getAtomic(
32942 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
32943 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
32944 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
32945 AN->getMemOperand());
32946 // If the comparison uses the CF flag we can't use INC/DEC instructions.
32947 bool NeedCF = false;
32950 case X86::COND_A: case X86::COND_AE:
32951 case X86::COND_B: case X86::COND_BE:
32955 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
32956 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32957 DAG.getUNDEF(CmpLHS.getValueType()));
32958 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32962 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
32964 if (!Comparison.isNullValue())
32967 if (CC == X86::COND_S && Addend == 1)
32969 else if (CC == X86::COND_NS && Addend == 1)
32971 else if (CC == X86::COND_G && Addend == -1)
32973 else if (CC == X86::COND_LE && Addend == -1)
32978 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
32979 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32980 DAG.getUNDEF(CmpLHS.getValueType()));
32981 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32985 // Check whether a boolean test is testing a boolean value generated by
32986 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
32989 // Simplify the following patterns:
32990 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
32991 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
32992 // to (Op EFLAGS Cond)
32994 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
32995 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
32996 // to (Op EFLAGS !Cond)
32998 // where Op could be BRCOND or CMOV.
33000 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
33001 // This combine only operates on CMP-like nodes.
33002 if (!(Cmp.getOpcode() == X86ISD::CMP ||
33003 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
33006 // Quit if not used as a boolean value.
33007 if (CC != X86::COND_E && CC != X86::COND_NE)
33010 // Check CMP operands. One of them should be 0 or 1 and the other should be
33011 // an SetCC or extended from it.
33012 SDValue Op1 = Cmp.getOperand(0);
33013 SDValue Op2 = Cmp.getOperand(1);
33016 const ConstantSDNode* C = nullptr;
33017 bool needOppositeCond = (CC == X86::COND_E);
33018 bool checkAgainstTrue = false; // Is it a comparison against 1?
33020 if ((C = dyn_cast<ConstantSDNode>(Op1)))
33022 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
33024 else // Quit if all operands are not constants.
33027 if (C->getZExtValue() == 1) {
33028 needOppositeCond = !needOppositeCond;
33029 checkAgainstTrue = true;
33030 } else if (C->getZExtValue() != 0)
33031 // Quit if the constant is neither 0 nor 1.
33034 bool truncatedToBoolWithAnd = false;
33035 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
33036 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
33037 SetCC.getOpcode() == ISD::TRUNCATE ||
33038 SetCC.getOpcode() == ISD::AND) {
33039 if (SetCC.getOpcode() == ISD::AND) {
33041 if (isOneConstant(SetCC.getOperand(0)))
33043 if (isOneConstant(SetCC.getOperand(1)))
33047 SetCC = SetCC.getOperand(OpIdx);
33048 truncatedToBoolWithAnd = true;
33050 SetCC = SetCC.getOperand(0);
33053 switch (SetCC.getOpcode()) {
33054 case X86ISD::SETCC_CARRY:
33055 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
33056 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
33057 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
33058 // truncated to i1 using 'and'.
33059 if (checkAgainstTrue && !truncatedToBoolWithAnd)
33061 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
33062 "Invalid use of SETCC_CARRY!");
33064 case X86ISD::SETCC:
33065 // Set the condition code or opposite one if necessary.
33066 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
33067 if (needOppositeCond)
33068 CC = X86::GetOppositeBranchCondition(CC);
33069 return SetCC.getOperand(1);
33070 case X86ISD::CMOV: {
33071 // Check whether the false/true values are canonical, i.e. 0 or 1.
33072 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
33073 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
33074 // Quit if true value is not a constant.
33077 // Quit if false value is not a constant.
33079 SDValue Op = SetCC.getOperand(0);
33080 // Skip 'zext' or 'trunc' node.
33081 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
33082 Op.getOpcode() == ISD::TRUNCATE)
33083 Op = Op.getOperand(0);
33084 // A special case for rdrand/rdseed, where 0 is set if the false cond is needed.
33086 if ((Op.getOpcode() != X86ISD::RDRAND &&
33087 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
33090 // Quit if false value is not the constant 0 or 1.
33091 bool FValIsFalse = true;
33092 if (FVal && FVal->getZExtValue() != 0) {
33093 if (FVal->getZExtValue() != 1)
33095 // If FVal is 1, opposite cond is needed.
33096 needOppositeCond = !needOppositeCond;
33097 FValIsFalse = false;
33099 // Quit if TVal is not the constant opposite of FVal.
33100 if (FValIsFalse && TVal->getZExtValue() != 1)
33102 if (!FValIsFalse && TVal->getZExtValue() != 0)
33104 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
33105 if (needOppositeCond)
33106 CC = X86::GetOppositeBranchCondition(CC);
33107 return SetCC.getOperand(3);
33114 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
33116 /// (X86or (X86setcc) (X86setcc))
33117 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
33118 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
33119 X86::CondCode &CC1, SDValue &Flags,
33121 if (Cond->getOpcode() == X86ISD::CMP) {
33122 if (!isNullConstant(Cond->getOperand(1)))
33125 Cond = Cond->getOperand(0);
33130 SDValue SetCC0, SetCC1;
33131 switch (Cond->getOpcode()) {
33132 default: return false;
33139 SetCC0 = Cond->getOperand(0);
33140 SetCC1 = Cond->getOperand(1);
33144 // Make sure we have SETCC nodes, using the same flags value.
33145 if (SetCC0.getOpcode() != X86ISD::SETCC ||
33146 SetCC1.getOpcode() != X86ISD::SETCC ||
33147 SetCC0->getOperand(1) != SetCC1->getOperand(1))
33150 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
33151 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
33152 Flags = SetCC0->getOperand(1);
33156 // When legalizing carry, we create carries via add X, -1
33157 // If that comes from an actual carry, via setcc, we use the carry directly.
33159 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
33160 if (EFLAGS.getOpcode() == X86ISD::ADD) {
33161 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
33162 SDValue Carry = EFLAGS.getOperand(0);
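// Peel off value-preserving wrappers (truncate, zero/sign/any extend, and AND with 1)
// to find the SETCC that actually produced the carry bit.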
33163 while (Carry.getOpcode() == ISD::TRUNCATE ||
33164 Carry.getOpcode() == ISD::ZERO_EXTEND ||
33165 Carry.getOpcode() == ISD::SIGN_EXTEND ||
33166 Carry.getOpcode() == ISD::ANY_EXTEND ||
33167 (Carry.getOpcode() == ISD::AND &&
33168 isOneConstant(Carry.getOperand(1))))
33169 Carry = Carry.getOperand(0);
33170 if (Carry.getOpcode() == X86ISD::SETCC ||
33171 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
33172 if (Carry.getConstantOperandVal(0) == X86::COND_B)
33173 return Carry.getOperand(1);
33181 /// Optimize an EFLAGS definition used according to the condition code \p CC
33182 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
33183 /// uses of chain values.
33184 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
33186 const X86Subtarget &Subtarget) {
33187 if (CC == X86::COND_B)
33188 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
33191 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
33193 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
33196 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
33197 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
33198 TargetLowering::DAGCombinerInfo &DCI,
33199 const X86Subtarget &Subtarget) {
33202 SDValue FalseOp = N->getOperand(0);
33203 SDValue TrueOp = N->getOperand(1);
33204 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
33205 SDValue Cond = N->getOperand(3);
33207 // Try to simplify the EFLAGS and condition code operands.
33208 // We can't always do this as FCMOV only supports a subset of X86 cond.
33209 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
33210 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
33211 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
33213 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33217 // If this is a select between two integer constants, try to do some
33218 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
33220 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
33221 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
33222 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
33223 // larger than FalseC (the false value).
33224 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
33225 CC = X86::GetOppositeBranchCondition(CC);
33226 std::swap(TrueC, FalseC);
33227 std::swap(TrueOp, FalseOp);
33230 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
33231 // This is efficient for any integer data type (including i8/i16) and shift amount.
33233 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
33234 Cond = getSETCC(CC, Cond, DL, DAG);
33236 // Zero extend the condition if needed.
33237 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
33239 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
33240 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
33241 DAG.getConstant(ShAmt, DL, MVT::i8));
33245 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
33246 // for any integer data type, including i8/i16.
33247 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
33248 Cond = getSETCC(CC, Cond, DL, DAG);
33250 // Zero extend the condition if needed.
33251 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
33252 FalseC->getValueType(0), Cond);
33253 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33254 SDValue(FalseC, 0));
33258 // Optimize cases that will turn into an LEA instruction. This requires
33259 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
33260 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
33261 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
33262 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
33264 bool isFastMultiplier = false;
33266 switch ((unsigned char)Diff) {
33268 case 1: // result = add base, cond
33269 case 2: // result = lea base( , cond*2)
33270 case 3: // result = lea base(cond, cond*2)
33271 case 4: // result = lea base( , cond*4)
33272 case 5: // result = lea base(cond, cond*4)
33273 case 8: // result = lea base( , cond*8)
33274 case 9: // result = lea base(cond, cond*8)
33275 isFastMultiplier = true;
33280 if (isFastMultiplier) {
33281 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
33282 Cond = getSETCC(CC, Cond, DL ,DAG);
33283 // Zero extend the condition if needed.
33284 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
33286 // Scale the condition by the difference.
33288 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
33289 DAG.getConstant(Diff, DL, Cond.getValueType()));
33291 // Add the base if non-zero.
33292 if (FalseC->getAPIntValue() != 0)
33293 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33294 SDValue(FalseC, 0));
33301 // Handle these cases:
33302 // (select (x != c), e, c) -> select (x != c), e, x),
33303 // (select (x == c), c, e) -> select (x == c), x, e)
33304 // where c is an integer constant, and the "select" is the combination
33305 // of CMOV and CMP.
33307 // The rationale for this change is that the conditional-move from a constant
33308 // needs two instructions; however, conditional-move from a register needs
33309 // only one instruction.
33311 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
33312 // some instruction-combining opportunities. This opt needs to be
33313 // postponed as late as possible.
33315 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
33316 // the DCI.xxxx conditions are provided to postpone the optimization as
33317 // late as possible.
33319 ConstantSDNode *CmpAgainst = nullptr;
33320 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
33321 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
33322 !isa<ConstantSDNode>(Cond.getOperand(0))) {
33324 if (CC == X86::COND_NE &&
33325 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
33326 CC = X86::GetOppositeBranchCondition(CC);
33327 std::swap(TrueOp, FalseOp);
33330 if (CC == X86::COND_E &&
33331 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
33332 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
33333 DAG.getConstant(CC, DL, MVT::i8), Cond };
33334 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33339 // Fold and/or of setcc's to double CMOV:
33340 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
33341 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
33343 // This combine lets us generate:
33344 // cmovcc1 (jcc1 if we don't have CMOV)
33350 // cmovne (jne if we don't have CMOV)
33351 // When we can't use the CMOV instruction, it might increase branch mispredicts.
33353 // When we can use CMOV, or when there is no mispredict, this improves
33354 // throughput and reduces register pressure.
33356 if (CC == X86::COND_NE) {
33358 X86::CondCode CC0, CC1;
33360 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
33362 std::swap(FalseOp, TrueOp);
33363 CC0 = X86::GetOppositeBranchCondition(CC0);
33364 CC1 = X86::GetOppositeBranchCondition(CC1);
33367 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
33369 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
33370 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
33371 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33379 /// Different mul shrinking modes.
33380 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
33382 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
33383 EVT VT = N->getOperand(0).getValueType();
33384 if (VT.getScalarSizeInBits() != 32)
33387 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
33388 unsigned SignBits[2] = {1, 1};
33389 bool IsPositive[2] = {false, false};
33390 for (unsigned i = 0; i < 2; i++) {
33391 SDValue Opd = N->getOperand(i);
33393 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
33394 // compute signbits for it separately.
33395 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
33396 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
33398 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
33400 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
33405 IsPositive[i] = true;
33406 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
33407 // All the operands of BUILD_VECTOR need to be integer constants.
33408 // Find the smallest value range which all the operands belong to.
33410 IsPositive[i] = true;
33411 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
33412 if (SubOp.isUndef())
33414 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
33417 APInt IntVal = CN->getAPIntValue();
33418 if (IntVal.isNegative())
33419 IsPositive[i] = false;
33420 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
33423 SignBits[i] = DAG.ComputeNumSignBits(Opd);
33424 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
33425 IsPositive[i] = true;
33429 bool AllPositive = IsPositive[0] && IsPositive[1];
33430 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
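// A 32-bit element with N known sign bits fits in (33 - N) signed bits, so 25+ sign
// bits means it fits in i8 and 17+ means it fits in i16; when both operands are known
// non-negative, one fewer sign bit is enough for the unsigned (zext) modes.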
33431 // When ranges are from -128 ~ 127, use MULS8 mode.
33432 if (MinSignBits >= 25)
33434 // When ranges are from 0 ~ 255, use MULU8 mode.
33435 else if (AllPositive && MinSignBits >= 24)
33437 // When ranges are from -32768 ~ 32767, use MULS16 mode.
33438 else if (MinSignBits >= 17)
33440 // When ranges are from 0 ~ 65535, use MULU16 mode.
33441 else if (AllPositive && MinSignBits >= 16)
33448 /// When the operands of vector mul are extended from smaller size values,
33449 /// like i8 and i16, the type of mul may be shrunk to generate more
33450 /// efficient code. Two typical patterns are handled:
33452 /// %2 = sext/zext <N x i8> %1 to <N x i32>
33453 /// %4 = sext/zext <N x i8> %3 to <N x i32>
33454 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33455 /// %5 = mul <N x i32> %2, %4
33458 /// %2 = zext/sext <N x i16> %1 to <N x i32>
33459 /// %4 = zext/sext <N x i16> %3 to <N x i32>
33460 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33461 /// %5 = mul <N x i32> %2, %4
33463 /// There are four mul shrinking modes:
33464 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
33465 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
33466 /// generate pmullw+sext32 for it (MULS8 mode).
33467 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
33468 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
33469 /// generate pmullw+zext32 for it (MULU8 mode).
33470 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
33471 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
33472 /// generate pmullw+pmulhw for it (MULS16 mode).
33473 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
33474 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
33475 /// generate pmullw+pmulhuw for it (MULU16 mode).
33476 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
33477 const X86Subtarget &Subtarget) {
33478 // Check for legality
33479 // pmullw/pmulhw are not supported by SSE.
33480 if (!Subtarget.hasSSE2())
33483 // Check for profitability
33484 // pmulld is supported since SSE41. It is better to use pmulld
33485 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
33487 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
33488 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
33492 if (!canReduceVMulWidth(N, DAG, Mode))
33496 SDValue N0 = N->getOperand(0);
33497 SDValue N1 = N->getOperand(1);
33498 EVT VT = N->getOperand(0).getValueType();
33499 unsigned NumElts = VT.getVectorNumElements();
33500 if ((NumElts % 2) != 0)
33503 unsigned RegSize = 128;
33504 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
33505 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
33507 // Shrink the operands of mul.
33508 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
33509 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
33511 if (NumElts >= OpsVT.getVectorNumElements()) {
33512 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
33513 // lower part is needed.
33514 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
33515 if (Mode == MULU8 || Mode == MULS8) {
33516 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
33519 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
33520 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
33521 // the higher part is also needed.
33522 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33523 ReducedVT, NewN0, NewN1);
33525 // Repack the lower part and higher part result of mul into a wider result.
33527 // Generate shuffle functioning as punpcklwd.
33528 SmallVector<int, 16> ShuffleMask(NumElts);
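// For NumElts == 8 this builds {0, 8, 1, 9, 2, 10, 3, 11}, interleaving the low and
// high 16-bit halves of each 32-bit product.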
33529 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33530 ShuffleMask[2 * i] = i;
33531 ShuffleMask[2 * i + 1] = i + NumElts;
33534 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33535 ResLo = DAG.getBitcast(ResVT, ResLo);
33536 // Generate shuffle functioning as punpckhwd.
33537 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33538 ShuffleMask[2 * i] = i + NumElts / 2;
33539 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33542 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33543 ResHi = DAG.getBitcast(ResVT, ResHi);
33544 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
33547 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
33548 // to legalize the mul explicitly because implicit legalization for type
33549 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
33550 // instructions which will not exist when we explicitly legalize it by
33551 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
33552 // <4 x i16> undef).
33554 // Legalize the operands of mul.
33555 // FIXME: We may be able to handle non-concatenated vectors by insertion.
33556 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
33557 if ((RegSize % ReducedSizeInBits) != 0)
33560 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
33561 DAG.getUNDEF(ReducedVT));
33563 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33565 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33567 if (Mode == MULU8 || Mode == MULS8) {
33568 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
33570 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33572 // convert the type of mul result to VT.
33573 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33574 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
33575 : ISD::SIGN_EXTEND_VECTOR_INREG,
33577 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33578 DAG.getIntPtrConstant(0, DL));
33580 // Generate the lower part (pmullw) and the higher part (pmulhw/pmulhuw) of the mul.
33581 // For MULU16/MULS16, both parts are needed.
33582 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33583 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33584 OpsVT, NewN0, NewN1);
33586 // Repack the lower part and higher part result of mul into a wider
33587 // result. Make sure the type of mul result is VT.
33588 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33589 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
33590 Res = DAG.getBitcast(ResVT, Res);
33591 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33592 DAG.getIntPtrConstant(0, DL));
33597 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
33598 EVT VT, const SDLoc &DL) {
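// Decompose multiplies by selected constants (11, 13, 14, 19, 21, 22, ...) into short
// sequences of LEA-encodable multiplies (by 3, 5 or 9), shifts, and adds/subs.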
33600 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
33601 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33602 DAG.getConstant(Mult, DL, VT));
33603 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
33604 DAG.getConstant(Shift, DL, MVT::i8));
33605 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33610 auto combineMulMulAddOrSub = [&](bool isAdd) {
33611 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33612 DAG.getConstant(9, DL, VT));
33613 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
33614 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33623 // mul x, 11 => add ((shl (mul x, 5), 1), x)
33624 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
33626 // mul x, 21 => add ((shl (mul x, 5), 2), x)
33627 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
33629 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
33630 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33631 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
33633 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
33634 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
33636 // mul x, 13 => add ((shl (mul x, 3), 2), x)
33637 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
33639 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
33640 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
33642 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
33643 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33644 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
33646 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
33647 return combineMulMulAddOrSub(/*isAdd*/ false);
33649 // mul x, 28 => add ((mul (mul x, 9), 3), x)
33650 return combineMulMulAddOrSub(/*isAdd*/ true);
33652 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
33653 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33654 combineMulMulAddOrSub(/*isAdd*/ true));
33656 // mul x, 30 => sub (sub ((shl x, 5), x), x)
33657 return DAG.getNode(
33659 DAG.getNode(ISD::SUB, DL, VT,
33660 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33661 DAG.getConstant(5, DL, MVT::i8)),
33668 // If the upper 17 bits of each element are zero then we can use PMADDWD,
33669 // which is always at least as quick as PMULLD, except on KNL.
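// For example (an illustrative sketch, not an exhaustive description): given
// v4i32 operands where bits 15..31 of every element are known zero, e.g.
//   (mul (and X, 0x7FFF), (and Y, 0x7FFF)),
// the operands can be bitcast to v8i16 and the multiply emitted as a single
// (X86ISD::VPMADDWD X, Y): every odd i16 lane is zero, so each per-lane
// pair-sum degenerates to the plain 32-bit product of the even lanes.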
33670 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
33671 const X86Subtarget &Subtarget) {
33672 if (!Subtarget.hasSSE2())
33675 if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
33678 EVT VT = N->getValueType(0);
33680 // Only support vXi32 vectors.
33681 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
33684 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
33685 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
33686 if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
33689 SDValue N0 = N->getOperand(0);
33690 SDValue N1 = N->getOperand(1);
33691 APInt Mask17 = APInt::getHighBitsSet(32, 17);
33692 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
33693 !DAG.MaskedValueIsZero(N0, Mask17))
33696 // Use SplitOpsAndApply to handle AVX splitting.
33697 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33698 ArrayRef<SDValue> Ops) {
33699 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
33700 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
33702 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
33703 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
33707 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
33708 const X86Subtarget &Subtarget) {
33709 if (!Subtarget.hasSSE2())
33712 EVT VT = N->getValueType(0);
33714 // Only support vXi64 vectors.
33715 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
33716 !DAG.getTargetLoweringInfo().isTypeLegal(VT))
33719 SDValue N0 = N->getOperand(0);
33720 SDValue N1 = N->getOperand(1);
33722 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
33723 // 32 bits. We can lower with this if the sign bits stretch that far.
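// For instance (illustrative only): if both v2i64 operands are sign extended
// from i32 values, ComputeNumSignBits reports more than 32 sign bits for each,
// and the full 64-bit product of the low 32 bits produced by PMULDQ equals the
// result of the original 64-bit multiply, so no widening sequence is needed.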
33724 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
33725 DAG.ComputeNumSignBits(N1) > 32) {
33726 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33727 ArrayRef<SDValue> Ops) {
33728 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
33730 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33731 PMULDQBuilder, /*CheckBWI*/false);
33734 // If the upper bits are zero we can use a single pmuludq.
33735 APInt Mask = APInt::getHighBitsSet(64, 32);
33736 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
33737 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33738 ArrayRef<SDValue> Ops) {
33739 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
33741 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33742 PMULUDQBuilder, /*CheckBWI*/false);
33748 /// Optimize a single multiply with constant into two operations in order to
33749 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
33750 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
33751 TargetLowering::DAGCombinerInfo &DCI,
33752 const X86Subtarget &Subtarget) {
33753 EVT VT = N->getValueType(0);
33755 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
33758 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
33761 if (DCI.isBeforeLegalize() && VT.isVector())
33762 return reduceVMULWidth(N, DAG, Subtarget);
33764 if (!MulConstantOptimization)
33766 // An imul is usually smaller than the alternative sequence.
33767 if (DAG.getMachineFunction().getFunction().optForMinSize())
33770 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
33773 if (VT != MVT::i64 && VT != MVT::i32)
33776 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
33779 uint64_t MulAmt = C->getZExtValue();
33780 if (isPowerOf2_64(MulAmt))
33784 if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
33785 return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33788 uint64_t MulAmt1 = 0;
33789 uint64_t MulAmt2 = 0;
33790 if ((MulAmt % 9) == 0) {
33792 MulAmt2 = MulAmt / 9;
33793 } else if ((MulAmt % 5) == 0) {
33795 MulAmt2 = MulAmt / 5;
33796 } else if ((MulAmt % 3) == 0) {
33798 MulAmt2 = MulAmt / 3;
33803 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
33805 if (isPowerOf2_64(MulAmt2) &&
33806 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
33807 // If the second multiplier is pow2, issue it first. We want the multiply
33808 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
33809 // is an add.
33810 std::swap(MulAmt1, MulAmt2);
33812 if (isPowerOf2_64(MulAmt1))
33813 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33814 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
33816 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33817 DAG.getConstant(MulAmt1, DL, VT));
33819 if (isPowerOf2_64(MulAmt2))
33820 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
33821 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
33823 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
33824 DAG.getConstant(MulAmt2, DL, VT));
33825 } else if (!Subtarget.slowLEA())
33826 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
33829 assert(MulAmt != 0 &&
33830 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
33831 "Both cases that could cause potential overflows should have "
33832 "already been handled.");
33833 int64_t SignMulAmt = C->getSExtValue();
33834 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
33835 (SignMulAmt != -INT64_MAX)) {
33836 int NumSign = SignMulAmt > 0 ? 1 : -1;
33837 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
33838 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
33839 if (IsPowerOf2_64PlusOne) {
33840 // (mul x, 2^N + 1) => (add (shl x, N), x)
33841 NewMul = DAG.getNode(
33842 ISD::ADD, DL, VT, N->getOperand(0),
33843 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33844 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
33846 } else if (IsPowerOf2_64MinusOne) {
33847 // (mul x, 2^N - 1) => (sub (shl x, N), x)
33848 NewMul = DAG.getNode(
33850 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33851 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
33855 // To negate, subtract the number from zero
33856 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
33858 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
33863 // Do not add new nodes to DAG combiner worklist.
33864 DCI.CombineTo(N, NewMul, false);
33869 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
33870 SDValue N0 = N->getOperand(0);
33871 SDValue N1 = N->getOperand(1);
33872 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
33873 EVT VT = N0.getValueType();
33875 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
33876 // since the result of setcc_c is all zeros or all ones.
33877 if (VT.isInteger() && !VT.isVector() &&
33878 N1C && N0.getOpcode() == ISD::AND &&
33879 N0.getOperand(1).getOpcode() == ISD::Constant) {
33880 SDValue N00 = N0.getOperand(0);
33881 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
33882 Mask <<= N1C->getAPIntValue();
33883 bool MaskOK = false;
33884 // We can handle cases concerning bit-widening nodes containing setcc_c if
33885 // we carefully interrogate the mask to make sure we are semantics
33886 // preserving.
33887 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
33888 // of the underlying setcc_c operation if the setcc_c was zero extended.
33889 // Consider the following example:
33890 // zext(setcc_c) -> i32 0x0000FFFF
33891 // c1 -> i32 0x0000FFFF
33892 // c2 -> i32 0x00000001
33893 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
33894 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
33895 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33897 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
33898 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33900 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
33901 N00.getOpcode() == ISD::ANY_EXTEND) &&
33902 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33903 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
33905 if (MaskOK && Mask != 0) {
33907 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
33911 // Hardware support for vector shifts is sparse which makes us scalarize the
33912 // vector operations in many cases. Also, on sandybridge ADD is faster than
33913 // shl.
33914 // (shl V, 1) -> add V,V
33915 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
33916 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
33917 assert(N0.getValueType().isVector() && "Invalid vector shift type");
33918 // We shift all of the values by one. In many cases we do not have
33919 // hardware support for this operation. This is better expressed as an ADD
33920 // of two values.
33921 if (N1SplatC->getAPIntValue() == 1)
33922 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
33928 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
33929 SDValue N0 = N->getOperand(0);
33930 SDValue N1 = N->getOperand(1);
33931 EVT VT = N0.getValueType();
33932 unsigned Size = VT.getSizeInBits();
33934 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
33935 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
33936 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
33937 // depending on sign of (SarConst - [56,48,32,24,16])
33939 // sexts in X86 are MOVs. The MOVs have the same code size
33940 // as above SHIFTs (only SHIFT on 1 has lower code size).
33941 // However the MOVs have 2 advantages to a SHIFT:
33942 // 1. MOVs can write to a register that differs from source
33943 // 2. MOVs accept memory operands
33945 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
33946 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
33947 N0.getOperand(1).getOpcode() != ISD::Constant)
33950 SDValue N00 = N0.getOperand(0);
33951 SDValue N01 = N0.getOperand(1);
33952 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
33953 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
33954 EVT CVT = N1.getValueType();
33956 if (SarConst.isNegative())
33959 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
33960 unsigned ShiftSize = SVT.getSizeInBits();
33961 // Skip types without a corresponding sext/zext and ShlConst values that
33962 // are not one of [56,48,32,24,16].
33963 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
33967 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
33968 SarConst = SarConst - (Size - ShiftSize);
33971 else if (SarConst.isNegative())
33972 return DAG.getNode(ISD::SHL, DL, VT, NN,
33973 DAG.getConstant(-SarConst, DL, CVT));
33975 return DAG.getNode(ISD::SRA, DL, VT, NN,
33976 DAG.getConstant(SarConst, DL, CVT));
33981 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
33982 TargetLowering::DAGCombinerInfo &DCI) {
33983 SDValue N0 = N->getOperand(0);
33984 SDValue N1 = N->getOperand(1);
33985 EVT VT = N0.getValueType();
33987 // Only do this on the last DAG combine as it can interfere with other
33989 if (!DCI.isAfterLegalizeDAG())
33992 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
33993 // TODO: This is a generic DAG combine that became an x86-only combine to
33994 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
33995 // and-not ('andn').
33996 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
33999 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
34000 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
34001 if (!ShiftC || !AndC)
34004 // If we can shrink the constant mask below 8-bits or 32-bits, then this
34005 // transform should reduce code size. It may also enable secondary transforms
34006 // from improved known-bits analysis or instruction selection.
34007 APInt MaskVal = AndC->getAPIntValue();
34009 // If this can be matched by a zero extend, don't optimize.
34010 if (MaskVal.isMask()) {
34011 unsigned TO = MaskVal.countTrailingOnes();
34012 if (TO >= 8 && isPowerOf2_32(TO))
34016 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
34017 unsigned OldMaskSize = MaskVal.getMinSignedBits();
34018 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
34019 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
34020 (OldMaskSize > 32 && NewMaskSize <= 32)) {
34021 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
34023 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
34024 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
34025 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
34030 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
34031 TargetLowering::DAGCombinerInfo &DCI,
34032 const X86Subtarget &Subtarget) {
34033 if (N->getOpcode() == ISD::SHL)
34034 if (SDValue V = combineShiftLeft(N, DAG))
34037 if (N->getOpcode() == ISD::SRA)
34038 if (SDValue V = combineShiftRightArithmetic(N, DAG))
34041 if (N->getOpcode() == ISD::SRL)
34042 if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
34048 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
34049 TargetLowering::DAGCombinerInfo &DCI,
34050 const X86Subtarget &Subtarget) {
34051 unsigned Opcode = N->getOpcode();
34052 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
34053 "Unexpected pack opcode");
34055 EVT VT = N->getValueType(0);
34056 SDValue N0 = N->getOperand(0);
34057 SDValue N1 = N->getOperand(1);
34058 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
34059 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
34060 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
34061 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
34062 "Unexpected PACKSS/PACKUS input type");
34064 // Constant Folding.
34065 APInt UndefElts0, UndefElts1;
34066 SmallVector<APInt, 32> EltBits0, EltBits1;
34067 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
34068 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
34069 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
34070 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
34071 unsigned NumLanes = VT.getSizeInBits() / 128;
34072 unsigned NumDstElts = VT.getVectorNumElements();
34073 unsigned NumSrcElts = NumDstElts / 2;
34074 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
34075 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
34076 bool IsSigned = (X86ISD::PACKSS == Opcode);
34078 APInt Undefs(NumDstElts, 0);
34079 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
34080 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
34081 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
34082 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
34083 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
34084 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
34086 if (UndefElts[SrcIdx]) {
34087 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
34091 APInt &Val = EltBits[SrcIdx];
34093 // PACKSS: Truncate signed value with signed saturation.
34094 // Source values less than dst minint are saturated to minint.
34095 // Source values greater than dst maxint are saturated to maxint.
34096 if (Val.isSignedIntN(DstBitsPerElt))
34097 Val = Val.trunc(DstBitsPerElt);
34098 else if (Val.isNegative())
34099 Val = APInt::getSignedMinValue(DstBitsPerElt);
34101 Val = APInt::getSignedMaxValue(DstBitsPerElt);
34103 // PACKUS: Truncate signed value with unsigned saturation.
34104 // Source values less than zero are saturated to zero.
34105 // Source values greater than dst maxuint are saturated to maxuint.
34106 if (Val.isIntN(DstBitsPerElt))
34107 Val = Val.trunc(DstBitsPerElt);
34108 else if (Val.isNegative())
34109 Val = APInt::getNullValue(DstBitsPerElt);
34111 Val = APInt::getAllOnesValue(DstBitsPerElt);
34113 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
34117 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
34120 // Attempt to combine as shuffle.
34123 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34124 /*HasVarMask*/ false, DAG, Subtarget)) {
34125 DCI.CombineTo(N, Res);
34132 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
34133 TargetLowering::DAGCombinerInfo &DCI,
34134 const X86Subtarget &Subtarget) {
34135 unsigned Opcode = N->getOpcode();
34136 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
34137 X86ISD::VSRLI == Opcode) &&
34138 "Unexpected shift opcode");
34139 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
34140 EVT VT = N->getValueType(0);
34141 SDValue N0 = N->getOperand(0);
34142 SDValue N1 = N->getOperand(1);
34143 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
34144 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
34145 "Unexpected value type");
34147 // Out of range logical bit shifts are guaranteed to be zero.
34148 // Out of range arithmetic bit shifts splat the sign bit.
34149 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
34150 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
34152 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34154 ShiftVal = NumBitsPerElt - 1;
34157 // Shift N0 by zero -> N0.
34161 // Shift zero -> zero.
34162 if (ISD::isBuildVectorAllZeros(N0.getNode()))
34163 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34165 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
34166 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
34167 // TODO - support other sra opcodes as needed.
34168 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
34169 N0.getOpcode() == X86ISD::VSRAI)
34170 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
34172 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
34173 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
34174 N1 == N0.getOperand(1)) {
34175 SDValue N00 = N0.getOperand(0);
34176 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
34177 if (ShiftVal.ult(NumSignBits))
34181 // We can decode 'whole byte' logical bit shifts as shuffles.
34182 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
34184 if (SDValue Res = combineX86ShufflesRecursively(
34185 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34186 /*HasVarMask*/ false, DAG, Subtarget)) {
34187 DCI.CombineTo(N, Res);
34192 // Constant Folding.
34194 SmallVector<APInt, 32> EltBits;
34195 if (N->isOnlyUserOf(N0.getNode()) &&
34196 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
34197 assert(EltBits.size() == VT.getVectorNumElements() &&
34198 "Unexpected shift value type");
34199 unsigned ShiftImm = ShiftVal.getZExtValue();
34200 for (APInt &Elt : EltBits) {
34201 if (X86ISD::VSHLI == Opcode)
34203 else if (X86ISD::VSRAI == Opcode)
34204 Elt.ashrInPlace(ShiftImm);
34206 Elt.lshrInPlace(ShiftImm);
34208 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
34214 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
34215 TargetLowering::DAGCombinerInfo &DCI,
34216 const X86Subtarget &Subtarget) {
34218 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
34219 (N->getOpcode() == X86ISD::PINSRW &&
34220 N->getValueType(0) == MVT::v8i16)) &&
34221 "Unexpected vector insertion");
34223 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
34226 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34227 /*HasVarMask*/ false, DAG, Subtarget)) {
34228 DCI.CombineTo(N, Res);
34235 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
34236 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
34237 /// OR -> CMPNEQSS.
34238 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
34239 TargetLowering::DAGCombinerInfo &DCI,
34240 const X86Subtarget &Subtarget) {
34243 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
34244 // we're requiring SSE2 for both.
34245 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
34246 SDValue N0 = N->getOperand(0);
34247 SDValue N1 = N->getOperand(1);
34248 SDValue CMP0 = N0->getOperand(1);
34249 SDValue CMP1 = N1->getOperand(1);
34252 // The SETCCs should both refer to the same CMP.
34253 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
34256 SDValue CMP00 = CMP0->getOperand(0);
34257 SDValue CMP01 = CMP0->getOperand(1);
34258 EVT VT = CMP00.getValueType();
34260 if (VT == MVT::f32 || VT == MVT::f64) {
34261 bool ExpectingFlags = false;
34262 // Check for any users that want flags:
34263 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
34264 !ExpectingFlags && UI != UE; ++UI)
34265 switch (UI->getOpcode()) {
34270 ExpectingFlags = true;
34272 case ISD::CopyToReg:
34273 case ISD::SIGN_EXTEND:
34274 case ISD::ZERO_EXTEND:
34275 case ISD::ANY_EXTEND:
34279 if (!ExpectingFlags) {
34280 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
34281 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
34283 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
34284 X86::CondCode tmp = cc0;
34289 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
34290 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
34291 // FIXME: need symbolic constants for these magic numbers.
34292 // See X86ATTInstPrinter.cpp:printSSECC().
34293 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
34294 if (Subtarget.hasAVX512()) {
34296 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
34297 DAG.getConstant(x86cc, DL, MVT::i8));
34298 // Need to fill with zeros to ensure the bitcast will produce zeroes
34299 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
34300 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
34301 DAG.getConstant(0, DL, MVT::v16i1),
34302 FSetCC, DAG.getIntPtrConstant(0, DL));
34303 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
34304 N->getSimpleValueType(0));
34306 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
34307 CMP00.getValueType(), CMP00, CMP01,
34308 DAG.getConstant(x86cc, DL,
34311 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
34312 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
34314 if (is64BitFP && !Subtarget.is64Bit()) {
34315 // On a 32-bit target, we cannot bitcast the 64-bit float to a
34316 // 64-bit integer, since that's not a legal type. Since
34317 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
34318 // bits, but can do this little dance to extract the lowest 32 bits
34319 // and work with those going forward.
34320 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
34322 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
34323 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
34324 Vector32, DAG.getIntPtrConstant(0, DL));
34328 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
34329 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
34330 DAG.getConstant(1, DL, IntVT));
34331 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
34333 return OneBitOfTruth;
34341 // Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
34342 static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
34343 if (N->getOpcode() != ISD::AND)
34346 SDValue N0 = N->getOperand(0);
34347 SDValue N1 = N->getOperand(1);
34348 if (N0.getOpcode() == ISD::XOR &&
34349 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
34350 X = N0.getOperand(0);
34354 if (N1.getOpcode() == ISD::XOR &&
34355 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
34356 X = N1.getOperand(0);
34364 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
34365 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
34366 assert(N->getOpcode() == ISD::AND);
34368 EVT VT = N->getValueType(0);
34369 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
34373 if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
34374 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
34379 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
34380 // register. In most cases we actually compare or select YMM-sized registers
34381 // and mixing the two types creates horrible code. This method optimizes
34382 // some of the transition sequences.
34383 // Even with AVX-512 this is still useful for removing casts around logical
34384 // operations on vXi1 mask types.
34385 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
34386 const X86Subtarget &Subtarget) {
34387 EVT VT = N->getValueType(0);
34388 assert(VT.isVector() && "Expected vector type");
34390 assert((N->getOpcode() == ISD::ANY_EXTEND ||
34391 N->getOpcode() == ISD::ZERO_EXTEND ||
34392 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
34394 SDValue Narrow = N->getOperand(0);
34395 EVT NarrowVT = Narrow.getValueType();
34397 if (Narrow->getOpcode() != ISD::XOR &&
34398 Narrow->getOpcode() != ISD::AND &&
34399 Narrow->getOpcode() != ISD::OR)
34402 SDValue N0 = Narrow->getOperand(0);
34403 SDValue N1 = Narrow->getOperand(1);
34406 // The Left side has to be a trunc.
34407 if (N0.getOpcode() != ISD::TRUNCATE)
34410 // The type of the truncated inputs.
34411 if (N0->getOperand(0).getValueType() != VT)
34414 // The right side has to be a 'trunc' or a constant vector.
34415 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
34416 N1.getOperand(0).getValueType() == VT;
34418 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
34421 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34423 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
34426 // Set N0 and N1 to hold the inputs to the new wide operation.
34427 N0 = N0->getOperand(0);
34429 N1 = N1->getOperand(0);
34431 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
34433 // Generate the wide operation.
34434 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
34435 unsigned Opcode = N->getOpcode();
34437 default: llvm_unreachable("Unexpected opcode");
34438 case ISD::ANY_EXTEND:
34440 case ISD::ZERO_EXTEND:
34441 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
34442 case ISD::SIGN_EXTEND:
34443 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
34444 Op, DAG.getValueType(NarrowVT));
34448 /// If both input operands of a logic op are being cast from floating point
34449 /// types, try to convert this into a floating point logic node to avoid
34450 /// unnecessary moves from SSE to integer registers.
34451 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
34452 const X86Subtarget &Subtarget) {
34453 unsigned FPOpcode = ISD::DELETED_NODE;
34454 if (N->getOpcode() == ISD::AND)
34455 FPOpcode = X86ISD::FAND;
34456 else if (N->getOpcode() == ISD::OR)
34457 FPOpcode = X86ISD::FOR;
34458 else if (N->getOpcode() == ISD::XOR)
34459 FPOpcode = X86ISD::FXOR;
34461 assert(FPOpcode != ISD::DELETED_NODE &&
34462 "Unexpected input node for FP logic conversion");
34464 EVT VT = N->getValueType(0);
34465 SDValue N0 = N->getOperand(0);
34466 SDValue N1 = N->getOperand(1);
34468 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
34469 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
34470 (Subtarget.hasSSE2() && VT == MVT::i64))) {
34471 SDValue N00 = N0.getOperand(0);
34472 SDValue N10 = N1.getOperand(0);
34473 EVT N00Type = N00.getValueType();
34474 EVT N10Type = N10.getValueType();
34475 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
34476 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
34477 return DAG.getBitcast(VT, FPLogic);
34483 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
34484 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
34485 /// with a shift-right to eliminate loading the vector constant mask value.
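/// A small sketch of the intent (illustrative types): if X is a v4i32 value
/// whose elements are each all-ones or all-zeros (32 sign bits), then
///   (and X, <1,1,1,1>)  -->  (X86ISD::VSRLI X, 31)
/// which avoids materializing the splat-of-1 constant from memory.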
34486 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
34487 const X86Subtarget &Subtarget) {
34488 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
34489 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
34490 EVT VT0 = Op0.getValueType();
34491 EVT VT1 = Op1.getValueType();
34493 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
34497 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
34498 !SplatVal.isMask())
34501 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
34504 unsigned EltBitWidth = VT0.getScalarSizeInBits();
34505 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
34509 unsigned ShiftVal = SplatVal.countTrailingOnes();
34510 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
34511 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
34512 return DAG.getBitcast(N->getValueType(0), Shift);
34515 // Get the index node from the lowered DAG of a GEP IR instruction with one
34516 // indexing dimension.
34517 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
34518 if (Ld->isIndexed())
34521 SDValue Base = Ld->getBasePtr();
34523 if (Base.getOpcode() != ISD::ADD)
34526 SDValue ShiftedIndex = Base.getOperand(0);
34528 if (ShiftedIndex.getOpcode() != ISD::SHL)
34531 return ShiftedIndex.getOperand(0);
34535 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
34536 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
34537 switch (VT.getSizeInBits()) {
34538 default: return false;
34539 case 64: return Subtarget.is64Bit();
34540 case 32: return true;
34546 // This function recognizes cases where the X86 bzhi instruction can replace an
34547 // 'and-load' sequence.
34548 // In case of loading an integer value from an array of constants which is defined
34549 // as follows:
34550 //
34551 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
34552 //
34553 // then applying a bitwise and on the result with another input.
34554 // It's equivalent to performing bzhi (zero high bits) on the input, with the
34555 // same index of the load.
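// For example (an illustrative sketch in C, with hypothetical names):
//   static const unsigned Mask[32] = {0x0, 0x1, 0x3, 0x7, /* (1 << i) - 1 */};
//   unsigned f(unsigned X, unsigned I) { return X & Mask[I]; }
// The and+load of Mask[I] can instead be emitted as a single BZHI of X with
// index I, which zeroes all bits of X from position I upward.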
34556 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
34557 const X86Subtarget &Subtarget) {
34558 MVT VT = Node->getSimpleValueType(0);
34561 // Check if subtarget has BZHI instruction for the node's type
34562 if (!hasBZHI(Subtarget, VT))
34565 // Try matching the pattern for both operands.
34566 for (unsigned i = 0; i < 2; i++) {
34567 SDValue N = Node->getOperand(i);
34568 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
34570 // continue if the operand is not a load instruction
34574 const Value *MemOp = Ld->getMemOperand()->getValue();
34579 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
34580 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
34581 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
34583 Constant *Init = GV->getInitializer();
34584 Type *Ty = Init->getType();
34585 if (!isa<ConstantDataArray>(Init) ||
34586 !Ty->getArrayElementType()->isIntegerTy() ||
34587 Ty->getArrayElementType()->getScalarSizeInBits() !=
34588 VT.getSizeInBits() ||
34589 Ty->getArrayNumElements() >
34590 Ty->getArrayElementType()->getScalarSizeInBits())
34593 // Check if the array's constant elements are suitable to our case.
34594 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
34595 bool ConstantsMatch = true;
34596 for (uint64_t j = 0; j < ArrayElementCount; j++) {
34597 ConstantInt *Elem =
34598 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
34599 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
34600 ConstantsMatch = false;
34604 if (!ConstantsMatch)
34607 // Do the transformation (for a 32-bit type):
34608 // from: (and (load arr[idx]), inp)
34609 // to:   (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
34610 // which will then be matched as a single bzhi instruction.
34611 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
34612 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
34614 // Get the Node which indexes into the array.
34615 SDValue Index = getIndexFromUnindexedLoad(Ld);
34618 Index = DAG.getZExtOrTrunc(Index, dl, VT);
34620 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
34622 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
34623 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
34625 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
34633 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
34634 TargetLowering::DAGCombinerInfo &DCI,
34635 const X86Subtarget &Subtarget) {
34636 EVT VT = N->getValueType(0);
34638 // If this is SSE1 only convert to FAND to avoid scalarization.
34639 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34640 return DAG.getBitcast(
34641 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
34642 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34643 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34646 // Use a 32-bit and+zext if upper bits known zero.
34647 if (VT == MVT::i64 && Subtarget.is64Bit() &&
34648 !isa<ConstantSDNode>(N->getOperand(1))) {
34649 APInt HiMask = APInt::getHighBitsSet(64, 32);
34650 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
34651 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
34653 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
34654 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
34655 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
34656 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
34660 if (DCI.isBeforeLegalizeOps())
34663 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34666 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34669 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
34672 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
34675 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
34678 // Attempt to recursively combine a bitmask AND with shuffles.
34679 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34681 if (SDValue Res = combineX86ShufflesRecursively(
34682 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34683 /*HasVarMask*/ false, DAG, Subtarget)) {
34684 DCI.CombineTo(N, Res);
34689 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
34690 if ((VT.getScalarSizeInBits() % 8) == 0 &&
34691 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34692 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
34693 SDValue BitMask = N->getOperand(1);
34694 SDValue SrcVec = N->getOperand(0).getOperand(0);
34695 EVT SrcVecVT = SrcVec.getValueType();
34697 // Check that the constant bitmask masks whole bytes.
34699 SmallVector<APInt, 64> EltBits;
34700 if (VT == SrcVecVT.getScalarType() &&
34701 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
34702 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
34703 llvm::all_of(EltBits, [](APInt M) {
34704 return M.isNullValue() || M.isAllOnesValue();
34706 unsigned NumElts = SrcVecVT.getVectorNumElements();
34707 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
34708 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
34710 // Create a root shuffle mask from the byte mask and the extracted index.
34711 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
34712 for (unsigned i = 0; i != Scale; ++i) {
34715 int VecIdx = Scale * Idx + i;
34716 ShuffleMask[VecIdx] =
34717 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
34720 if (SDValue Shuffle = combineX86ShufflesRecursively(
34721 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
34722 /*HasVarMask*/ false, DAG, Subtarget))
34723 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
34724 N->getOperand(0).getOperand(1));
34731 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
34732 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
34733 if (N->getOpcode() != ISD::OR)
34736 SDValue N0 = N->getOperand(0);
34737 SDValue N1 = N->getOperand(1);
34739 // Canonicalize AND to LHS.
34740 if (N1.getOpcode() == ISD::AND)
34743 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
34744 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
34747 Mask = N1.getOperand(0);
34748 X = N1.getOperand(1);
34750 // Check to see if the mask appeared in both the AND and ANDNP.
34751 if (N0.getOperand(0) == Mask)
34752 Y = N0.getOperand(1);
34753 else if (N0.getOperand(1) == Mask)
34754 Y = N0.getOperand(0);
34758 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the
34759 // ANDNP combine allows other combines to happen that prevent matching.
34763 // Try to fold:
34764 // (or (and (m, y), (pandn m, x)))
34765 // into:
34766 // (vselect m, x, y)
34767 // As a special case, try to fold:
34768 // (or (and (m, (sub 0, x)), (pandn m, x)))
34769 // into:
34770 // (sub (xor X, M), M)
34771 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
34772 const X86Subtarget &Subtarget) {
34773 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
34775 EVT VT = N->getValueType(0);
34776 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
34777 (VT.is256BitVector() && Subtarget.hasInt256())))
34780 SDValue X, Y, Mask;
34781 if (!matchLogicBlend(N, X, Y, Mask))
34784 // Validate that X, Y, and Mask are bitcasts, and see through them.
34785 Mask = peekThroughBitcasts(Mask);
34786 X = peekThroughBitcasts(X);
34787 Y = peekThroughBitcasts(Y);
34789 EVT MaskVT = Mask.getValueType();
34790 unsigned EltBits = MaskVT.getScalarSizeInBits();
34792 // TODO: Attempt to handle floating point cases as well?
34793 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
34798 // Try to match:
34799 // (or (and (M, (sub 0, X)), (pandn M, X)))
34800 // which is a special case of vselect:
34801 // (vselect M, (sub 0, X), X)
34802 // Per:
34803 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
34804 // We know that, if fNegate is 0 or 1:
34805 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
34807 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
34808 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
34809 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
34810 // This lets us transform our vselect to:
34811 // (add (xor X, M), (and M, 1))
34813 // (sub (xor X, M), M)
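// A quick numeric check of the identity (illustrative only), with 8-bit
// lanes: for X = 5 and M = 0xFF (all ones), (xor 5, 0xFF) = 0xFA = -6 and
// (sub -6, -1) = -5, i.e. the negated value; for M = 0, (xor 5, 0) = 5 and
// (sub 5, 0) = 5, i.e. X unchanged.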
34814 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
34815 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
34816 auto IsNegV = [](SDNode *N, SDValue V) {
34817 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
34818 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
34821 if (IsNegV(Y.getNode(), X))
34823 else if (IsNegV(X.getNode(), Y))
34827 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
34828 SDValue SubOp2 = Mask;
34830 // If the negate was on the false side of the select, then
34831 // the operands of the SUB need to be swapped. PR 27251.
34832 // This is because the pattern being matched above is
34833 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
34834 // but if the pattern matched was
34835 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
34836 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
34837 // pattern also needs to be a negation of the replacement pattern above.
34838 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
34839 // sub accomplishes the negation of the replacement pattern.
34841 std::swap(SubOp1, SubOp2);
34843 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
34844 return DAG.getBitcast(VT, Res);
34848 // PBLENDVB is only available on SSE 4.1.
34849 if (!Subtarget.hasSSE41())
34852 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
34854 X = DAG.getBitcast(BlendVT, X);
34855 Y = DAG.getBitcast(BlendVT, Y);
34856 Mask = DAG.getBitcast(BlendVT, Mask);
34857 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
34858 return DAG.getBitcast(VT, Mask);
34861 // Helper function for combineOrCmpEqZeroToCtlzSrl.
34862 // Transforms:
34863 // seteq(cmp x, 0)
34864 // into:
34865 // srl(ctlz x), log2(bitsize(x))
34866 // Input pattern is checked by caller.
34867 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
34868 SelectionDAG &DAG) {
34869 SDValue Cmp = Op.getOperand(1);
34870 EVT VT = Cmp.getOperand(0).getValueType();
34871 unsigned Log2b = Log2_32(VT.getSizeInBits());
34873 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
34874 // The result of the shift is true or false, and on X86, the 32-bit
34875 // encoding of shr and lzcnt is more desirable.
34876 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
34877 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
34878 DAG.getConstant(Log2b, dl, VT));
34879 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
34882 // Try to transform:
34883 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
34884 // into:
34885 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
34886 // Will also attempt to match more generic cases, eg:
34887 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
34888 // Only applies if the target supports the FastLZCNT feature.
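// Why this is sound (informal sketch for i32): ctlz(x) == 32 exactly when
// x == 0, and 32 is the only value in the ctlz range with bit 5 set, so
// srl(ctlz(x), 5) yields the same 0/1 result as setcc(eq, (cmp x, 0));
// or-ing several ctlz results before the shift therefore ors the setccs.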
34889 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
34890 TargetLowering::DAGCombinerInfo &DCI,
34891 const X86Subtarget &Subtarget) {
34892 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
34895 auto isORCandidate = [](SDValue N) {
34896 return (N->getOpcode() == ISD::OR && N->hasOneUse());
34899 // Check the zero extend is extending to 32-bit or more. The code generated by
34900 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
34901 // instructions to clear the upper bits.
34902 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
34903 !isORCandidate(N->getOperand(0)))
34906 // Check the node matches: setcc(eq, cmp 0)
34907 auto isSetCCCandidate = [](SDValue N) {
34908 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
34909 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
34910 N->getOperand(1).getOpcode() == X86ISD::CMP &&
34911 isNullConstant(N->getOperand(1).getOperand(1)) &&
34912 N->getOperand(1).getValueType().bitsGE(MVT::i32);
34915 SDNode *OR = N->getOperand(0).getNode();
34916 SDValue LHS = OR->getOperand(0);
34917 SDValue RHS = OR->getOperand(1);
34919 // Save nodes matching or(or, setcc(eq, cmp 0)).
34920 SmallVector<SDNode *, 2> ORNodes;
34921 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
34922 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
34923 ORNodes.push_back(OR);
34924 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
34925 LHS = OR->getOperand(0);
34926 RHS = OR->getOperand(1);
34929 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
34930 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
34931 !isORCandidate(SDValue(OR, 0)))
34934 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
34936 // or(srl(ctlz),srl(ctlz)).
34937 // The dag combiner can then fold it into:
34938 // srl(or(ctlz, ctlz)).
34939 EVT VT = OR->getValueType(0);
34940 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
34941 SDValue Ret, NewRHS;
34942 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
34943 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
34948 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
34949 while (ORNodes.size() > 0) {
34950 OR = ORNodes.pop_back_val();
34951 LHS = OR->getOperand(0);
34952 RHS = OR->getOperand(1);
34953 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
34954 if (RHS->getOpcode() == ISD::OR)
34955 std::swap(LHS, RHS);
34956 EVT VT = OR->getValueType(0);
34957 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
34960 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
34964 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
34969 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
34970 TargetLowering::DAGCombinerInfo &DCI,
34971 const X86Subtarget &Subtarget) {
34972 SDValue N0 = N->getOperand(0);
34973 SDValue N1 = N->getOperand(1);
34974 EVT VT = N->getValueType(0);
34976 // If this is SSE1 only convert to FOR to avoid scalarization.
34977 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34978 return DAG.getBitcast(MVT::v4i32,
34979 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
34980 DAG.getBitcast(MVT::v4f32, N0),
34981 DAG.getBitcast(MVT::v4f32, N1)));
34984 if (DCI.isBeforeLegalizeOps())
34987 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34990 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34993 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
34996 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
34999 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
35000 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
35002 // SHLD/SHRD instructions have lower register pressure, but on some
35003 // platforms they have higher latency than the equivalent
35004 // series of shifts/or that would otherwise be generated.
35005 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
35006 // have higher latencies and we are not optimizing for size.
35007 if (!OptForSize && Subtarget.isSHLDSlow())
35010 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
35012 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
35014 if (!N0.hasOneUse() || !N1.hasOneUse())
35017 SDValue ShAmt0 = N0.getOperand(1);
35018 if (ShAmt0.getValueType() != MVT::i8)
35020 SDValue ShAmt1 = N1.getOperand(1);
35021 if (ShAmt1.getValueType() != MVT::i8)
35023 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
35024 ShAmt0 = ShAmt0.getOperand(0);
35025 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
35026 ShAmt1 = ShAmt1.getOperand(0);
35029 unsigned Opc = X86ISD::SHLD;
35030 SDValue Op0 = N0.getOperand(0);
35031 SDValue Op1 = N1.getOperand(0);
35032 if (ShAmt0.getOpcode() == ISD::SUB ||
35033 ShAmt0.getOpcode() == ISD::XOR) {
35034 Opc = X86ISD::SHRD;
35035 std::swap(Op0, Op1);
35036 std::swap(ShAmt0, ShAmt1);
35039 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
35040 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
35041 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
35042 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
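// As a side note (informal): when X and Y are the same value, the first two
// patterns are simply rotates, e.g. for i32
//   (or (shl X, C), (srl X, (sub 32, C)))  -->  SHLD( X, X, C )  ==  ROL( X, C ).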
35043 unsigned Bits = VT.getSizeInBits();
35044 if (ShAmt1.getOpcode() == ISD::SUB) {
35045 SDValue Sum = ShAmt1.getOperand(0);
35046 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
35047 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
35048 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
35049 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
35050 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
35051 return DAG.getNode(Opc, DL, VT,
35053 DAG.getNode(ISD::TRUNCATE, DL,
35056 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
35057 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
35058 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
35059 return DAG.getNode(Opc, DL, VT,
35060 N0.getOperand(0), N1.getOperand(0),
35061 DAG.getNode(ISD::TRUNCATE, DL,
35063 } else if (ShAmt1.getOpcode() == ISD::XOR) {
35064 SDValue Mask = ShAmt1.getOperand(1);
35065 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
35066 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
35067 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
35068 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
35069 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
35070 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
35071 if (Op1.getOpcode() == InnerShift &&
35072 isa<ConstantSDNode>(Op1.getOperand(1)) &&
35073 Op1.getConstantOperandVal(1) == 1) {
35074 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35075 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35077 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
35078 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
35079 Op1.getOperand(0) == Op1.getOperand(1)) {
35080 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35081 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35090 /// Try to turn tests against the signbit in the form of:
35091 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
35092 /// into:
35093 /// SETGT(X, -1)
35094 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
35095 // This is only worth doing if the output type is i8 or i1.
35096 EVT ResultType = N->getValueType(0);
35097 if (ResultType != MVT::i8 && ResultType != MVT::i1)
35100 SDValue N0 = N->getOperand(0);
35101 SDValue N1 = N->getOperand(1);
35103 // We should be performing an xor against a truncated shift.
35104 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
35107 // Make sure we are performing an xor against one.
35108 if (!isOneConstant(N1))
35111 // SetCC on x86 zero extends so only act on this if it's a logical shift.
35112 SDValue Shift = N0.getOperand(0);
35113 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
35116 // Make sure we are truncating from one of i16, i32 or i64.
35117 EVT ShiftTy = Shift.getValueType();
35118 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
35121 // Make sure the shift amount extracts the sign bit.
35122 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
35123 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
35126 // Create a greater-than comparison against -1.
35127 // N.B. Using SETGE against 0 works but we want a canonical-looking
35128 // comparison, and using SETGT matches up with what TranslateX86CC does.
35130 SDValue ShiftOp = Shift.getOperand(0);
35131 EVT ShiftOpTy = ShiftOp.getValueType();
35132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35133 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
35134 *DAG.getContext(), ResultType);
35135 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
35136 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
35137 if (SetCCResultType != ResultType)
35138 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
35142 /// Turn vector tests of the signbit in the form of:
35143 /// xor (sra X, elt_size(X)-1), -1
35144 /// into:
35145 /// pcmpgt X, -1
35147 /// This should be called before type legalization because the pattern may not
35148 /// persist after that.
35149 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
35150 const X86Subtarget &Subtarget) {
35151 EVT VT = N->getValueType(0);
35152 if (!VT.isSimple())
35155 switch (VT.getSimpleVT().SimpleTy) {
35156 default: return SDValue();
35159 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
35160 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
35164 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
35167 // There must be a shift right algebraic before the xor, and the xor must be a
35168 // 'not' operation.
35169 SDValue Shift = N->getOperand(0);
35170 SDValue Ones = N->getOperand(1);
35171 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
35172 !ISD::isBuildVectorAllOnes(Ones.getNode()))
35175 // The shift should be smearing the sign bit across each vector element.
35176 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
35180 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
35181 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
35182 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
35185 // Create a greater-than comparison against -1. We don't use the more obvious
35186 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
35187 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
35190 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
35191 /// is valid for the given \p Subtarget.
35192 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
35193 const X86Subtarget &Subtarget) {
35194 if (!Subtarget.hasAVX512())
35197 // FIXME: Scalar type may be supported if we move it to vector register.
35198 if (!SrcVT.isVector())
35201 EVT SrcElVT = SrcVT.getScalarType();
35202 EVT DstElVT = DstVT.getScalarType();
35203 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
35205 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
35206 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
35210 /// Detect patterns of truncation with unsigned saturation:
35212 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
35213 /// Return the source value x to be truncated or SDValue() if the pattern was
35214 /// not matched.
35216 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
35217 /// where C1 >= 0 and C2 is unsigned max of destination type.
35219 /// (truncate (smax (smin (x, C2), C1)) to dest_type)
35220 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
35222 /// These two patterns are equivalent to:
35223 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
35224 /// So return the smax(x, C1) value to be truncated or SDValue() if the
35225 /// pattern was not matched.
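/// For example (types are illustrative): truncating v8i32 to v8i16 with
///   (truncate (umin x, <0xFFFF x 8>) to v8i16)
/// matches pattern 1 and returns x, which can then be emitted as an
/// unsigned-saturating truncate (e.g. VPMOVUSDW on suitable AVX512 targets).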
35226 static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35228 EVT InVT = In.getValueType();
35230 // Saturation with truncation. We truncate from InVT to VT.
35231 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
35232 "Unexpected types for truncate operation");
35234 // Match min/max and return limit value as a parameter.
35235 auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
35236 if (V.getOpcode() == Opcode &&
35237 ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
35238 return V.getOperand(0);
35243 if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
35244 // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
35245 // to the element size of the destination type.
35246 if (C2.isMask(VT.getScalarSizeInBits()))
35249 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
35250 if (MatchMinMax(SMin, ISD::SMAX, C1))
35251 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
35254 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
35255 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
35256 if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
35258 return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));
35264 /// Detect patterns of truncation with signed saturation:
35265 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
35266 /// signed_max_of_dest_type)) to dest_type)
35268 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
35269 /// signed_min_of_dest_type)) to dest_type).
35270 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
35271 /// Return the source value to be truncated or SDValue() if the pattern was not
35272 /// matched.
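/// For example (illustrative): truncating v8i32 to v8i16 with
///   (truncate (smin (smax x, <-32768 x 8>), <32767 x 8>) to v8i16)
/// matches the signed pattern and returns x; with MatchPackUS the clamp range
/// is [0, 65535] instead, which maps onto PACKUS-style truncation.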
35273 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
35274 unsigned NumDstBits = VT.getScalarSizeInBits();
35275 unsigned NumSrcBits = In.getScalarValueSizeInBits();
35276 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
35278 auto MatchMinMax = [](SDValue V, unsigned Opcode,
35279 const APInt &Limit) -> SDValue {
35281 if (V.getOpcode() == Opcode &&
35282 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
35283 return V.getOperand(0);
35287 APInt SignedMax, SignedMin;
35289 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
35290 SignedMin = APInt(NumSrcBits, 0);
35292 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
35293 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
35296 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
35297 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
35300 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
35301 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
35307 /// Detect a pattern of truncation with signed saturation.
35308 /// The types should allow use of the VPMOVS* instructions on AVX512.
35310 /// Return the source value to be truncated or SDValue() if the pattern was not matched.
35311 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
35312 const X86Subtarget &Subtarget,
35313 const TargetLowering &TLI) {
35314 if (!TLI.isTypeLegal(In.getValueType()))
35316 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
35318 return detectSSatPattern(In, VT);
35321 /// Detect a pattern of truncation with saturation:
35322 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
35323 /// The types should allow use of the VPMOVUS* instructions on AVX512.
35324 /// Return the source value to be truncated or SDValue() if the pattern was not
35325 /// matched.
35326 static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35328 const X86Subtarget &Subtarget,
35329 const TargetLowering &TLI) {
35330 if (!TLI.isTypeLegal(In.getValueType()))
35332 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
35334 return detectUSatPattern(In, VT, DAG, DL);
35337 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
35339 const X86Subtarget &Subtarget) {
35340 EVT SVT = VT.getScalarType();
35341 EVT InVT = In.getValueType();
35342 EVT InSVT = InVT.getScalarType();
35343 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35344 if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
35345 isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
35346 if (auto SSatVal = detectSSatPattern(In, VT))
35347 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
35348 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
35349 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
35351 if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
35352 (SVT == MVT::i8 || SVT == MVT::i16) &&
35353 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
35354 if (auto USatVal = detectSSatPattern(In, VT, true)) {
35355 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
35356 if (SVT == MVT::i8 && InSVT == MVT::i32) {
35357 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35358 VT.getVectorNumElements());
35359 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
35362 return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
35364 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
35365 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
35368 if (auto SSatVal = detectSSatPattern(In, VT))
35369 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
35375 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
35376 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
35377 /// X86ISD::AVG instruction.
35378 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35379 const X86Subtarget &Subtarget,
35381 if (!VT.isVector())
35383 EVT InVT = In.getValueType();
35384 unsigned NumElems = VT.getVectorNumElements();
35386 EVT ScalarVT = VT.getVectorElementType();
35387 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
35388 isPowerOf2_32(NumElems)))
35391 // InScalarVT is the intermediate type in the AVG pattern and it should be wider
35392 // than the original input type (i8/i16).
35393 EVT InScalarVT = InVT.getVectorElementType();
35394 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
35397 if (!Subtarget.hasSSE2())
35400 // Detect the following pattern:
35402 // %1 = zext <N x i8> %a to <N x i32>
35403 // %2 = zext <N x i8> %b to <N x i32>
35404 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
35405 // %4 = add nuw nsw <N x i32> %3, %2
35406 // %5 = lshr <N x i32> %4, <i32 1 x N>
35407 // %6 = trunc <N x i32> %5 to <N x i8>
35409 // In AVX512, the last instruction can also be a trunc store.
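//
// A small worked example (i8 elements) shows why the pattern must be matched
// in the wider type: for %a = 255 and %b = 254, %a + %b + 1 = 510, which does
// not fit in i8; in i32 the sum is exact and (510 >> 1) = 255 truncates back
// to i8 without loss, matching the X86ISD::AVG (PAVGB/PAVGW) semantics.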
35410 if (In.getOpcode() != ISD::SRL)
35413 // A lambda that checks whether the given SDValue is a constant vector whose
35414 // elements all lie in the range [Min, Max].
35415 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
35416 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
35417 if (!BV || !BV->isConstant())
35419 for (SDValue Op : V->ops()) {
35420 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
35423 const APInt &Val = C->getAPIntValue();
35424 if (Val.ult(Min) || Val.ugt(Max))
35430 // Check if each element of the vector is right-shifted by one.
35431 auto LHS = In.getOperand(0);
35432 auto RHS = In.getOperand(1);
35433 if (!IsConstVectorInRange(RHS, 1, 1))
35435 if (LHS.getOpcode() != ISD::ADD)
35438 // Detect a pattern of a + b + 1 where the order doesn't matter.
35439 SDValue Operands[3];
35440 Operands[0] = LHS.getOperand(0);
35441 Operands[1] = LHS.getOperand(1);
35443 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35444 ArrayRef<SDValue> Ops) {
35445 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
35448 // Take care of the case when one of the operands is a constant vector whose
35449 // elements are in the range [1, 256] (or [1, 65536] for i16 elements).
35450 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
35451 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
35452 Operands[0].getOperand(0).getValueType() == VT) {
35453 // The pattern is detected. Subtract one from the constant vector, then
35454 // demote it and emit X86ISD::AVG instruction.
35455 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
35456 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
35457 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
35458 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35459 { Operands[0].getOperand(0), Operands[1] },
35463 if (Operands[0].getOpcode() == ISD::ADD)
35464 std::swap(Operands[0], Operands[1]);
35465 else if (Operands[1].getOpcode() != ISD::ADD)
35467 Operands[2] = Operands[1].getOperand(0);
35468 Operands[1] = Operands[1].getOperand(1);
35470 // Now we have three operands of two additions. Check that one of them is a
35471 // constant vector with ones, and the other two are promoted from i8/i16.
35472 for (int i = 0; i < 3; ++i) {
35473 if (!IsConstVectorInRange(Operands[i], 1, 1))
35475 std::swap(Operands[i], Operands[2]);
35477 // Check if Operands[0] and Operands[1] are results of type promotion.
35478 for (int j = 0; j < 2; ++j)
35479 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
35480 Operands[j].getOperand(0).getValueType() != VT)
35483 // The pattern is detected, emit X86ISD::AVG instruction(s).
35484 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35485 { Operands[0].getOperand(0),
35486 Operands[1].getOperand(0) }, AVGBuilder);
35492 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
35493 TargetLowering::DAGCombinerInfo &DCI,
35494 const X86Subtarget &Subtarget) {
35495 LoadSDNode *Ld = cast<LoadSDNode>(N);
35496 EVT RegVT = Ld->getValueType(0);
35497 EVT MemVT = Ld->getMemoryVT();
35499 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35501 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
35502 // into two 16-byte operations. Also split non-temporal aligned loads on
35503 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
35504 ISD::LoadExtType Ext = Ld->getExtensionType();
35506 unsigned AddressSpace = Ld->getAddressSpace();
35507 unsigned Alignment = Ld->getAlignment();
35508 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
35509 Ext == ISD::NON_EXTLOAD &&
35510 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
35511 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
35512 AddressSpace, Alignment, &Fast) && !Fast))) {
35513 unsigned NumElems = RegVT.getVectorNumElements();
35517 SDValue Ptr = Ld->getBasePtr();
35519 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
35522 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
35523 Alignment, Ld->getMemOperand()->getFlags());
35525 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
35527 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
35528 Ld->getPointerInfo().getWithOffset(16),
35529 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
35530 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
35532 Load2.getValue(1));
35534 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
35535 return DCI.CombineTo(N, NewVec, TF, true);
35541 /// If V is a build vector of boolean constants and exactly one of those
35542 /// constants is true, return the operand index of that true element.
35543 /// Otherwise, return -1.
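/// For example (assuming a 4-element i1 mask), <i1 0, i1 0, i1 1, i1 0> would
/// return index 2, while <i1 0, i1 1, i1 1, i1 0> and an all-zeros vector both
/// return -1.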
35544 static int getOneTrueElt(SDValue V) {
35545 // This needs to be a build vector of booleans.
35546 // TODO: Checking for the i1 type matches the IR definition for the mask,
35547 // but the mask check could be loosened to i8 or other types. That might
35548 // also require checking more than 'allOnesValue'; e.g., the x86 HW
35549 // instructions only require that the MSB is set for each mask element.
35550 // The ISD::MSTORE comments/definition do not specify how the mask operand
35551 // is interpreted.
35552 auto *BV = dyn_cast<BuildVectorSDNode>(V);
35553 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
35556 int TrueIndex = -1;
35557 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
35558 for (unsigned i = 0; i < NumElts; ++i) {
35559 const SDValue &Op = BV->getOperand(i);
35562 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
35565 if (ConstNode->getAPIntValue().isAllOnesValue()) {
35566 // If we already found a one, this is too many.
35567 if (TrueIndex >= 0)
35575 /// Given a masked memory load/store operation, return true if it has one mask
35576 /// bit set. If so, also return the memory address of
35577 /// the scalar element to load/store, the vector index to insert/extract that
35578 /// scalar element, and the alignment for the scalar memory access.
35579 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
35580 SelectionDAG &DAG, SDValue &Addr,
35581 SDValue &Index, unsigned &Alignment) {
35582 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
35583 if (TrueMaskElt < 0)
35586 // Get the address of the one scalar element that is specified by the mask
35587 // using the appropriate offset from the base pointer.
35588 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
35589 Addr = MaskedOp->getBasePtr();
35590 if (TrueMaskElt != 0) {
35591 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
35592 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
35595 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
35596 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
35600 /// If exactly one element of the mask is set for a non-extending masked load,
35601 /// reduce it to a scalar load and a vector insert.
35602 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35603 /// mask have already been optimized in IR, so we don't bother with those here.
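/// For illustration, a masked load of <4 x i32> with mask <i1 0, i1 0, i1 1, i1 0>
/// becomes, roughly:
///   %s = load i32, i32* (base + 2 * 4)
///   %r = insertelement <4 x i32> %passthru, i32 %s, i32 2
/// (a sketch only; the actual nodes are built on the SelectionDAG below).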
35605 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35606 TargetLowering::DAGCombinerInfo &DCI) {
35607 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35608 // However, some target hooks may need to be added to know when the transform
35609 // is profitable. Endianness would also have to be considered.
35611 SDValue Addr, VecIndex;
35612 unsigned Alignment;
35613 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
35616 // Load the one scalar element that is specified by the mask using the
35617 // appropriate offset from the base pointer.
35619 EVT VT = ML->getValueType(0);
35620 EVT EltVT = VT.getVectorElementType();
35622 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
35623 Alignment, ML->getMemOperand()->getFlags());
35625 // Insert the loaded element into the appropriate place in the vector.
35626 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
35628 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
35632 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35633 TargetLowering::DAGCombinerInfo &DCI) {
35634 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
35638 EVT VT = ML->getValueType(0);
35640 // If we are loading the first and last elements of a vector, it is safe and
35641 // always faster to load the whole vector. Replace the masked load with a
35642 // vector load and select.
35643 unsigned NumElts = VT.getVectorNumElements();
35644 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
35645 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
35646 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
35647 if (LoadFirstElt && LoadLastElt) {
35648 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35649 ML->getMemOperand());
35650 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
35651 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
35654 // Convert a masked load with a constant mask into a masked load and a select.
35655 // This allows the select operation to use a faster kind of select instruction
35656 // (for example, vblendvps -> vblendps).
35658 // Don't try this if the pass-through operand is already undefined. That would
35659 // cause an infinite loop because that's what we're about to create.
35660 if (ML->getSrc0().isUndef())
35663 // The new masked load has an undef pass-through operand. The select uses the
35664 // original pass-through operand.
35665 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35666 ML->getMask(), DAG.getUNDEF(VT),
35667 ML->getMemoryVT(), ML->getMemOperand(),
35668 ML->getExtensionType());
35669 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
35671 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
35674 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
35675 TargetLowering::DAGCombinerInfo &DCI,
35676 const X86Subtarget &Subtarget) {
35677 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
35679 // TODO: Expanding load with constant mask may be optimized as well.
35680 if (Mld->isExpandingLoad())
35683 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
35684 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
35686 // TODO: Do some AVX512 subsets benefit from this transform?
35687 if (!Subtarget.hasAVX512())
35688 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
35692 if (Mld->getExtensionType() != ISD::SEXTLOAD)
35695 // Resolve extending loads.
35696 EVT VT = Mld->getValueType(0);
35697 unsigned NumElems = VT.getVectorNumElements();
35698 EVT LdVT = Mld->getMemoryVT();
35701 assert(LdVT != VT && "Cannot extend to the same type");
35702 unsigned ToSz = VT.getScalarSizeInBits();
35703 unsigned FromSz = LdVT.getScalarSizeInBits();
35704 // From/To sizes and ElemCount must be pow of two.
35705 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35706 "Unexpected size for extending masked load");
35708 unsigned SizeRatio = ToSz / FromSz;
35709 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
35711 // Create a type on which we perform the shuffle.
35712 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35713 LdVT.getScalarType(), NumElems*SizeRatio);
35714 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35716 // Convert Src0 value.
35717 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
35718 if (!Mld->getSrc0().isUndef()) {
35719 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35720 for (unsigned i = 0; i != NumElems; ++i)
35721 ShuffleVec[i] = i * SizeRatio;
35723 // Can't shuffle using an illegal type.
35724 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35725 "WideVecVT should be legal");
35726 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
35727 DAG.getUNDEF(WideVecVT), ShuffleVec);
35730 // Prepare the new mask.
35732 SDValue Mask = Mld->getMask();
35733 if (Mask.getValueType() == VT) {
35734 // Mask and original value have the same type.
35735 NewMask = DAG.getBitcast(WideVecVT, Mask);
35736 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35737 for (unsigned i = 0; i != NumElems; ++i)
35738 ShuffleVec[i] = i * SizeRatio;
35739 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
35740 ShuffleVec[i] = NumElems * SizeRatio;
35741 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35742 DAG.getConstant(0, dl, WideVecVT),
35745 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35746 unsigned WidenNumElts = NumElems*SizeRatio;
35747 unsigned MaskNumElts = VT.getVectorNumElements();
35748 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35751 unsigned NumConcat = WidenNumElts / MaskNumElts;
35752 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35753 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35755 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35758 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
35759 Mld->getBasePtr(), NewMask, WideSrc0,
35760 Mld->getMemoryVT(), Mld->getMemOperand(),
35762 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
35763 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
35766 /// If exactly one element of the mask is set for a non-truncating masked store,
35767 /// reduce it to a vector extract and a scalar store.
35768 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35769 /// mask have already been optimized in IR, so we don't bother with those here.
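/// For illustration, a masked store of <4 x i32> %v with mask
/// <i1 0, i1 1, i1 0, i1 0> becomes, roughly:
///   %e = extractelement <4 x i32> %v, i32 1
///   store i32 %e, i32* (base + 1 * 4)
/// (a sketch only; the actual nodes are built on the SelectionDAG below).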
35770 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
35771 SelectionDAG &DAG) {
35772 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35773 // However, some target hooks may need to be added to know when the transform
35774 // is profitable. Endianness would also have to be considered.
35776 SDValue Addr, VecIndex;
35777 unsigned Alignment;
35778 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
35781 // Extract the one scalar element that is actually being stored.
35783 EVT VT = MS->getValue().getValueType();
35784 EVT EltVT = VT.getVectorElementType();
35785 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
35786 MS->getValue(), VecIndex);
35788 // Store that element at the appropriate offset from the base pointer.
35789 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
35790 Alignment, MS->getMemOperand()->getFlags());
35793 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
35794 const X86Subtarget &Subtarget) {
35795 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
35797 if (Mst->isCompressingStore())
35800 if (!Mst->isTruncatingStore()) {
35801 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
35802 return ScalarStore;
35804 // If the mask is checking (0 > X), we're creating a vector with all-zeros
35805 // or all-ones elements based on the sign bits of X. AVX1 masked store only
35806 // cares about the sign bit of each mask element, so eliminate the compare:
35807 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
35808 // Note that by waiting to match an x86-specific PCMPGT node, we're
35809 // eliminating potentially more complex matching of a setcc node which has
35810 // a full range of predicates.
35811 SDValue Mask = Mst->getMask();
35812 if (Mask.getOpcode() == X86ISD::PCMPGT &&
35813 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
35814 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
35815 "Unexpected type for PCMPGT");
35816 return DAG.getMaskedStore(
35817 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
35818 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
35821 // TODO: AVX512 targets should also be able to simplify something like the
35822 // pattern above, but that pattern will be different. It will either need to
35823 // match setcc more generally or match PCMPGTM later (in tablegen?).
35828 // Resolve truncating stores.
35829 EVT VT = Mst->getValue().getValueType();
35830 unsigned NumElems = VT.getVectorNumElements();
35831 EVT StVT = Mst->getMemoryVT();
35834 assert(StVT != VT && "Cannot truncate to the same type");
35835 unsigned FromSz = VT.getScalarSizeInBits();
35836 unsigned ToSz = StVT.getScalarSizeInBits();
35838 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35840 // The truncating store is legal in some cases. For example
35841 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
35842 // are designated for truncate store.
35843 // In this case we don't need any further transformations.
35844 if (TLI.isTruncStoreLegal(VT, StVT))
35847 // From/To sizes and ElemCount must be pow of two.
35848 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35849 "Unexpected size for truncating masked store");
35850 // We are going to use the original vector elt for storing.
35851 // Accumulated smaller vector elements must be a multiple of the store size.
35852 assert (((NumElems * FromSz) % ToSz) == 0 &&
35853 "Unexpected ratio for truncating masked store");
35855 unsigned SizeRatio = FromSz / ToSz;
35856 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
35858 // Create a type on which we perform the shuffle.
35859 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35860 StVT.getScalarType(), NumElems*SizeRatio);
35862 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35864 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
35865 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35866 for (unsigned i = 0; i != NumElems; ++i)
35867 ShuffleVec[i] = i * SizeRatio;
35869 // Can't shuffle using an illegal type.
35870 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35871 "WideVecVT should be legal");
35873 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
35874 DAG.getUNDEF(WideVecVT),
35878 SDValue Mask = Mst->getMask();
35879 if (Mask.getValueType() == VT) {
35880 // Mask and original value have the same type.
35881 NewMask = DAG.getBitcast(WideVecVT, Mask);
35882 for (unsigned i = 0; i != NumElems; ++i)
35883 ShuffleVec[i] = i * SizeRatio;
35884 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
35885 ShuffleVec[i] = NumElems*SizeRatio;
35886 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35887 DAG.getConstant(0, dl, WideVecVT),
35890 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35891 unsigned WidenNumElts = NumElems*SizeRatio;
35892 unsigned MaskNumElts = VT.getVectorNumElements();
35893 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35896 unsigned NumConcat = WidenNumElts / MaskNumElts;
35897 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35898 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35900 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35903 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
35904 Mst->getBasePtr(), NewMask, StVT,
35905 Mst->getMemOperand(), false);
35908 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
35909 const X86Subtarget &Subtarget) {
35910 StoreSDNode *St = cast<StoreSDNode>(N);
35911 EVT VT = St->getValue().getValueType();
35912 EVT StVT = St->getMemoryVT();
35914 SDValue StoredVal = St->getOperand(1);
35915 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35917 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
35918 // This will avoid a copy to k-register.
35919 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
35920 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35921 StoredVal.getOperand(0).getValueType() == MVT::i8) {
35922 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
35923 St->getBasePtr(), St->getPointerInfo(),
35924 St->getAlignment(), St->getMemOperand()->getFlags());
35927 // Widen v2i1/v4i1 stores to v8i1.
35928 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
35929 Subtarget.hasAVX512()) {
35930 unsigned NumConcats = 8 / VT.getVectorNumElements();
35931 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
35932 Ops[0] = StoredVal;
35933 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
35934 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
35935 St->getPointerInfo(), St->getAlignment(),
35936 St->getMemOperand()->getFlags());
35939 // Turn vXi1 stores of constants into a scalar store.
35940 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
35941 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
35942 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
35943 // If it's a v64i1 store without 64-bit support, we need two stores.
35944 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
35945 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
35946 StoredVal->ops().slice(0, 32));
35947 Lo = combinevXi1ConstantToInteger(Lo, DAG);
35948 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
35949 StoredVal->ops().slice(32, 32));
35950 Hi = combinevXi1ConstantToInteger(Hi, DAG);
35952 unsigned Alignment = St->getAlignment();
35954 SDValue Ptr0 = St->getBasePtr();
35955 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
35958 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
35959 Alignment, St->getMemOperand()->getFlags());
35961 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
35962 St->getPointerInfo().getWithOffset(4),
35963 MinAlign(Alignment, 4U),
35964 St->getMemOperand()->getFlags());
35965 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
35968 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
35969 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
35970 St->getPointerInfo(), St->getAlignment(),
35971 St->getMemOperand()->getFlags());
35974 // If we are saving a concatenation of two XMM registers and 32-byte stores
35975 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
35977 unsigned AddressSpace = St->getAddressSpace();
35978 unsigned Alignment = St->getAlignment();
35979 if (VT.is256BitVector() && StVT == VT &&
35980 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
35981 AddressSpace, Alignment, &Fast) &&
35983 unsigned NumElems = VT.getVectorNumElements();
35987 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
35988 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
35990 SDValue Ptr0 = St->getBasePtr();
35991 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
35994 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
35995 Alignment, St->getMemOperand()->getFlags());
35997 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
35998 St->getPointerInfo().getWithOffset(16),
35999 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
36000 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
36003 // Optimize trunc store (of multiple scalars) to shuffle and store.
36004 // First, pack all of the elements in one place. Next, store to memory
36005 // in fewer chunks.
36006 if (St->isTruncatingStore() && VT.isVector()) {
36007 // Check if we can detect an AVG pattern from the truncation. If yes,
36008 // replace the trunc store by a normal store with the result of X86ISD::AVG
36010 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
36012 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
36013 St->getPointerInfo(), St->getAlignment(),
36014 St->getMemOperand()->getFlags());
36016 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36018 detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
36020 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
36021 dl, Val, St->getBasePtr(),
36022 St->getMemoryVT(), St->getMemOperand(), DAG);
36023 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
36024 DAG, dl, Subtarget, TLI))
36025 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
36026 dl, Val, St->getBasePtr(),
36027 St->getMemoryVT(), St->getMemOperand(), DAG);
36029 unsigned NumElems = VT.getVectorNumElements();
36030 assert(StVT != VT && "Cannot truncate to the same type");
36031 unsigned FromSz = VT.getScalarSizeInBits();
36032 unsigned ToSz = StVT.getScalarSizeInBits();
36034 // The truncating store is legal in some cases. For example
36035 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
36036 // are designated for truncate store.
36037 // In this case we don't need any further transformations.
36038 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
36041 // From, To sizes and ElemCount must be pow of two
36042 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
36043 // We are going to use the original vector elt for storing.
36044 // Accumulated smaller vector elements must be a multiple of the store size.
36045 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
36047 unsigned SizeRatio = FromSz / ToSz;
36049 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
36051 // Create a type on which we perform the shuffle
36052 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
36053 StVT.getScalarType(), NumElems*SizeRatio);
36055 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
36057 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
36058 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
36059 for (unsigned i = 0; i != NumElems; ++i)
36060 ShuffleVec[i] = i * SizeRatio;
36062 // Can't shuffle using an illegal type.
36063 if (!TLI.isTypeLegal(WideVecVT))
36066 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
36067 DAG.getUNDEF(WideVecVT),
36069 // At this point all of the data is stored at the bottom of the
36070 // register. We now need to save it to mem.
36072 // Find the largest store unit
36073 MVT StoreType = MVT::i8;
36074 for (MVT Tp : MVT::integer_valuetypes()) {
36075 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
36079 // On 32-bit systems, we can't save 64-bit integers directly. Try bitcasting to f64.
36080 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
36081 (64 <= NumElems * ToSz))
36082 StoreType = MVT::f64;
36084 // Bitcast the original vector into a vector of store-size units
36085 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
36086 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
36087 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
36088 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
36089 SmallVector<SDValue, 8> Chains;
36090 SDValue Ptr = St->getBasePtr();
36092 // Perform one or more big stores into memory.
36093 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
36094 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
36095 StoreType, ShuffWide,
36096 DAG.getIntPtrConstant(i, dl));
36098 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
36099 St->getAlignment(), St->getMemOperand()->getFlags());
36100 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
36101 Chains.push_back(Ch);
36104 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
36107 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
36108 // the FP state in cases where an emms may be missing.
36109 // A preferable solution to the general problem is to figure out the right
36110 // places to insert EMMS. This qualifies as a quick hack.
36112 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
36113 if (VT.getSizeInBits() != 64)
36116 const Function &F = DAG.getMachineFunction().getFunction();
36117 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
36119 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
36120 if ((VT.isVector() ||
36121 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
36122 isa<LoadSDNode>(St->getValue()) &&
36123 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
36124 St->getChain().hasOneUse() && !St->isVolatile()) {
36125 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
36126 SmallVector<SDValue, 8> Ops;
36128 if (!ISD::isNormalLoad(Ld))
36131 // If this is not the MMX case, i.e. we are just turning i64 load/store
36132 // into f64 load/store, avoid the transformation if there are multiple
36133 // uses of the loaded value.
36134 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
36139 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
36140 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
36142 if (Subtarget.is64Bit() || F64IsLegal) {
36143 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
36144 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
36145 Ld->getMemOperand());
36147 // Make sure new load is placed in same chain order.
36148 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
36149 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
36150 St->getMemOperand());
36153 // Otherwise, lower to two pairs of 32-bit loads / stores.
36154 SDValue LoAddr = Ld->getBasePtr();
36155 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
36157 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
36158 Ld->getPointerInfo(), Ld->getAlignment(),
36159 Ld->getMemOperand()->getFlags());
36160 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
36161 Ld->getPointerInfo().getWithOffset(4),
36162 MinAlign(Ld->getAlignment(), 4),
36163 Ld->getMemOperand()->getFlags());
36164 // Make sure new loads are placed in same chain order.
36165 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
36166 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
36168 LoAddr = St->getBasePtr();
36169 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
36172 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
36173 St->getAlignment(), St->getMemOperand()->getFlags());
36174 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
36175 St->getPointerInfo().getWithOffset(4),
36176 MinAlign(St->getAlignment(), 4),
36177 St->getMemOperand()->getFlags());
36178 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
36181 // This is similar to the above case, but here we handle a scalar 64-bit
36182 // integer store that is extracted from a vector on a 32-bit target.
36183 // If we have SSE2, then we can treat it like a floating-point double
36184 // to get past legalization. The execution dependencies fixup pass will
36185 // choose the optimal machine instruction for the store if this really is
36186 // an integer or v2f32 rather than an f64.
36187 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
36188 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
36189 SDValue OldExtract = St->getOperand(1);
36190 SDValue ExtOp0 = OldExtract.getOperand(0);
36191 unsigned VecSize = ExtOp0.getValueSizeInBits();
36192 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
36193 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
36194 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
36195 BitCast, OldExtract.getOperand(1));
36196 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
36197 St->getPointerInfo(), St->getAlignment(),
36198 St->getMemOperand()->getFlags());
36204 /// Return 'true' if this vector operation is "horizontal"
36205 /// and return the operands for the horizontal operation in LHS and RHS. A
36206 /// horizontal operation performs the binary operation on successive elements
36207 /// of its first operand, then on successive elements of its second operand,
36208 /// returning the resulting values in a vector. For example, if
36209 /// A = < float a0, float a1, float a2, float a3 >
36211 /// B = < float b0, float b1, float b2, float b3 >
36212 /// then the result of doing a horizontal operation on A and B is
36213 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
36214 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
36215 /// A horizontal-op B, for some already available A and B, and if so then LHS is
36216 /// set to A, RHS to B, and the routine returns 'true'.
36217 /// Note that the binary operation should have the property that if one of the
36218 /// operands is UNDEF then the result is UNDEF.
36219 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
36220 // Look for the following pattern: if
36221 // A = < float a0, float a1, float a2, float a3 >
36222 // B = < float b0, float b1, float b2, float b3 >
36224 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
36225 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
36226 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
36227 // which is A horizontal-op B.
36229 // At least one of the operands should be a vector shuffle.
36230 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
36231 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
36234 MVT VT = LHS.getSimpleValueType();
36236 assert((VT.is128BitVector() || VT.is256BitVector()) &&
36237 "Unsupported vector type for horizontal add/sub");
36239 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
36240 // operate independently on 128-bit lanes.
36241 unsigned NumElts = VT.getVectorNumElements();
36242 unsigned NumLanes = VT.getSizeInBits()/128;
36243 unsigned NumLaneElts = NumElts / NumLanes;
36244 assert((NumLaneElts % 2 == 0) &&
36245 "Vector type should have an even number of elements in each lane");
36246 unsigned HalfLaneElts = NumLaneElts/2;
36248 // View LHS in the form
36249 // LHS = VECTOR_SHUFFLE A, B, LMask
36250 // If LHS is not a shuffle then pretend it is the shuffle
36251 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
36252 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
36253 // the appropriate type.
36254 SDValue A, B;
36255 SmallVector<int, 16> LMask(NumElts);
36256 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
36257 if (!LHS.getOperand(0).isUndef())
36258 A = LHS.getOperand(0);
36259 if (!LHS.getOperand(1).isUndef())
36260 B = LHS.getOperand(1);
36261 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
36262 std::copy(Mask.begin(), Mask.end(), LMask.begin());
36264 if (!LHS.isUndef())
36266 for (unsigned i = 0; i != NumElts; ++i)
36270 // Likewise, view RHS in the form
36271 // RHS = VECTOR_SHUFFLE C, D, RMask
36272 SDValue C, D;
36273 SmallVector<int, 16> RMask(NumElts);
36274 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
36275 if (!RHS.getOperand(0).isUndef())
36276 C = RHS.getOperand(0);
36277 if (!RHS.getOperand(1).isUndef())
36278 D = RHS.getOperand(1);
36279 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
36280 std::copy(Mask.begin(), Mask.end(), RMask.begin());
36282 if (!RHS.isUndef())
36284 for (unsigned i = 0; i != NumElts; ++i)
36288 // Check that the shuffles are both shuffling the same vectors.
36289 if (!(A == C && B == D) && !(A == D && B == C))
36292 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
36293 if (!A.getNode() && !B.getNode())
36296 // If A and B occur in reverse order in RHS, then "swap" them (which means
36297 // rewriting the mask).
36299 ShuffleVectorSDNode::commuteMask(RMask);
36301 // At this point LHS and RHS are equivalent to
36302 // LHS = VECTOR_SHUFFLE A, B, LMask
36303 // RHS = VECTOR_SHUFFLE A, B, RMask
36304 // Check that the masks correspond to performing a horizontal operation.
36305 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
36306 for (unsigned i = 0; i != NumLaneElts; ++i) {
36307 int LIdx = LMask[i+l], RIdx = RMask[i+l];
36309 // Ignore any UNDEF components.
36310 if (LIdx < 0 || RIdx < 0 ||
36311 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
36312 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
36315 // Check that successive elements are being operated on. If not, this is
36316 // not a horizontal operation.
36317 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
36318 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
36319 if (!(LIdx == Index && RIdx == Index + 1) &&
36320 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
36325 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
36326 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
36330 /// Do target-specific dag combines on floating-point adds/subs.
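/// For illustration (an assumed v4f32 FADD on an SSE3 target), the combine
/// rewrites
///   %l = shufflevector <4 x float> %a, %b, <0, 2, 4, 6>
///   %r = shufflevector <4 x float> %a, %b, <1, 3, 5, 7>
///   %s = fadd <4 x float> %l, %r
/// into a single X86ISD::FHADD of %a and %b (i.e. one HADDPS).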
36331 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
36332 const X86Subtarget &Subtarget) {
36333 EVT VT = N->getValueType(0);
36334 SDValue LHS = N->getOperand(0);
36335 SDValue RHS = N->getOperand(1);
36336 bool IsFadd = N->getOpcode() == ISD::FADD;
36337 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
36339 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
36340 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
36341 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
36342 isHorizontalBinOp(LHS, RHS, IsFadd)) {
36343 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
36344 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
36349 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
36350 /// the resulting code,
36351 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
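/// For illustration (an assumed v8i32 multiply whose only use is a truncation
/// back to v8i16, where %x is v8i16):
///   trunc(mul(zext(%x), C)) --> mul(%x, trunc(C))
/// Only one real truncation is introduced, because truncating the zext is free
/// and the constant operand folds.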
36352 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
36353 const X86Subtarget &Subtarget,
36355 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
36356 SDValue Src = N->getOperand(0);
36357 unsigned Opcode = Src.getOpcode();
36358 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36360 EVT VT = N->getValueType(0);
36361 EVT SrcVT = Src.getValueType();
36363 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
36364 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
36366 // Repeated operand, so we are only trading one output truncation for
36367 // one input truncation.
36371 // See if either operand has been extended from a smaller/equal size to
36372 // the truncation size, allowing a truncation to combine with the extend.
36373 unsigned Opcode0 = Op0.getOpcode();
36374 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
36375 Opcode0 == ISD::ZERO_EXTEND) &&
36376 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36379 unsigned Opcode1 = Op1.getOpcode();
36380 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
36381 Opcode1 == ISD::ZERO_EXTEND) &&
36382 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36385 // See if either operand is a single-use constant which can be constant
36386 // folded.
36387 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
36388 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
36389 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
36390 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
36393 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
36394 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
36395 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
36396 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
36399 // Don't combine if the operation has other uses.
36400 if (!N->isOnlyUserOf(Src.getNode()))
36403 // Only support vector truncation for now.
36404 // TODO: i64 scalar math would benefit as well.
36405 if (!VT.isVector())
36408 // In most cases it's only worth pre-truncating if we're only facing the cost
36409 // of one truncation.
36410 // i.e. if one of the inputs will constant fold or the input is repeated.
36415 SDValue Op0 = Src.getOperand(0);
36416 SDValue Op1 = Src.getOperand(1);
36417 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
36418 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36419 return TruncateArithmetic(Op0, Op1);
36424 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
36425 // better to truncate if we have the chance.
36426 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
36427 !TLI.isOperationLegal(Opcode, SrcVT))
36428 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
36431 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
36432 SDValue Op0 = Src.getOperand(0);
36433 SDValue Op1 = Src.getOperand(1);
36434 if (TLI.isOperationLegal(Opcode, VT) &&
36435 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36436 return TruncateArithmetic(Op0, Op1);
36444 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
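/// For illustration (an assumed v8i32 -> v8i16 truncation on SSE4.1), each
/// 128-bit piece is masked so the unsigned-saturating pack cannot saturate:
///   %m = and <4 x i32> %piece, <i32 0xFFFF x 4>
///   ... PACKUSDW %m.lo, %m.hi     ; yields the truncated v8i16
/// so the PACKUS simply drops the already-zeroed upper halves.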
36445 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
36446 const X86Subtarget &Subtarget,
36447 SelectionDAG &DAG) {
36448 SDValue In = N->getOperand(0);
36449 EVT InVT = In.getValueType();
36450 EVT InSVT = InVT.getVectorElementType();
36451 EVT OutVT = N->getValueType(0);
36452 EVT OutSVT = OutVT.getVectorElementType();
36454 // Split a long vector into vectors of legal type and mask to unset all bits
36455 // that won't appear in the result to prevent saturation.
36456 // TODO - we should be doing this at the maximum legal size but this is
36457 // causing regressions where we're concatenating back to max width just to
36458 // perform the AND and then extracting back again.
36459 unsigned NumSubRegs = InVT.getSizeInBits() / 128;
36460 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
36461 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
36462 SmallVector<SDValue, 8> SubVecs(NumSubRegs);
36465 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
36466 SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
36468 for (unsigned i = 0; i < NumSubRegs; i++) {
36469 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
36470 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
36471 SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
36473 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
36475 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
36478 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
36479 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
36480 const X86Subtarget &Subtarget,
36481 SelectionDAG &DAG) {
36482 SDValue In = N->getOperand(0);
36483 EVT InVT = In.getValueType();
36484 EVT OutVT = N->getValueType(0);
36485 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
36486 DAG.getValueType(OutVT));
36487 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
36490 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
36491 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
36492 /// legalization the truncation will be translated into a BUILD_VECTOR with each
36493 /// element that is extracted from a vector and then truncated, and it is
36494 /// difficult to do this optimization based on them.
36495 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
36496 const X86Subtarget &Subtarget) {
36497 EVT OutVT = N->getValueType(0);
36498 if (!OutVT.isVector())
36501 SDValue In = N->getOperand(0);
36502 if (!In.getValueType().isSimple())
36505 EVT InVT = In.getValueType();
36506 unsigned NumElems = OutVT.getVectorNumElements();
36508 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
36509 // SSE2, and we need to take care of it specially.
36510 // AVX512 provides vpmovdb.
36511 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
36514 EVT OutSVT = OutVT.getVectorElementType();
36515 EVT InSVT = InVT.getVectorElementType();
36516 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
36517 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
36521 // SSSE3's pshufb results in fewer instructions in the cases below.
36522 if (Subtarget.hasSSSE3() && NumElems == 8 &&
36523 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
36524 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
36528 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
36529 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
36530 // truncate 2 x v4i32 to v8i16.
36531 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
36532 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
36533 if (InSVT == MVT::i32)
36534 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
36539 /// This function transforms vector truncation of 'extended sign-bits' or
36540 /// 'extended zero-bits' values, i.e. vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32,
36541 /// into X86ISD::PACKSS/PACKUS operations.
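/// For illustration (an assumed v8i32 -> v8i16 truncation):
///   %s = sext <8 x i16> %x to <8 x i32>   ; at least 17 known sign bits
///   %t = trunc <8 x i32> %s to <8 x i16>  ; can be lowered with PACKSS
/// Likewise, if the upper 16 bits of each element are known zero (a mask or
/// zext_in_reg value), the truncation can use PACKUS instead.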
36542 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
36544 const X86Subtarget &Subtarget) {
36545 // Requires SSE2 but AVX512 has fast truncate.
36546 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36549 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
36552 SDValue In = N->getOperand(0);
36553 if (!In.getValueType().isSimple())
36556 MVT VT = N->getValueType(0).getSimpleVT();
36557 MVT SVT = VT.getScalarType();
36559 MVT InVT = In.getValueType().getSimpleVT();
36560 MVT InSVT = InVT.getScalarType();
36562 // Check we have a truncation suited for PACKSS/PACKUS.
36563 if (!VT.is128BitVector() && !VT.is256BitVector())
36565 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
36567 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
36570 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
36571 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
36573 // Use PACKUS if the input has zero-bits that extend all the way to the
36574 // packed/truncated value. e.g. masks, zext_in_reg, etc.
36576 DAG.computeKnownBits(In, Known);
36577 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
36578 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
36579 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
36581 // Use PACKSS if the input has sign-bits that extend all the way to the
36582 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
36583 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
36584 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
36585 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
36590 // Try to form a MULHU or MULHS node by looking for
36591 // (trunc (srl (mul ext, ext), 16))
36592 // TODO: This is X86 specific because we want to be able to handle wide types
36593 // before type legalization. But we can only do it if the vector will be
36594 // legalized via widening/splitting. Type legalization can't handle promotion
36595 // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
36596 // combiner.
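//
// For illustration (assuming v8i16 elements), the matched DAG corresponds to:
//   %a32 = sext <8 x i16> %a to <8 x i32>
//   %b32 = sext <8 x i16> %b to <8 x i32>
//   %m   = mul <8 x i32> %a32, %b32
//   %s   = lshr <8 x i32> %m, <i32 16 x 8>
//   %t   = trunc <8 x i32> %s to <8 x i16>
// which is rewritten to (mulhs %a, %b); with zext inputs it becomes mulhu.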
36597 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
36598 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36599 // First instruction should be a right shift of a multiply.
36600 if (Src.getOpcode() != ISD::SRL ||
36601 Src.getOperand(0).getOpcode() != ISD::MUL)
36604 if (!Subtarget.hasSSE2())
36607 // Only handle vXi16 types that are at least 128-bits.
36608 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
36609 VT.getVectorNumElements() < 8)
36612 // Input type should be vXi32.
36613 EVT InVT = Src.getValueType();
36614 if (InVT.getVectorElementType() != MVT::i32)
36617 // Need a shift by 16.
36619 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
36623 SDValue LHS = Src.getOperand(0).getOperand(0);
36624 SDValue RHS = Src.getOperand(0).getOperand(1);
36626 unsigned ExtOpc = LHS.getOpcode();
36627 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
36628 RHS.getOpcode() != ExtOpc)
36631 // Peek through the extends.
36632 LHS = LHS.getOperand(0);
36633 RHS = RHS.getOperand(0);
36635 // Ensure the input types match.
36636 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
36639 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
36640 return DAG.getNode(Opc, DL, VT, LHS, RHS);
36643 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
36644 const X86Subtarget &Subtarget) {
36645 EVT VT = N->getValueType(0);
36646 SDValue Src = N->getOperand(0);
36649 // Attempt to pre-truncate inputs to arithmetic ops instead.
36650 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
36653 // Try to detect AVG pattern first.
36654 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
36657 // Try to combine truncation with signed/unsigned saturation.
36658 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
36661 // Try to combine PMULHUW/PMULHW for vXi16.
36662 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
36665 // The bitcast source is a direct mmx result.
36666 // Detect a truncation to i32 of a bitcast from x86mmx.
36667 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
36668 SDValue BCSrc = Src.getOperand(0);
36669 if (BCSrc.getValueType() == MVT::x86mmx)
36670 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
36673 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
36674 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
36677 return combineVectorTruncation(N, DAG, Subtarget);
36680 /// Returns the negated value if the node \p N flips sign of FP value.
36682 /// An FP-negation node may take different forms: FNEG(x) or FXOR(x, 0x80000000).
36683 /// AVX512F does not have FXOR, so FNEG is lowered as
36684 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
36685 /// In this case we go through all bitcasts.
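/// For illustration, on such a target a negation of a v4f32 value may appear as
///   (v4f32 (bitcast (xor (v4i32 (bitcast %x)), <i32 0x80000000 x 4>)))
/// and this helper peeks through the bitcasts, checks that the constant is the
/// sign-mask splat, and returns %x.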
36686 static SDValue isFNEG(SDNode *N) {
36687 if (N->getOpcode() == ISD::FNEG)
36688 return N->getOperand(0);
36690 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
36691 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
36694 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
36695 if (!Op1.getValueType().isFloatingPoint())
36698 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
36700 unsigned EltBits = Op1.getScalarValueSizeInBits();
36701 auto isSignMask = [&](const ConstantFP *C) {
36702 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
36705 // There is more than one way to represent the same constant on
36706 // the different X86 targets. The type of the node may also depend on size.
36707 // - load scalar value and broadcast
36708 // - BUILD_VECTOR node
36709 // - load from a constant pool.
36710 // We check all variants here.
36711 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
36712 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
36713 if (isSignMask(cast<ConstantFP>(C)))
36716 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
36717 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
36718 if (isSignMask(CN->getConstantFPValue()))
36721 } else if (auto *C = getTargetConstantFromNode(Op1)) {
36722 if (C->getType()->isVectorTy()) {
36723 if (auto *SplatV = C->getSplatValue())
36724 if (isSignMask(cast<ConstantFP>(SplatV)))
36726 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
36727 if (isSignMask(FPConst))
36733 /// Do target-specific dag combines on floating point negations.
36734 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
36735 const X86Subtarget &Subtarget) {
36736 EVT OrigVT = N->getValueType(0);
36737 SDValue Arg = isFNEG(N);
36738 assert(Arg.getNode() && "N is expected to be an FNEG node");
36740 EVT VT = Arg.getValueType();
36741 EVT SVT = VT.getScalarType();
36744 // Let legalize expand this if it isn't a legal type yet.
36745 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36748 // If we're negating a FMUL node on a target with FMA, then we can avoid the
36749 // use of a constant by performing (-0 - A*B) instead.
36750 // FIXME: Check rounding control flags as well once it becomes available.
36751 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
36752 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
36753 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
36754 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
36755 Arg.getOperand(1), Zero);
36756 return DAG.getBitcast(OrigVT, NewNode);
36759 // If we're negating an FMA node, then we can adjust the
36760 // instruction to include the extra negation.
36761 unsigned NewOpcode = 0;
36762 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
36763 switch (Arg.getOpcode()) {
36764 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
36765 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
36766 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
36767 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
36768 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
36769 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
36770 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
36771 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
36772 // We can't handle a scalar intrinsic node here because it would only
36773 // invert one element and not the whole vector. But we could try to handle
36774 // a negation of the lower element only.
36778 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
36779 Arg.getNode()->ops()));
36784 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
36785 const X86Subtarget &Subtarget) {
36786 MVT VT = N->getSimpleValueType(0);
36787 // If we have integer vector types available, use the integer opcodes.
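// Illustrative sketch: (FAND v4f32 A, B) is rewritten below as
// bitcast(AND v2i64 (bitcast A), (bitcast B)); the switch picks the matching
// integer opcode for FOR/FXOR/FAND/FANDN.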
36788 if (VT.isVector() && Subtarget.hasSSE2()) {
36791 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
36793 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
36794 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
36795 unsigned IntOpcode;
36796 switch (N->getOpcode()) {
36797 default: llvm_unreachable("Unexpected FP logic op");
36798 case X86ISD::FOR: IntOpcode = ISD::OR; break;
36799 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
36800 case X86ISD::FAND: IntOpcode = ISD::AND; break;
36801 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
36803 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
36804 return DAG.getBitcast(VT, IntOp);
36810 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
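/// For example (illustrative): (xor (setcc E, EFLAGS), 1) --> (setcc NE, EFLAGS).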
36811 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
36812 if (N->getOpcode() != ISD::XOR)
36815 SDValue LHS = N->getOperand(0);
36816 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
36817 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
36820 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
36821 X86::CondCode(LHS->getConstantOperandVal(0)));
36823 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
36826 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
36827 TargetLowering::DAGCombinerInfo &DCI,
36828 const X86Subtarget &Subtarget) {
36829 // If this is SSE1-only, convert to FXOR to avoid scalarization.
36830 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
36831 N->getValueType(0) == MVT::v4i32) {
36832 return DAG.getBitcast(
36833 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
36834 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
36835 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
36838 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
36841 if (DCI.isBeforeLegalizeOps())
36844 if (SDValue SetCC = foldXor1SetCC(N, DAG))
36847 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
36850 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36854 return combineFneg(N, DAG, Subtarget);
36858 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
36859 TargetLowering::DAGCombinerInfo &DCI,
36860 const X86Subtarget &Subtarget) {
36861 SDValue Op0 = N->getOperand(0);
36862 SDValue Op1 = N->getOperand(1);
36863 EVT VT = N->getValueType(0);
36864 unsigned NumBits = VT.getSizeInBits();
36866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36867 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36868 !DCI.isBeforeLegalizeOps());
36870 // TODO - Constant Folding.
36871 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36872 // Reduce Cst1 to the bottom 16-bits.
36873 // NOTE: SimplifyDemandedBits won't do this for constants.
36874 const APInt &Val1 = Cst1->getAPIntValue();
36875 APInt MaskedVal1 = Val1 & 0xFFFF;
36876 if (MaskedVal1 != Val1)
36877 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
36878 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
36881 // Only the bottom 16 bits of the control value are required.
36882 KnownBits Known;
36883 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
36884 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
36885 DCI.CommitTargetLoweringOpt(TLO);
36886 return SDValue(N, 0);
36892 static bool isNullFPScalarOrVectorConst(SDValue V) {
36893 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
36896 /// If a value is a scalar FP zero or a vector FP zero (potentially including
36897 /// undefined elements), return a zero constant that may be used to fold away
36898 /// that value. In the case of a vector, the returned constant will not contain
36899 /// undefined elements even if the input parameter does. This makes it suitable
36900 /// to be used as a replacement operand with operations (eg, bitwise-and) where
36901 /// an undef should not propagate.
36902 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
36903 const X86Subtarget &Subtarget) {
36904 if (!isNullFPScalarOrVectorConst(V))
36907 if (V.getValueType().isVector())
36908 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
36913 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
36914 const X86Subtarget &Subtarget) {
36915 SDValue N0 = N->getOperand(0);
36916 SDValue N1 = N->getOperand(1);
36917 EVT VT = N->getValueType(0);
36920 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
36921 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
36922 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
36923 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
36926 auto isAllOnesConstantFP = [](SDValue V) {
36927 if (V.getSimpleValueType().isVector())
36928 return ISD::isBuildVectorAllOnes(V.getNode());
36929 auto *C = dyn_cast<ConstantFPSDNode>(V);
36930 return C && C->getConstantFPValue()->isAllOnesValue();
36933 // fand (fxor X, -1), Y --> fandn X, Y
36934 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
36935 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
36937 // fand X, (fxor Y, -1) --> fandn Y, X
36938 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
36939 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
36944 /// Do target-specific dag combines on X86ISD::FAND nodes.
36945 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
36946 const X86Subtarget &Subtarget) {
36947 // FAND(0.0, x) -> 0.0
36948 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
36951 // FAND(x, 0.0) -> 0.0
36952 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
36955 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
36958 return lowerX86FPLogicOp(N, DAG, Subtarget);
36961 /// Do target-specific dag combines on X86ISD::FANDN nodes.
36962 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
36963 const X86Subtarget &Subtarget) {
36964 // FANDN(0.0, x) -> x
36965 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
36966 return N->getOperand(1);
36968 // FANDN(x, 0.0) -> 0.0
36969 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
36972 return lowerX86FPLogicOp(N, DAG, Subtarget);
36975 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
36976 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
36977 const X86Subtarget &Subtarget) {
36978 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
36980 // F[X]OR(0.0, x) -> x
36981 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
36982 return N->getOperand(1);
36984 // F[X]OR(x, 0.0) -> x
36985 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
36986 return N->getOperand(0);
36989 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
36992 return lowerX86FPLogicOp(N, DAG, Subtarget);
36995 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
36996 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
36997 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
36999 // Only perform optimizations if UnsafeMath is used.
37000 if (!DAG.getTarget().Options.UnsafeFPMath)
37003 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
37004 // into FMINC and FMAXC, which are commutative operations.
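// Note (illustrative): SSE min/max are not commutative because on unordered
// (NaN) inputs they pass through one fixed operand rather than the NaN;
// FMINC/FMAXC assert that commuting is safe under unsafe-math.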
37005 unsigned NewOp = 0;
37006 switch (N->getOpcode()) {
37007 default: llvm_unreachable("unknown opcode");
37008 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
37009 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
37012 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
37013 N->getOperand(0), N->getOperand(1));
37016 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
37017 const X86Subtarget &Subtarget) {
37018 if (Subtarget.useSoftFloat())
37021 // TODO: If an operand is already known to be a NaN or not a NaN, this
37022 // should be an optional swap and FMAX/FMIN.
37024 EVT VT = N->getValueType(0);
37025 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
37026 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
37027 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
37030 SDValue Op0 = N->getOperand(0);
37031 SDValue Op1 = N->getOperand(1);
37033 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
37035 // If we don't have to respect NaN inputs, this is a direct translation to x86
37036 // min/max instructions.
37037 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
37038 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
37040 // If we have to respect NaN inputs, this takes at least 3 instructions.
37041 // Favor a library call when operating on a scalar and minimizing code size.
37042 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
37045 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
37046 DAG.getDataLayout(), *DAG.getContext(), VT);
37048 // There are 4 possibilities involving NaN inputs, and these are the required
37049 // outputs:
37050 //                   Op1
37051 //               Num     NaN
37052 //            ----------------
37053 //     Num    |  Max  |  Op0 |
37054 // Op0        ----------------
37055 //     NaN    |  Op1  |  NaN |
37056 //            ----------------
37057 //
37058 // The SSE FP max/min instructions were not designed for this case, but rather
37059 // to implement:
37060 // Min = Op1 < Op0 ? Op1 : Op0
37061 // Max = Op1 > Op0 ? Op1 : Op0
37063 // So they always return Op0 if either input is a NaN. However, we can still
37064 // use those instructions for fmaxnum by selecting away a NaN input.
37066 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
37067 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
37068 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
37070 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
37071 // are NaN, the NaN value of Op1 is the result.
37072 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
37075 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
37076 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
37077 TargetLowering::DAGCombinerInfo &DCI,
37078 const X86Subtarget &Subtarget) {
37079 // ANDNP(0, x) -> x
37080 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
37081 return N->getOperand(1);
37083 // ANDNP(x, 0) -> 0
37084 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
37085 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
37087 EVT VT = N->getValueType(0);
37089 // Attempt to recursively combine a bitmask ANDNP with shuffles.
37090 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
37092 if (SDValue Res = combineX86ShufflesRecursively(
37093 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
37094 /*HasVarMask*/ false, DAG, Subtarget)) {
37095 DCI.CombineTo(N, Res);
37103 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
37104 TargetLowering::DAGCombinerInfo &DCI) {
37105 SDValue N0 = N->getOperand(0);
37106 SDValue N1 = N->getOperand(1);
37108 // BT ignores high bits in the bit index operand.
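// Illustrative: for a 32-bit BT, only the low 5 bits of the index are read, so
// a mask such as (and Y, 31) on the index operand can be dropped via the
// demanded-bits query below.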
37109 unsigned BitWidth = N1.getValueSizeInBits();
37110 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
37111 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
37112 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
37117 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
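// Illustrative sketch: (sext_in_reg (cmov C1, C2, cc, flags), i16)
//   --> (cmov (sext_in_reg C1, i16), (sext_in_reg C2, i16), cc, flags),
// where the sign extensions of the constant operands then fold away.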
37118 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
37119 EVT VT = N->getValueType(0);
37121 SDValue N0 = N->getOperand(0);
37122 SDValue N1 = N->getOperand(1);
37123 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37125 if (ExtraVT != MVT::i16)
37128 // Look through single use any_extends.
37129 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
37130 N0 = N0.getOperand(0);
37132 // See if we have a single use cmov.
37133 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
37136 SDValue CMovOp0 = N0.getOperand(0);
37137 SDValue CMovOp1 = N0.getOperand(1);
37139 // Make sure both operands are constants.
37140 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37141 !isa<ConstantSDNode>(CMovOp1.getNode()))
37146 // If we looked through an any_extend above, apply the same extension to the constants.
37147 if (N0.getValueType() != VT) {
37148 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
37149 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
37152 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
37153 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
37155 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
37156 N0.getOperand(2), N0.getOperand(3));
37159 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
37160 const X86Subtarget &Subtarget) {
37161 if (SDValue V = combineSextInRegCmov(N, DAG))
37164 EVT VT = N->getValueType(0);
37165 SDValue N0 = N->getOperand(0);
37166 SDValue N1 = N->getOperand(1);
37167 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37170 // A SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
37171 // AVX2, since there is no sign-extending shift-right operation for vectors
37172 // with 64-bit elements.
37173 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
37174 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
37175 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
37176 N0.getOpcode() == ISD::SIGN_EXTEND)) {
37177 SDValue N00 = N0.getOperand(0);
37179 // EXTLOAD has a better solution on AVX2: it may be replaced with an
37180 // X86ISD::VSEXT node.
37181 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
37182 if (!ISD::isNormalLoad(N00.getNode()))
37185 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
37186 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
37188 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
37194 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
37195 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
37196 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
37197 /// opportunities to combine math ops, use an LEA, or use a complex addressing
37198 /// mode. This can eliminate extend, add, and shift instructions.
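/// Illustrative (assumed i32/i64 types): (i64 sext (i32 add nsw X, 5))
///   --> (i64 add (i64 sext X), 5), which can later fold into an LEA or a
/// complex addressing mode.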
37199 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
37200 const X86Subtarget &Subtarget) {
37201 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
37202 Ext->getOpcode() != ISD::ZERO_EXTEND)
37205 // TODO: This should be valid for other integer types.
37206 EVT VT = Ext->getValueType(0);
37207 if (VT != MVT::i64)
37210 SDValue Add = Ext->getOperand(0);
37211 if (Add.getOpcode() != ISD::ADD)
37214 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
37215 bool NSW = Add->getFlags().hasNoSignedWrap();
37216 bool NUW = Add->getFlags().hasNoUnsignedWrap();
37218 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding into
37219 // the 'zext'.
37220 if ((Sext && !NSW) || (!Sext && !NUW))
37223 // Having a constant operand to the 'add' ensures that we are not increasing
37224 // the instruction count because the constant is extended for free below.
37225 // A constant operand can also become the displacement field of an LEA.
37226 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
37230 // Don't make the 'add' bigger if there's no hope of combining it with some
37231 // other 'add' or 'shl' instruction.
37232 // TODO: It may be profitable to generate simpler LEA instructions in place
37233 // of single 'add' instructions, but the cost model for selecting an LEA
37234 // currently has a high threshold.
37235 bool HasLEAPotential = false;
37236 for (auto *User : Ext->uses()) {
37237 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
37238 HasLEAPotential = true;
37242 if (!HasLEAPotential)
37245 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
37246 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
37247 SDValue AddOp0 = Add.getOperand(0);
37248 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
37249 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
37251 // The wider add is guaranteed to not wrap because both operands are
37252 // sign/zero-extended.
37253 SDNodeFlags Flags;
37254 Flags.setNoSignedWrap(NSW);
37255 Flags.setNoUnsignedWrap(NUW);
37256 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
37259 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
37260 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
37261 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
37262 /// extends from AH (which we otherwise need to do contortions to access).
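/// Illustrative: (i32 sext of the i8 remainder of (sdivrem X, Y)) is turned
/// into an SDIVREM8_SEXT_HREG node, whose remainder result is already
/// sign-extended out of AH.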
37263 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
37264 SDValue N0 = N->getOperand(0);
37265 auto OpcodeN = N->getOpcode();
37266 auto OpcodeN0 = N0.getOpcode();
37267 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
37268 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
37271 EVT VT = N->getValueType(0);
37272 EVT InVT = N0.getValueType();
37273 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
37274 !(VT == MVT::i32 || VT == MVT::i64))
37277 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
37278 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
37279 : X86ISD::UDIVREM8_ZEXT_HREG;
37280 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
37282 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
37283 // If this was a 64-bit extend, complete it.
37284 if (VT == MVT::i64)
37285 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
37286 return R.getValue(1);
37289 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
37290 // operands and the result of CMOV is not used anywhere else - promote CMOV
37291 // itself instead of promoting its result. This could be beneficial, because:
37292 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
37293 //    (or more) pseudo-CMOVs only when they go one after another, and
37294 //    getting rid of the result-extension code after CMOV will help that.
37295 // 2) Promotion of constant CMOV arguments is free, hence the
37296 //    {ANY,SIGN,ZERO}_EXTEND will just be deleted.
37297 // 3) A 16-bit CMOV encoding is 4 bytes, a 32-bit CMOV is 3 bytes, so this
37298 //    promotion is also good in terms of code size.
37299 //    (A 64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
37300 //    promotion.)
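// Illustrative sketch: (i32 zext (i16 cmov C1, C2, cc, flags))
//   --> (i32 cmov (i32 C1), (i32 C2), cc, flags); the constants are widened
// for free and the separate extend disappears.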
37301 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
37302 SDValue CMovN = Extend->getOperand(0);
37303 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
37306 EVT TargetVT = Extend->getValueType(0);
37307 unsigned ExtendOpcode = Extend->getOpcode();
37310 EVT VT = CMovN.getValueType();
37311 SDValue CMovOp0 = CMovN.getOperand(0);
37312 SDValue CMovOp1 = CMovN.getOperand(1);
37314 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37315 !isa<ConstantSDNode>(CMovOp1.getNode()))
37318 // Only extend to i32 or i64.
37319 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
37322 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
37323 // are free.
37324 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
37327 // If this is a zero extend to i64, we should only extend to i32 and use a free
37328 // zero extend to finish.
37329 EVT ExtendVT = TargetVT;
37330 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
37331 ExtendVT = MVT::i32;
37333 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
37334 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
37336 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
37337 CMovN.getOperand(2), CMovN.getOperand(3));
37339 // Finish extending if needed.
37340 if (ExtendVT != TargetVT)
37341 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
37346 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
37347 // This is more or less the reverse of combineBitcastvxi1.
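// Illustrative sketch (assumed types): (v8i16 zext (v8i1 bitcast (i8 X))) is
// rewritten as: broadcast X to every lane, AND lane i with (1 << i), compare
// the result against that mask with SETEQ, sign-extend, and (for zext) shift
// right by 15.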
37348 static SDValue
37349 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
37350 TargetLowering::DAGCombinerInfo &DCI,
37351 const X86Subtarget &Subtarget) {
37352 unsigned Opcode = N->getOpcode();
37353 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
37354 Opcode != ISD::ANY_EXTEND)
37356 if (!DCI.isBeforeLegalizeOps())
37358 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
37361 SDValue N0 = N->getOperand(0);
37362 EVT VT = N->getValueType(0);
37363 EVT SVT = VT.getScalarType();
37364 EVT InSVT = N0.getValueType().getScalarType();
37365 unsigned EltSizeInBits = SVT.getSizeInBits();
37367 // The input must be a bool vector (bitcast from a scalar
37368 // integer) that we are extending to legal integer element types.
37369 if (!VT.isVector())
37371 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
37373 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
37376 SDValue N00 = N0.getOperand(0);
37377 EVT SclVT = N0.getOperand(0).getValueType();
37378 if (!SclVT.isScalarInteger())
37383 SmallVector<int, 32> ShuffleMask;
37384 unsigned NumElts = VT.getVectorNumElements();
37385 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
37387 // Broadcast the scalar integer to the vector elements.
37388 if (NumElts > EltSizeInBits) {
37389 // If the scalar integer is greater than the vector element size, then we
37390 // must split it down into sub-sections for broadcasting. For example:
37391 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
37392 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
37393 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
37394 unsigned Scale = NumElts / EltSizeInBits;
37396 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
37397 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
37398 Vec = DAG.getBitcast(VT, Vec);
37400 for (unsigned i = 0; i != Scale; ++i)
37401 ShuffleMask.append(EltSizeInBits, i);
37403 // For a smaller scalar integer, we can simply any-extend it to the vector
37404 // element size (we don't care about the upper bits) and broadcast it to
37405 // all elements.
37406 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
37407 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37408 ShuffleMask.append(NumElts, 0);
37410 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
37412 // Now, mask the relevant bit in each element.
37413 SmallVector<SDValue, 32> Bits;
37414 for (unsigned i = 0; i != NumElts; ++i) {
37415 int BitIdx = (i % EltSizeInBits);
37416 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
37417 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
37419 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
37420 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
37422 // Compare against the bitmask and extend the result.
37423 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
37424 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
37425 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
37427 // For SEXT, this is now done; otherwise shift the result down for
37428 // zero-extension.
37429 if (Opcode == ISD::SIGN_EXTEND)
37430 return Vec;
37431 return DAG.getNode(ISD::SRL, DL, VT, Vec,
37432 DAG.getConstant(EltSizeInBits - 1, DL, VT));
37435 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
37436 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
37437 /// with UNDEFs) the input into vectors of the same size as the target type,
37438 /// which then extend the lowest elements.
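/// Illustrative (SSE4.1, assumed types): (v4i32 sext (v4i16 X)) becomes
/// (sign_extend_vector_inreg (v8i16 concat_vectors X, undef)).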
37439 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
37440 TargetLowering::DAGCombinerInfo &DCI,
37441 const X86Subtarget &Subtarget) {
37442 unsigned Opcode = N->getOpcode();
37443 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
37445 if (!DCI.isBeforeLegalizeOps())
37447 if (!Subtarget.hasSSE2())
37450 SDValue N0 = N->getOperand(0);
37451 EVT VT = N->getValueType(0);
37452 EVT SVT = VT.getScalarType();
37453 EVT InVT = N0.getValueType();
37454 EVT InSVT = InVT.getScalarType();
37456 // Input type must be a vector and we must be extending legal integer types.
37457 if (!VT.isVector())
37459 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
37461 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
37464 // On AVX2+ targets, if the input/output types are both legal then we will be
37465 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
37466 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
37467 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
37472 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
37473 EVT InVT = N.getValueType();
37474 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
37475 Size / InVT.getScalarSizeInBits());
37476 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
37477 DAG.getUNDEF(InVT));
37479 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
37482 // If the target size is less than 128 bits, extend to a type that would extend
37483 // to 128 bits, extend that, and extract the original target vector.
37484 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
37485 unsigned Scale = 128 / VT.getSizeInBits();
37487 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
37488 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
37489 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
37490 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
37491 DAG.getIntPtrConstant(0, DL));
37494 // If the target size is 128 bits (or 256 bits on an AVX2 target), then convert
37495 // to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to X86ISD::V*EXT.
37496 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
37497 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
37498 (VT.is256BitVector() && Subtarget.hasInt256()) ||
37499 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
37500 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
37501 return Opcode == ISD::SIGN_EXTEND
37502 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
37503 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
37506 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
37507 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
37508 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
37509 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
37510 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
37512 SmallVector<SDValue, 8> Opnds;
37513 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
37514 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
37515 DAG.getIntPtrConstant(Offset, DL));
37516 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
37517 SrcVec = Opcode == ISD::SIGN_EXTEND
37518 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
37519 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
37520 Opnds.push_back(SrcVec);
37522 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
37525 // On pre-AVX2 targets, split into 128-bit nodes of
37526 // ISD::*_EXTEND_VECTOR_INREG.
37527 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
37528 return SplitAndExtendInReg(128);
37530 // On pre-AVX512 targets, split into 256-bit nodes of
37531 // ISD::*_EXTEND_VECTOR_INREG.
37532 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
37533 return SplitAndExtendInReg(256);
37538 // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm
37539 // register.
37540 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
37541 const X86Subtarget &Subtarget) {
37542 SDValue N0 = N->getOperand(0);
37543 EVT VT = N->getValueType(0);
37546 // Only do this combine with AVX512 for vector extends.
37547 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
37550 // Only combine legal element types.
37551 EVT SVT = VT.getVectorElementType();
37552 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
37553 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
37556 // We can only do this if the vector size is 256 bits or less.
37557 unsigned Size = VT.getSizeInBits();
37561 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
37562 // those are the only integer compares we have.
37563 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
37564 if (ISD::isUnsignedIntSetCC(CC))
37567 // Only do this combine if the extension will be fully consumed by the setcc.
37568 EVT N00VT = N0.getOperand(0).getValueType();
37569 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
37570 if (Size != MatchingVecType.getSizeInBits())
37573 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
37575 if (N->getOpcode() == ISD::ZERO_EXTEND)
37576 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
37581 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
37582 TargetLowering::DAGCombinerInfo &DCI,
37583 const X86Subtarget &Subtarget) {
37584 SDValue N0 = N->getOperand(0);
37585 EVT VT = N->getValueType(0);
37586 EVT InVT = N0.getValueType();
37589 if (SDValue DivRem8 = getDivRem8(N, DAG))
37592 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37595 if (!DCI.isBeforeLegalizeOps())
37598 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37601 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
37602 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
37603 // Inverting and sign-extending a boolean is the same as zero-extending and
37604 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
37605 // efficiently lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
37606 // sext (xor Bool, -1) --> sub (zext Bool), 1
37607 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
37608 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
37611 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37614 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37618 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37621 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37627 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
37628 const X86Subtarget &Subtarget) {
37629 // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
37631 EVT VT = N->getValueType(0);
37633 // Let legalize expand this if it isn't a legal type yet.
37634 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
37637 EVT ScalarVT = VT.getScalarType();
37638 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
37641 SDValue A = N->getOperand(0);
37642 SDValue B = N->getOperand(1);
37643 SDValue C = N->getOperand(2);
37645 auto invertIfNegative = [](SDValue &V) {
37646 if (SDValue NegVal = isFNEG(V.getNode())) {
37647 V = NegVal;
37648 return true;
37649 }
37650 return false;
37651 };
37653 // Do not convert the passthru input of scalar intrinsics.
37654 // FIXME: We could allow negations of the lower element only.
37655 bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
37656 N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
37657 bool NegB = invertIfNegative(B);
37658 bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
37659 N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
37661 // The multiplication is negated when exactly one of NegA/NegB is set (NegA xor NegB).
37662 bool NegMul = (NegA != NegB);
37663 bool HasNeg = NegA || NegB || NegC;
37665 unsigned NewOpcode;
37666 if (!NegMul)
37667 NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
37668 else
37669 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
37671 // For FMA, we risk reconstructing the node we started with.
37672 // In order to avoid this, we check for negation or opcode change. If
37673 // one of the two happened, then it is a new node and we return it.
37674 if (N->getOpcode() == ISD::FMA) {
37675 if (HasNeg || NewOpcode != N->getOpcode())
37676 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37680 if (N->getOpcode() == X86ISD::FMADD_RND) {
37681 switch (NewOpcode) {
37682 case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
37683 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
37684 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
37685 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
37687 } else if (N->getOpcode() == X86ISD::FMADDS1) {
37688 switch (NewOpcode) {
37689 case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
37690 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
37691 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
37692 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
37694 } else if (N->getOpcode() == X86ISD::FMADDS3) {
37695 switch (NewOpcode) {
37696 case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
37697 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
37698 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
37699 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
37701 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
37702 switch (NewOpcode) {
37703 case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
37704 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
37705 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
37706 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
37708 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
37709 switch (NewOpcode) {
37710 case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
37711 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
37712 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
37713 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
37715 } else if (N->getOpcode() == X86ISD::FMADD4S) {
37716 switch (NewOpcode) {
37717 case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
37718 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
37719 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
37720 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
37723 llvm_unreachable("Unexpected opcode!");
37726 // Only return the node if the opcode was changed or one of the
37727 // operands was negated. If not, we'd just recreate the same node.
37728 if (HasNeg || NewOpcode != N->getOpcode()) {
37729 if (N->getNumOperands() == 4)
37730 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
37731 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37737 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
37738 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
37739 const X86Subtarget &Subtarget) {
37741 EVT VT = N->getValueType(0);
37743 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
37747 unsigned NewOpcode;
37748 switch (N->getOpcode()) {
37749 default: llvm_unreachable("Unexpected opcode!");
37750 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
37751 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
37752 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
37753 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
37756 if (N->getNumOperands() == 4)
37757 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37758 NegVal, N->getOperand(3));
37759 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37763 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
37764 TargetLowering::DAGCombinerInfo &DCI,
37765 const X86Subtarget &Subtarget) {
37766 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
37767 // (and (i32 x86isd::setcc_carry), 1)
37768 // This eliminates the zext. This transformation is necessary because
37769 // ISD::SETCC is always legalized to i8.
37771 SDValue N0 = N->getOperand(0);
37772 EVT VT = N->getValueType(0);
37774 if (N0.getOpcode() == ISD::AND &&
37776 N0.getOperand(0).hasOneUse()) {
37777 SDValue N00 = N0.getOperand(0);
37778 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37779 if (!isOneConstant(N0.getOperand(1)))
37781 return DAG.getNode(ISD::AND, dl, VT,
37782 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37783 N00.getOperand(0), N00.getOperand(1)),
37784 DAG.getConstant(1, dl, VT));
37788 if (N0.getOpcode() == ISD::TRUNCATE &&
37790 N0.getOperand(0).hasOneUse()) {
37791 SDValue N00 = N0.getOperand(0);
37792 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37793 return DAG.getNode(ISD::AND, dl, VT,
37794 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37795 N00.getOperand(0), N00.getOperand(1)),
37796 DAG.getConstant(1, dl, VT));
37800 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37803 if (DCI.isBeforeLegalizeOps())
37804 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37807 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37810 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37814 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37817 if (SDValue DivRem8 = getDivRem8(N, DAG))
37820 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37823 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
37829 /// Try to map a 128-bit or larger integer comparison to vector instructions
37830 /// before type legalization splits it up into chunks.
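/// Illustrative (SSE2): (setcc eq i128 X, Y) -->
/// (setcc eq (MOVMSK (pcmpeqb (bitcast v16i8 X), (bitcast v16i8 Y))), 0xFFFF).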
37831 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
37832 const X86Subtarget &Subtarget) {
37833 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
37834 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
37836 // We're looking for an oversized integer equality comparison.
37837 SDValue X = SetCC->getOperand(0);
37838 SDValue Y = SetCC->getOperand(1);
37839 EVT OpVT = X.getValueType();
37840 unsigned OpSize = OpVT.getSizeInBits();
37841 if (!OpVT.isScalarInteger() || OpSize < 128)
37844 // Ignore a comparison with zero because that gets special treatment in
37845 // EmitTest(). But make an exception for the special case of a pair of
37846 // logically-combined vector-sized operands compared to zero. This pattern may
37847 // be generated by the memcmp expansion pass with oversized integer compares
37849 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
37850 X.getOperand(0).getOpcode() == ISD::XOR &&
37851 X.getOperand(1).getOpcode() == ISD::XOR;
37852 if (isNullConstant(Y) && !IsOrXorXorCCZero)
37855 // Bail out if we know that this is not really just an oversized integer.
37856 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
37857 peekThroughBitcasts(Y).getValueType() == MVT::f128)
37860 // TODO: Use PXOR + PTEST for SSE4.1 or later?
37861 // TODO: Add support for AVX-512.
37862 EVT VT = SetCC->getValueType(0);
37864 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
37865 (OpSize == 256 && Subtarget.hasAVX2())) {
37866 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
37868 if (IsOrXorXorCCZero) {
37869 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
37870 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
37871 // Use 2 vector equality compares and 'and' the results before doing a
37873 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
37874 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
37875 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
37876 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
37877 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
37878 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
37879 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
37881 SDValue VecX = DAG.getBitcast(VecVT, X);
37882 SDValue VecY = DAG.getBitcast(VecVT, Y);
37883 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
37885 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
37886 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
37887 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
37888 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
37889 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
37890 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
37891 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
37893 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
37899 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
37900 const X86Subtarget &Subtarget) {
37901 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
37902 SDValue LHS = N->getOperand(0);
37903 SDValue RHS = N->getOperand(1);
37904 EVT VT = N->getValueType(0);
37905 EVT OpVT = LHS.getValueType();
37908 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
37909 // 0-x == y --> x+y == 0
37910 // 0-x != y --> x+y != 0
37911 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
37913 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
37914 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37916 // x == 0-y --> x+y == 0
37917 // x != 0-y --> x+y != 0
37918 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
37920 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
37921 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37924 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
37928 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
37929 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
37930 // Put build_vectors on the right.
37931 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
37932 std::swap(LHS, RHS);
37933 CC = ISD::getSetCCSwappedOperands(CC);
37934 }
37936 bool IsSEXT0 =
37937 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
37938 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
37939 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
37941 if (IsSEXT0 && IsVZero1) {
37942 assert(VT == LHS.getOperand(0).getValueType() &&
37943 "Uexpected operand type");
37944 if (CC == ISD::SETGT)
37945 return DAG.getConstant(0, DL, VT);
37946 if (CC == ISD::SETLE)
37947 return DAG.getConstant(1, DL, VT);
37948 if (CC == ISD::SETEQ || CC == ISD::SETGE)
37949 return DAG.getNOT(DL, LHS.getOperand(0), VT);
37951 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
37952 "Unexpected condition code!");
37953 return LHS.getOperand(0);
37957 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
37958 // pre-promote its result type since vXi1 vectors don't get promoted
37959 // during type legalization.
37960 // NOTE: The element count check is to ignore operand types that need to
37961 // go through type promotion to a 128-bit vector.
37962 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
37963 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
37964 (OpVT.getVectorElementType() == MVT::i8 ||
37965 OpVT.getVectorElementType() == MVT::i16)) {
37966 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
37968 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
37971 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
37972 // to avoid scalarization via legalization because v4i32 is not a legal type.
37973 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
37974 LHS.getValueType() == MVT::v4f32)
37975 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
37980 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
37981 TargetLowering::DAGCombinerInfo &DCI) {
37982 SDValue Src = N->getOperand(0);
37983 MVT SrcVT = Src.getSimpleValueType();
37985 // Perform constant folding.
37986 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
37987 assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
37988 APInt Imm(32, 0);
37989 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
37990 SDValue In = Src.getOperand(Idx);
37991 if (!In.isUndef() &&
37992 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
37993 Imm.setBit(Idx);
37994 }
37995 return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
37998 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37999 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38000 !DCI.isBeforeLegalizeOps());
38002 // MOVMSK only uses the MSB from each vector element.
38003 KnownBits Known;
38004 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
38005 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
38006 DCI.AddToWorklist(Src.getNode());
38007 DCI.CommitTargetLoweringOpt(TLO);
38008 return SDValue(N, 0);
38014 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
38015 TargetLowering::DAGCombinerInfo &DCI,
38016 const X86Subtarget &Subtarget) {
38019 if (DCI.isBeforeLegalizeOps()) {
38020 SDValue Index = N->getOperand(4);
38022 // Remove any sign extend from a type of 32 bits or smaller to a larger type.
38023 // Only do this before LegalizeOps in case we need the sign extend for legalization.
38024 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
38025 if (Index.getScalarValueSizeInBits() > 32 &&
38026 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
38027 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38028 NewOps[4] = Index.getOperand(0);
38029 DAG.UpdateNodeOperands(N, NewOps);
38030 // The original sign extend has fewer users; add it back to the worklist in
38031 // case it needs to be removed.
38032 DCI.AddToWorklist(Index.getNode());
38033 DCI.AddToWorklist(N);
38034 return SDValue(N, 0);
38038 // Make sure the index is either i32 or i64
38039 unsigned ScalarSize = Index.getScalarValueSizeInBits();
38040 if (ScalarSize != 32 && ScalarSize != 64) {
38041 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
38042 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
38043 Index.getValueType().getVectorNumElements());
38044 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
38045 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38047 DAG.UpdateNodeOperands(N, NewOps);
38048 DCI.AddToWorklist(N);
38049 return SDValue(N, 0);
38052 // Try to remove zero extends from 32->64 if we know the sign bit of
38053 // the input is zero.
38054 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
38055 Index.getScalarValueSizeInBits() == 64 &&
38056 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
38057 if (DAG.SignBitIsZero(Index.getOperand(0))) {
38058 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38059 NewOps[4] = Index.getOperand(0);
38060 DAG.UpdateNodeOperands(N, NewOps);
38061 // The original zero extend has fewer users; add it back to the worklist in
38062 // case it needs to be removed.
38063 DCI.AddToWorklist(Index.getNode());
38064 DCI.AddToWorklist(N);
38065 return SDValue(N, 0);
38070 // With AVX2 we only demand the upper bit of the mask.
38071 if (!Subtarget.hasAVX512()) {
38072 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38073 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38074 !DCI.isBeforeLegalizeOps());
38075 SDValue Mask = N->getOperand(2);
38076 KnownBits Known;
38077 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
38078 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
38079 DCI.AddToWorklist(Mask.getNode());
38080 DCI.CommitTargetLoweringOpt(TLO);
38081 return SDValue(N, 0);
38088 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
38089 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
38090 const X86Subtarget &Subtarget) {
38092 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
38093 SDValue EFLAGS = N->getOperand(1);
38095 // Try to simplify the EFLAGS and condition code operands.
38096 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
38097 return getSETCC(CC, Flags, DL, DAG);
38102 /// Optimize branch condition evaluation.
38103 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
38104 const X86Subtarget &Subtarget) {
38106 SDValue EFLAGS = N->getOperand(3);
38107 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
38109 // Try to simplify the EFLAGS and condition code operands.
38110 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
38111 // RAUW them under us.
38112 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
38113 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
38114 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
38115 N->getOperand(1), Cond, Flags);
38121 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
38122 SelectionDAG &DAG) {
38123 // Take advantage of vector comparisons producing 0 or -1 in each lane to
38124 // optimize away the operation when it's applied to a constant.
38126 // The general transformation is:
38127 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
38128 // AND(VECTOR_CMP(x,y), constant2)
38129 // constant2 = UNARYOP(constant)
38131 // Early exit if this isn't a vector operation, the operand of the
38132 // unary operation isn't a bitwise AND, or if the sizes of the operations
38133 // aren't the same.
38134 EVT VT = N->getValueType(0);
38135 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
38136 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
38137 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
38140 // Now check that the other operand of the AND is a constant. We could
38141 // make the transformation for non-constant splats as well, but it's unclear
38142 // that would be a benefit as it would not eliminate any operations, just
38143 // perform one more step in scalar code before moving to the vector unit.
38144 if (BuildVectorSDNode *BV =
38145 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
38146 // Bail out if the vector isn't a constant.
38147 if (!BV->isConstant())
38150 // Everything checks out. Build up the new and improved node.
38152 EVT IntVT = BV->getValueType(0);
38153 // Create a new constant of the appropriate type for the transformed
38154 // DAG node.
38155 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
38156 // The AND node needs bitcasts to/from an integer vector type around it.
38157 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
38158 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
38159 N->getOperand(0)->getOperand(0), MaskConst);
38160 SDValue Res = DAG.getBitcast(VT, NewAnd);
38167 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
38168 const X86Subtarget &Subtarget) {
38169 SDValue Op0 = N->getOperand(0);
38170 EVT VT = N->getValueType(0);
38171 EVT InVT = Op0.getValueType();
38173 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38174 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
38175 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
38176 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38178 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38179 InVT.getVectorNumElements());
38180 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
38182 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
38183 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38186 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
38187 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
38188 // the optimization here.
38189 if (DAG.SignBitIsZero(Op0))
38190 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
38195 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
38196 const X86Subtarget &Subtarget) {
38197 // First try to optimize away the conversion entirely when it's
38198 // conditionally from a constant. Vectors only.
38199 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
38202 // Now move on to more general possibilities.
38203 SDValue Op0 = N->getOperand(0);
38204 EVT VT = N->getValueType(0);
38205 EVT InVT = Op0.getValueType();
38207 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38208 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
38209 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
38210 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38212 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38213 InVT.getVectorNumElements());
38214 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
38215 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38218 // Without AVX512DQ we only support i64 to float scalar conversion. For both
38219 // vectors and scalars, see if we know that the upper bits are all the sign
38220 // bit, in which case we can truncate the input to i32 and convert from that.
38221 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
38222 unsigned BitWidth = InVT.getScalarSizeInBits();
38223 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
38224 if (NumSignBits >= (BitWidth - 31)) {
38225 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
38226 if (InVT.isVector())
38227 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
38228 InVT.getVectorNumElements());
38230 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
38231 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
38235 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
38236 // a 32-bit target where SSE doesn't support i64->FP operations.
38237 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
38238 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
38239 EVT LdVT = Ld->getValueType(0);
38241 // This transformation is not supported if the result type is f16 or f128.
38242 if (VT == MVT::f16 || VT == MVT::f128)
38245 // If we have AVX512DQ we can use packed conversion instructions unless
38246 // the VT is f80.
38247 if (Subtarget.hasDQI() && VT != MVT::f80)
38248 return SDValue();
38250 if (!Ld->isVolatile() && !VT.isVector() &&
38251 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
38252 !Subtarget.is64Bit() && LdVT == MVT::i64) {
38253 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
38254 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
38255 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
38262 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
38263 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38264 MVT VT = N->getSimpleValueType(0);
38265 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38266 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
38267 N->getOperand(0), N->getOperand(1),
38274 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
38275 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
38276 TargetLowering::DAGCombinerInfo &DCI) {
38277 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
38278 // the result is either zero or one (depending on the input carry bit).
38279 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
38280 if (X86::isZeroNode(N->getOperand(0)) &&
38281 X86::isZeroNode(N->getOperand(1)) &&
38282 // We don't have a good way to replace an EFLAGS use, so only do this when
38283 // the flag result is unused.
38284 SDValue(N, 1).use_empty()) {
38286 EVT VT = N->getValueType(0);
38287 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
38288 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
38289 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38290 DAG.getConstant(X86::COND_B, DL,
38293 DAG.getConstant(1, DL, VT));
38294 return DCI.CombineTo(N, Res1, CarryOut);
38297 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38298 MVT VT = N->getSimpleValueType(0);
38299 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38300 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
38301 N->getOperand(0), N->getOperand(1),
38308 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
38309 /// which is more useful than 0/1 in some cases.
38310 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
38312 // "Condition code B" is also known as "the carry flag" (CF).
38313 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
38314 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
38315 MVT VT = N->getSimpleValueType(0);
38316 if (VT == MVT::i8)
38317 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
38319 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
38320 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
38323 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
38324 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
38325 /// with CMP+{ADC, SBB}.
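/// Illustrative sketch (assuming i32 values): 'sub X, (zext (setcc ult A, B))'
/// can lower to 'cmp A, B; sbb $0, X', consuming the carry flag directly rather
/// than materializing the setcc result in a register.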
38326 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
38327 bool IsSub = N->getOpcode() == ISD::SUB;
38328 SDValue X = N->getOperand(0);
38329 SDValue Y = N->getOperand(1);
38331 // If this is an add, canonicalize a zext operand to the RHS.
38332 // TODO: Incomplete? What if both sides are zexts?
38333 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
38334 Y.getOpcode() != ISD::ZERO_EXTEND)
38337 // Look through a one-use zext.
38338 bool PeekedThroughZext = false;
38339 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
38340 Y = Y.getOperand(0);
38341 PeekedThroughZext = true;
38344 // If this is an add, canonicalize a setcc operand to the RHS.
38345 // TODO: Incomplete? What if both sides are setcc?
38346 // TODO: Should we allow peeking through a zext of the other operand?
38347 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
38348 Y.getOpcode() != X86ISD::SETCC)
38351 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
38355 EVT VT = N->getValueType(0);
38356 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
38358 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38359 // the general case below.
38360 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
38362 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
38363 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
38364 // This is a complicated way to get -1 or 0 from the carry flag:
38365 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38366 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38367 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38368 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38372 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
38373 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
38374 SDValue EFLAGS = Y->getOperand(1);
38375 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38376 EFLAGS.getValueType().isInteger() &&
38377 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38378 // Swap the operands of a SUB, and we have the same pattern as above.
38379 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
38380 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
38381 SDValue NewSub = DAG.getNode(
38382 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
38383 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38384 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38385 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38386 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38392 if (CC == X86::COND_B) {
38393 // X + SETB Z --> X + (mask SBB Z, Z)
38394 // X - SETB Z --> X - (mask SBB Z, Z)
38395 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
38396 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
38397 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38398 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38399 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
38402 if (CC == X86::COND_A) {
38403 SDValue EFLAGS = Y->getOperand(1);
38404 // Try to convert COND_A into COND_B in an attempt to facilitate
38405 // materializing "setb reg".
38407 // Do not flip "e > c", where "c" is a constant, because the Cmp
38408 // instruction cannot take an immediate as its first operand.
38410 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38411 EFLAGS.getValueType().isInteger() &&
38412 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38413 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
38414 EFLAGS.getNode()->getVTList(),
38415 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38416 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38417 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
38418 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38419 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
38420 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
38424 if (CC != X86::COND_E && CC != X86::COND_NE)
38427 SDValue Cmp = Y.getOperand(1);
38428 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
38429 !X86::isZeroNode(Cmp.getOperand(1)) ||
38430 !Cmp.getOperand(0).getValueType().isInteger())
38433 SDValue Z = Cmp.getOperand(0);
38434 EVT ZVT = Z.getValueType();
38436 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38437 // the general case below.
38439 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
38440 // fake operands:
38441 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
38442 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
38443 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
38444 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
38445 SDValue Zero = DAG.getConstant(0, DL, ZVT);
38446 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
38447 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
38448 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38449 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38450 SDValue(Neg.getNode(), 1));
38453 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
38454 // with fake operands:
38455 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
38456 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
38457 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
38458 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
38459 SDValue One = DAG.getConstant(1, DL, ZVT);
38460 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38461 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38462 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
38466 // (cmp Z, 1) sets the carry flag if Z is 0.
38467 SDValue One = DAG.getConstant(1, DL, ZVT);
38468 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38470 // Add the flags type for ADC/SBB nodes.
38471 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38473 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
38474 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
38475 if (CC == X86::COND_NE)
38476 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
38477 DAG.getConstant(-1ULL, DL, VT), Cmp1);
38479 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
38480 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
38481 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
38482 DAG.getConstant(0, DL, VT), Cmp1);
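// A worked example of the ADC/SBB rewrite above (sketch, SysV registers):
//   int f(int x, int z) { return x - (z != 0); }
// lowers without a separate test/sete/movzx to:
//   cmpl $1, %esi       ; CF = 1 iff z == 0
//   adcl $-1, %edi      ; x + (-1) + CF == x - (z != 0)
//   movl %edi, %eax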
38485 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
38486 const X86Subtarget &Subtarget) {
38487 if (!Subtarget.hasSSE2())
38490 SDValue MulOp = N->getOperand(0);
38491 SDValue Phi = N->getOperand(1);
38493 if (MulOp.getOpcode() != ISD::MUL)
38494 std::swap(MulOp, Phi);
38495 if (MulOp.getOpcode() != ISD::MUL)
38496 return SDValue();
38498 ShrinkMode Mode;
38499 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
38500 return SDValue();
38502 EVT VT = N->getValueType(0);
38504 unsigned RegSize = 128;
38505 if (Subtarget.useBWIRegs())
38506 RegSize = 512;
38507 else if (Subtarget.hasAVX())
38508 RegSize = 256;
38509 unsigned VectorSize = VT.getVectorNumElements() * 16;
38510 // If the vector size is less than 128, or greater than the supported RegSize,
38511 // do not use PMADD.
38512 if (VectorSize < 128 || VectorSize > RegSize)
38516 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38517 VT.getVectorNumElements());
38518 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38519 VT.getVectorNumElements() / 2);
38521 // Shrink the operands of mul.
38522 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
38523 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
38525 // Madd vector size is half of the original vector size
38526 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38527 ArrayRef<SDValue> Ops) {
38528 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
38529 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
38531 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
38533 // Fill the rest of the output with 0
38534 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
38535 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
38536 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
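// For illustration (sketch), the reduction this rewrites comes from loops like:
//   for (i = 0; i < n; ++i) sum += (int)a[i] * (int)b[i];   // a, b are i16
// e.g. for a v8i32 accumulator:
//   (add Phi, (mul (sext v8i16 A), (sext v8i16 B)))
//   --> (add Phi, (concat (v4i32 X86ISD::VPMADDWD A, B), zero))
// so the multiplies and the pairwise horizontal add happen in one VPMADDWD.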
38539 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
38540 const X86Subtarget &Subtarget) {
38541 if (!Subtarget.hasSSE2())
38545 EVT VT = N->getValueType(0);
38546 SDValue Op0 = N->getOperand(0);
38547 SDValue Op1 = N->getOperand(1);
38549 // TODO: There's nothing special about i32; any integer type above i16
38550 // should work just as well.
38551 if (!VT.isVector() || !VT.isSimple() ||
38552 VT.getVectorElementType() != MVT::i32)
38555 unsigned RegSize = 128;
38556 if (Subtarget.useBWIRegs())
38557 RegSize = 512;
38558 else if (Subtarget.hasAVX())
38559 RegSize = 256;
38561 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
38562 // TODO: We should be able to handle larger vectors by splitting them before
38563 // feeding them into several SADs, and then reducing over those.
38564 if (VT.getSizeInBits() / 4 > RegSize)
38567 // We know N is a reduction add, which means one of its operands is a phi.
38568 // To match SAD, we need the other operand to be a vector select.
38569 SDValue SelectOp, Phi;
38570 if (Op0.getOpcode() == ISD::VSELECT) {
38573 } else if (Op1.getOpcode() == ISD::VSELECT) {
38579 // Check whether we have an abs-diff pattern feeding into the select.
38580 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
38581 return SDValue();
38583 // SAD pattern detected. Now build a SAD instruction and an addition for
38584 // reduction. Note that the number of elements of the result of SAD is less
38585 // than the number of elements of its input. Therefore, we can only update
38586 // part of the elements in the reduction vector.
38587 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
38589 // The output of PSADBW is a vector of i64.
38590 // We need to turn the vector of i64 into a vector of i32.
38591 // If the reduction vector is at least as wide as the psadbw result, just
38592 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
38593 // anyway.
38594 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
38595 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
38596 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
38597 else
38598 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
38600 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
38601 // Fill the upper elements with zero to match the add width.
38602 SDValue Zero = DAG.getConstant(0, DL, VT);
38603 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
38604 DAG.getIntPtrConstant(0, DL));
38607 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
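// For illustration (sketch), the pattern handled above typically comes from:
//   for (i = 0; i < n; ++i) sum += abs((int)a[i] - (int)b[i]);   // a, b are u8
// The vselect of (sub a, b) / (sub b, a) feeding the reduction becomes a
// PSADBW, which emits one partial sum per 8-byte group into an i64 lane; the
// result is then bitcast or truncated to i32 lanes, widened with zeros if
// needed, and added into the existing reduction vector.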
38610 /// Convert vector increment or decrement to sub/add with an all-ones constant:
38611 /// add X, <1, 1...> --> sub X, <-1, -1...>
38612 /// sub X, <1, 1...> --> add X, <-1, -1...>
38613 /// The all-ones vector constant can be materialized using a pcmpeq instruction
38614 /// that is commonly recognized as an idiom (has no register dependency), so
38615 /// that's better/smaller than loading a splat 1 constant.
38616 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
38617 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
38618 "Unexpected opcode for increment/decrement transform");
38620 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
38621 // out and wait for legalization if we have an unsupported vector length.
38622 EVT VT = N->getValueType(0);
38623 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
38626 SDNode *N1 = N->getOperand(1).getNode();
38627 APInt SplatVal;
38628 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
38629 !SplatVal.isOneValue())
38630 return SDValue();
38632 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
38633 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
38634 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
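// For example (sketch), a v4i32 increment after this combine can lower to:
//   pcmpeqd %xmm1, %xmm1    ; xmm1 = <-1,-1,-1,-1>, no constant-pool load
//   psubd   %xmm1, %xmm0    ; x - (-1) == x + 1
// instead of a paddd with a splat-of-1 constant loaded from memory.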
38637 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
38638 const SDLoc &DL, EVT VT,
38639 const X86Subtarget &Subtarget) {
38640 // Example of pattern we try to detect:
38641 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
38642 // (add (build_vector (extract_elt t, 0),
38643 // (extract_elt t, 2),
38644 // (extract_elt t, 4),
38645 // (extract_elt t, 6)),
38646 // (build_vector (extract_elt t, 1),
38647 // (extract_elt t, 3),
38648 // (extract_elt t, 5),
38649 // (extract_elt t, 7)))
38651 if (!Subtarget.hasSSE2())
38654 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
38655 Op1.getOpcode() != ISD::BUILD_VECTOR)
38658 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38659 VT.getVectorNumElements() < 4 ||
38660 !isPowerOf2_32(VT.getVectorNumElements()))
38663 // Check if one of Op0,Op1 is of the form:
38664 // (build_vector (extract_elt Mul, 0),
38665 // (extract_elt Mul, 2),
38666 // (extract_elt Mul, 4),
38667 // (extract_elt Mul, 6)),
38668 // the other is of the form:
38669 // (build_vector (extract_elt Mul, 1),
38670 // (extract_elt Mul, 3),
38671 // (extract_elt Mul, 5),
38672 // (extract_elt Mul, 7)))
38673 // and identify Mul.
38674 SDValue Mul;
38675 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
38676 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
38677 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
38678 // TODO: Be more tolerant to undefs.
38679 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38680 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38681 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38682 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
38684 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
38685 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38686 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38687 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
38688 if (!Const0L || !Const1L || !Const0H || !Const1H)
38690 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38691 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38692 // Commutativity of mul allows factors of a product to reorder.
38693 if (Idx0L > Idx1L)
38694 std::swap(Idx0L, Idx1L);
38695 if (Idx0H > Idx1H)
38696 std::swap(Idx0H, Idx1H);
38697 // Commutativity of add allows pairs of factors to reorder.
38698 if (Idx0L > Idx0H) {
38699 std::swap(Idx0L, Idx0H);
38700 std::swap(Idx1L, Idx1H);
38702 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
38703 Idx1H != 2 * i + 3)
38705 if (!Mul) {
38706 // First time an extract_elt's source vector is visited. Must be a MUL
38707 // with 2X the number of vector elements of the BUILD_VECTOR.
38708 // Both extracts must be from same MUL.
38709 Mul = Op0L->getOperand(0);
38710 if (Mul->getOpcode() != ISD::MUL ||
38711 Mul.getValueType().getVectorNumElements() != 2 * e)
38712 return SDValue();
38713 }
38714 // Check that the extract is from the same MUL previously seen.
38715 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
38716 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
38717 return SDValue();
38720 // Check if the Mul source can be safely shrunk.
38721 ShrinkMode Mode;
38722 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
38723 return SDValue();
38725 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38726 ArrayRef<SDValue> Ops) {
38727 // Shrink by adding truncate nodes and let DAGCombine fold with the
38728 // sources.
38729 EVT InVT = Ops[0].getValueType();
38730 assert(InVT.getScalarType() == MVT::i32 &&
38731 "Unexpected scalar element type");
38732 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38733 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38734 InVT.getVectorNumElements() / 2);
38735 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38736 InVT.getVectorNumElements());
38737 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38738 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
38739 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
38741 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
38742 { Mul.getOperand(0), Mul.getOperand(1) },
38746 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
38747 const X86Subtarget &Subtarget) {
38748 const SDNodeFlags Flags = N->getFlags();
38749 if (Flags.hasVectorReduction()) {
38750 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
38752 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
38755 EVT VT = N->getValueType(0);
38756 SDValue Op0 = N->getOperand(0);
38757 SDValue Op1 = N->getOperand(1);
38759 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
38762 // Try to synthesize horizontal adds from adds of shuffles.
38763 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38764 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38765 isHorizontalBinOp(Op0, Op1, true))
38766 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
38768 if (SDValue V = combineIncDecVector(N, DAG))
38771 return combineAddOrSubToADCOrSBB(N, DAG);
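// For illustration of the horizontal-add case above (sketch), with SSSE3:
//   t0 = vector_shuffle<0,2,4,6> A, B
//   t1 = vector_shuffle<1,3,5,7> A, B
//   (add t0, t1)
// collapses to (X86ISD::HADD A, B), i.e. one PHADDD instead of two shuffles
// plus an add.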
38774 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
38775 const X86Subtarget &Subtarget) {
38776 SDValue Op0 = N->getOperand(0);
38777 SDValue Op1 = N->getOperand(1);
38778 EVT VT = N->getValueType(0);
38780 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
38781 // is only worth it with SSSE3 (PSHUFB).
38782 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
38783 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
38784 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
38785 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
38786 VT == MVT::v16i32 || VT == MVT::v8i64)))
38789 SDValue SubusLHS, SubusRHS;
38790 // Try to find umax(a,b) - b or a - umin(a,b) patterns
38791 // they may be converted to subus(a,b).
38792 // TODO: Need to add IR canonicalization for this code.
38793 if (Op0.getOpcode() == ISD::UMAX) {
38795 SDValue MaxLHS = Op0.getOperand(0);
38796 SDValue MaxRHS = Op0.getOperand(1);
38799 else if (MaxRHS == Op1)
38803 } else if (Op1.getOpcode() == ISD::UMIN) {
38805 SDValue MinLHS = Op1.getOperand(0);
38806 SDValue MinRHS = Op1.getOperand(1);
38809 else if (MinRHS == Op0)
38816 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38817 ArrayRef<SDValue> Ops) {
38818 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
38821 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
38822 // special preprocessing in some cases.
38823 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
38824 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
38825 { SubusLHS, SubusRHS }, SUBUSBuilder);
38827 // This special preprocessing can only be applied
38828 // if the value was zero-extended from 16 bits,
38829 // so we require the first 16 bits to be zero for 32-bit
38830 // values, or the first 48 bits for 64-bit values.
38831 KnownBits Known;
38832 DAG.computeKnownBits(SubusLHS, Known);
38833 unsigned NumZeros = Known.countMinLeadingZeros();
38834 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
38835 return SDValue();
38837 EVT ExtType = SubusLHS.getValueType();
38838 EVT ShrinkedType;
38839 if (VT == MVT::v8i32 || VT == MVT::v8i64)
38840 ShrinkedType = MVT::v8i16;
38841 else
38842 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
38844 // If SubusLHS is zero-extended, truncate SubusRHS to its size:
38845 // SubusRHS = umin(0xFFF.., SubusRHS).
38846 SDValue SaturationConst =
38847 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
38848 ShrinkedType.getScalarSizeInBits()),
38849 SDLoc(SubusLHS), ExtType);
38850 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
38852 SDValue NewSubusLHS =
38853 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
38854 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
38855 SDValue Psubus =
38856 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
38857 { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
38858 // Zero-extend the result; it may be used somewhere as a 32-bit value.
38859 // If the zext is not needed, it and a following trunc will fold away.
38860 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
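// A worked example of the max/min forms above (sketch), for v16i8:
//   (sub (umax A, B), B)  -->  (X86ISD::SUBUS A, B)     ; psubusb
//   (sub A, (umin A, B))  -->  (X86ISD::SUBUS A, B)
// Both compute the lane-wise saturating difference max(A - B, 0), which is
// exactly what PSUBUS provides in a single instruction.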
38863 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
38864 const X86Subtarget &Subtarget) {
38865 SDValue Op0 = N->getOperand(0);
38866 SDValue Op1 = N->getOperand(1);
38868 // X86 can't encode an immediate LHS of a sub. See if we can push the
38869 // negation into a preceding instruction.
38870 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
38871 // If the RHS of the sub is a XOR with one use and a constant, invert the
38872 // immediate. Then add one to the LHS of the sub so we can turn
38873 // X-Y -> X+~Y+1, saving one register.
38874 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
38875 isa<ConstantSDNode>(Op1.getOperand(1))) {
38876 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
38877 EVT VT = Op0.getValueType();
38878 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
38879 Op1.getOperand(0),
38880 DAG.getConstant(~XorC, SDLoc(Op1), VT));
38881 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
38882 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
38886 // Try to synthesize horizontal subs from subs of shuffles.
38887 EVT VT = N->getValueType(0);
38888 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38889 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38890 isHorizontalBinOp(Op0, Op1, false))
38891 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
38893 if (SDValue V = combineIncDecVector(N, DAG))
38896 // Try to create PSUBUS if SUB's argument is max/min
38897 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
38900 return combineAddOrSubToADCOrSBB(N, DAG);
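// For illustration of the constant-LHS rewrite above (sketch):
//   (sub C, (xor Y, K))  -->  (add (xor Y, ~K), C + 1)
// e.g. 5 - (Y ^ 3) == (Y ^ ~3) + 6, using -(Y ^ K) == (Y ^ ~K) + 1, so the
// constant no longer has to be materialized as the first operand of a sub.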
38903 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
38904 TargetLowering::DAGCombinerInfo &DCI,
38905 const X86Subtarget &Subtarget) {
38906 if (DCI.isBeforeLegalize())
38910 unsigned Opcode = N->getOpcode();
38911 MVT VT = N->getSimpleValueType(0);
38912 MVT SVT = VT.getVectorElementType();
38913 unsigned NumElts = VT.getVectorNumElements();
38914 unsigned EltSizeInBits = SVT.getSizeInBits();
38916 SDValue Op = N->getOperand(0);
38917 MVT OpVT = Op.getSimpleValueType();
38918 MVT OpEltVT = OpVT.getVectorElementType();
38919 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
38920 unsigned InputBits = OpEltSizeInBits * NumElts;
38922 // Perform any constant folding.
38923 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
38925 SmallVector<APInt, 64> EltBits;
38926 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
38927 APInt Undefs(NumElts, 0);
38928 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
38930 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
38931 for (unsigned i = 0; i != NumElts; ++i) {
38932 if (UndefElts[i]) {
38936 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
38937 : EltBits[i].sextOrTrunc(EltSizeInBits);
38939 return getConstVector(Vals, Undefs, VT, DAG, DL);
38942 // (vzext (bitcast (vzext x))) -> (vzext x)
38943 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
38944 SDValue V = peekThroughBitcasts(Op);
38945 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
38946 MVT InnerVT = V.getSimpleValueType();
38947 MVT InnerEltVT = InnerVT.getVectorElementType();
38949 // If the element sizes match exactly, we can just do one larger vzext. This
38950 // is always an exact type match as vzext operates on integer types.
38951 if (OpEltVT == InnerEltVT) {
38952 assert(OpVT == InnerVT && "Types must match for vzext!");
38953 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
38956 // The only other way we can combine them is if only a single element of the
38957 // inner vzext is used in the input to the outer vzext.
38958 if (InnerEltVT.getSizeInBits() < InputBits)
38961 // In this case, the inner vzext is completely dead because we're going to
38962 // only look at bits inside of the low element. Just do the outer vzext on
38963 // a bitcast of the input to the inner.
38964 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
38967 // Check if we can bypass extracting and re-inserting an element of an input
38968 // vector. Essentially:
38969 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
38970 // TODO: Add X86ISD::VSEXT support
38971 if (Opcode == X86ISD::VZEXT &&
38972 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38973 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38974 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
38975 SDValue ExtractedV = V.getOperand(0);
38976 SDValue OrigV = ExtractedV.getOperand(0);
38977 if (isNullConstant(ExtractedV.getOperand(1))) {
38978 MVT OrigVT = OrigV.getSimpleValueType();
38979 // Extract a subvector if necessary...
38980 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
38981 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
38982 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
38983 OrigVT.getVectorNumElements() / Ratio);
38984 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
38985 DAG.getIntPtrConstant(0, DL));
38987 Op = DAG.getBitcast(OpVT, OrigV);
38988 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
38995 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
38996 const X86Subtarget &Subtarget) {
38997 MVT VT = N->getSimpleValueType(0);
39000 if (N->getOperand(0) == N->getOperand(1)) {
39001 if (N->getOpcode() == X86ISD::PCMPEQ)
39002 return getOnesVector(VT, DAG, DL);
39003 if (N->getOpcode() == X86ISD::PCMPGT)
39004 return getZeroVector(VT, Subtarget, DAG, DL);
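// For example (sketch):
//   (X86ISD::PCMPEQ X, X)  -->  all-ones vector   ; every lane equals itself
//   (X86ISD::PCMPGT X, X)  -->  zero vector       ; no lane exceeds itself
// so the common "pcmpeq same,same" idiom is visible to later combines as a
// plain constant.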
39010 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
39011 TargetLowering::DAGCombinerInfo &DCI,
39012 const X86Subtarget &Subtarget) {
39013 if (DCI.isBeforeLegalizeOps())
39016 MVT OpVT = N->getSimpleValueType(0);
39017 SDLoc dl(N);
39018 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
39021 SDValue Vec = N->getOperand(0);
39022 SDValue SubVec = N->getOperand(1);
39024 unsigned IdxVal = N->getConstantOperandVal(2);
39025 MVT SubVecVT = SubVec.getSimpleValueType();
39027 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
39028 // Inserting zeros into zeros is a nop.
39029 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39030 return getZeroVector(OpVT, Subtarget, DAG, dl);
39032 // If we're inserting into a zero vector and then into a larger zero vector,
39033 // just insert into the larger zero vector directly.
39034 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39035 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
39036 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
39037 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39038 getZeroVector(OpVT, Subtarget, DAG, dl),
39039 SubVec.getOperand(1),
39040 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
39043 // If we're inserting into a zero vector and our input was extracted from an
39044 // insert into a zero vector of the same type and the extraction was at
39045 // least as large as the original insertion, just insert the original
39046 // subvector into a zero vector.
39047 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
39048 SubVec.getConstantOperandVal(1) == 0 &&
39049 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
39050 SDValue Ins = SubVec.getOperand(0);
39051 if (Ins.getConstantOperandVal(2) == 0 &&
39052 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
39053 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
39054 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39055 getZeroVector(OpVT, Subtarget, DAG, dl),
39056 Ins.getOperand(1), N->getOperand(2));
39059 // If we're inserting a bitcast into zeros, rewrite the insert and move the
39060 // bitcast to the other side. This helps with detecting zero-extending
39061 // patterns during lowering.
39062 // TODO: Is this useful for other indices than 0?
39063 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
39064 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
39065 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
39066 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
39067 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
39068 DAG.getBitcast(NewVT, Vec),
39069 SubVec.getOperand(0), N->getOperand(2));
39070 return DAG.getBitcast(OpVT, Insert);
39074 // Stop here if this is an i1 vector.
39075 if (IsI1Vector)
39076 return SDValue();
39078 // If this is an insert of an extract, combine to a shuffle. Don't do this
39079 // if the insert or extract can be represented with a subregister operation.
39080 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39081 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
39082 (IdxVal != 0 || !Vec.isUndef())) {
39083 int ExtIdxVal = SubVec.getConstantOperandVal(1);
39084 if (ExtIdxVal != 0) {
39085 int VecNumElts = OpVT.getVectorNumElements();
39086 int SubVecNumElts = SubVecVT.getVectorNumElements();
39087 SmallVector<int, 64> Mask(VecNumElts);
39088 // First create an identity shuffle mask.
39089 for (int i = 0; i != VecNumElts; ++i)
39091 // Now insert the extracted portion.
39092 for (int i = 0; i != SubVecNumElts; ++i)
39093 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
39095 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
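// For example (sketch), with OpVT = v8i32 and a v4i32 subvector:
//   (insert_subvector Vec, (extract_subvector Src, 4), 0)
//   --> vector_shuffle<12,13,14,15,4,5,6,7> Vec, Src
// i.e. lanes 0-3 come from Src's upper half and lanes 4-7 are kept from Vec.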
39099 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
39100 // load:
39101 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39102 // (load16 addr + 16), Elts/2)
39103 // --> load32 addr
39104 // or:
39105 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39106 // (load32 addr + 32), Elts/2)
39107 // --> load64 addr
39108 // or a 16-byte or 32-byte broadcast:
39109 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39110 // (load16 addr), Elts/2)
39111 // --> X86SubVBroadcast(load16 addr)
39113 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39114 // (load32 addr), Elts/2)
39115 // --> X86SubVBroadcast(load32 addr)
39116 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
39117 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39118 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
39119 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
39120 if (Idx2 && Idx2->getZExtValue() == 0) {
39121 SDValue SubVec2 = Vec.getOperand(1);
39122 // If needed, look through bitcasts to get to the load.
39123 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
39125 unsigned Alignment = FirstLd->getAlignment();
39126 unsigned AS = FirstLd->getAddressSpace();
39127 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
39128 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
39129 OpVT, AS, Alignment, &Fast) && Fast) {
39130 SDValue Ops[] = {SubVec2, SubVec};
39131 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
39136 // If lower/upper loads are the same and the only users of the load, then
39137 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
39138 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
39139 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
39140 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
39141 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
39143 // If this is subv_broadcast insert into both halves, use a larger
39144 // subv_broadcast.
39145 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
39146 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
39147 SubVec.getOperand(0));
39149 // If we're inserting all zeros into the upper half, change this to
39150 // an insert into an all zeros vector. We will match this to a move
39151 // with implicit upper bit zeroing during isel.
39152 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39153 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39154 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
39155 Vec.getOperand(2));
39157 // If we are inserting into both halves of the vector, the starting
39158 // vector should be undef. If it isn't, make it so. Only do this if
39159 // the early insert has no other uses.
39160 // TODO: Should this be a generic DAG combine?
39161 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
39162 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
39163 SubVec2, Vec.getOperand(2));
39164 DCI.AddToWorklist(Vec.getNode());
39165 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
39175 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
39176 TargetLowering::DAGCombinerInfo &DCI,
39177 const X86Subtarget &Subtarget) {
39178 if (DCI.isBeforeLegalizeOps())
39181 MVT OpVT = N->getSimpleValueType(0);
39182 SDValue InVec = N->getOperand(0);
39183 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
39185 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
39186 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
39188 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
39189 if (OpVT.getScalarType() == MVT::i1)
39190 return DAG.getConstant(1, SDLoc(N), OpVT);
39191 return getOnesVector(OpVT, DAG, SDLoc(N));
39194 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
39195 return DAG.getBuildVector(
39196 OpVT, SDLoc(N),
39197 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
39202 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
39203 EVT VT = N->getValueType(0);
39204 SDValue Src = N->getOperand(0);
39206 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
39207 // This occurs frequently in our masked scalar intrinsic code and our
39208 // floating point select lowering with AVX512.
39209 // TODO: SimplifyDemandedBits instead?
39210 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
39211 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39212 if (C->getAPIntValue().isOneValue())
39213 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
39214 Src.getOperand(0));
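// For example (sketch), a typical masked-scalar sequence:
//   (v1i1 scalar_to_vector (and (i8 X), 1))
//   --> (v1i1 scalar_to_vector (i8 X))
// The AND only cleared bits that the v1i1 node ignores anyway (just bit 0 is
// used), so it can be dropped.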
39219 // Simplify PMULDQ and PMULUDQ operations.
39220 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
39221 TargetLowering::DAGCombinerInfo &DCI) {
39222 SDValue LHS = N->getOperand(0);
39223 SDValue RHS = N->getOperand(1);
39225 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39226 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
39227 !DCI.isBeforeLegalizeOps());
39228 APInt DemandedMask(APInt::getLowBitsSet(64, 32));
39230 // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element.
39231 KnownBits LHSKnown;
39232 if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
39233 DCI.CommitTargetLoweringOpt(TLO);
39234 return SDValue(N, 0);
39237 KnownBits RHSKnown;
39238 if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
39239 DCI.CommitTargetLoweringOpt(TLO);
39240 return SDValue(N, 0);
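// For illustration (sketch): PMULDQ/PMULUDQ read only the low 32 bits of each
// 64-bit lane, so in e.g.
//   (X86ISD::PMULUDQ (and X, splat 0x00000000FFFFFFFF), Y)
// the AND contributes nothing; the SimplifyDemandedBits calls above (with a
// low-32-bit demanded mask) let it be removed.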
39246 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
39247 DAGCombinerInfo &DCI) const {
39248 SelectionDAG &DAG = DCI.DAG;
39249 switch (N->getOpcode()) {
39251 case ISD::SCALAR_TO_VECTOR:
39252 return combineScalarToVector(N, DAG);
39253 case ISD::EXTRACT_VECTOR_ELT:
39254 case X86ISD::PEXTRW:
39255 case X86ISD::PEXTRB:
39256 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
39257 case ISD::INSERT_SUBVECTOR:
39258 return combineInsertSubvector(N, DAG, DCI, Subtarget);
39259 case ISD::EXTRACT_SUBVECTOR:
39260 return combineExtractSubvector(N, DAG, DCI, Subtarget);
39263 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
39264 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
39265 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
39266 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
39267 case ISD::SUB: return combineSub(N, DAG, Subtarget);
39268 case X86ISD::SBB: return combineSBB(N, DAG);
39269 case X86ISD::ADC: return combineADC(N, DAG, DCI);
39270 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
39273 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
39274 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
39275 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
39276 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
39277 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
39278 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
39279 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
39280 case ISD::STORE: return combineStore(N, DAG, Subtarget);
39281 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
39282 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
39283 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
39285 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
39286 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
39287 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
39288 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
39289 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
39290 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
39292 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
39294 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
39296 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
39297 case X86ISD::BT: return combineBT(N, DAG, DCI);
39298 case ISD::ANY_EXTEND:
39299 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
39300 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
39301 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
39302 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
39303 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
39304 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
39305 case X86ISD::PACKSS:
39306 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
39307 case X86ISD::VSHLI:
39308 case X86ISD::VSRAI:
39309 case X86ISD::VSRLI:
39310 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
39311 case ISD::SIGN_EXTEND_VECTOR_INREG:
39312 case ISD::ZERO_EXTEND_VECTOR_INREG:
39313 case X86ISD::VSEXT:
39314 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
39315 case X86ISD::PINSRB:
39316 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
39317 case X86ISD::SHUFP: // Handle all target specific shuffles
39318 case X86ISD::INSERTPS:
39319 case X86ISD::EXTRQI:
39320 case X86ISD::INSERTQI:
39321 case X86ISD::PALIGNR:
39322 case X86ISD::VSHLDQ:
39323 case X86ISD::VSRLDQ:
39324 case X86ISD::BLENDI:
39325 case X86ISD::UNPCKH:
39326 case X86ISD::UNPCKL:
39327 case X86ISD::MOVHLPS:
39328 case X86ISD::MOVLHPS:
39329 case X86ISD::PSHUFB:
39330 case X86ISD::PSHUFD:
39331 case X86ISD::PSHUFHW:
39332 case X86ISD::PSHUFLW:
39333 case X86ISD::MOVSHDUP:
39334 case X86ISD::MOVSLDUP:
39335 case X86ISD::MOVDDUP:
39336 case X86ISD::MOVSS:
39337 case X86ISD::MOVSD:
39338 case X86ISD::VBROADCAST:
39339 case X86ISD::VPPERM:
39340 case X86ISD::VPERMI:
39341 case X86ISD::VPERMV:
39342 case X86ISD::VPERMV3:
39343 case X86ISD::VPERMIL2:
39344 case X86ISD::VPERMILPI:
39345 case X86ISD::VPERMILPV:
39346 case X86ISD::VPERM2X128:
39347 case X86ISD::VZEXT_MOVL:
39348 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
39349 case X86ISD::FMADD_RND:
39350 case X86ISD::FMADDS1_RND:
39351 case X86ISD::FMADDS3_RND:
39352 case X86ISD::FMADDS1:
39353 case X86ISD::FMADDS3:
39354 case X86ISD::FMADD4S:
39355 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
39356 case X86ISD::FMADDSUB_RND:
39357 case X86ISD::FMSUBADD_RND:
39358 case X86ISD::FMADDSUB:
39359 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
39360 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
39361 case X86ISD::MGATHER:
39362 case X86ISD::MSCATTER:
39364 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
39365 case X86ISD::PCMPEQ:
39366 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
39367 case X86ISD::PMULDQ:
39368 case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI);
39374 /// Return true if the target has native support for the specified value type
39375 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
39376 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
39377 /// some i16 instructions are slow.
39378 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
39379 if (!isTypeLegal(VT))
39382 // There are no vXi8 shifts.
39383 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
39386 if (VT != MVT::i16)
39393 case ISD::SIGN_EXTEND:
39394 case ISD::ZERO_EXTEND:
39395 case ISD::ANY_EXTEND:
39408 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
39409 SDValue Value, SDValue Addr,
39410 SelectionDAG &DAG) const {
39411 const Module *M = DAG.getMachineFunction().getMMI().getModule();
39412 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
39413 if (IsCFProtectionSupported) {
39414 // In case control-flow branch protection is enabled, we need to add a
39415 // notrack prefix to the indirect branch.
39416 // In order to do that we create an NT_BRIND SDNode.
39417 // Upon ISEL, the pattern will convert it to a jmp with the NoTrack prefix.
39418 return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
39421 return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
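// For illustration (sketch): with the "cf-protection-branch" module flag set,
// an indirect jump-table branch such as
//   jmpq *%rax
// is emitted with the NoTrack prefix,
//   notrack jmpq *%rax
// so CET indirect-branch tracking does not demand an ENDBR at every
// jump-table target.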
39424 /// This method queries the target whether it is beneficial for dag combiner to
39425 /// promote the specified node. If true, it should return the desired promotion
39426 /// type by reference.
39427 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
39428 EVT VT = Op.getValueType();
39429 if (VT != MVT::i16)
39432 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
39433 if (!Op.hasOneUse())
39435 SDNode *User = *Op->use_begin();
39436 if (!ISD::isNormalStore(User))
39438 auto *Ld = cast<LoadSDNode>(Load);
39439 auto *St = cast<StoreSDNode>(User);
39440 return Ld->getBasePtr() == St->getBasePtr();
39443 bool Commute = false;
39444 switch (Op.getOpcode()) {
39445 default: return false;
39446 case ISD::SIGN_EXTEND:
39447 case ISD::ZERO_EXTEND:
39448 case ISD::ANY_EXTEND:
39452 SDValue N0 = Op.getOperand(0);
39453 // Look out for (store (shl (load), x)).
39454 if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
39466 SDValue N0 = Op.getOperand(0);
39467 SDValue N1 = Op.getOperand(1);
39468 // Avoid disabling potential load folding opportunities.
39469 if (MayFoldLoad(N1) &&
39470 (!Commute || !isa<ConstantSDNode>(N0) ||
39471 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
39473 if (MayFoldLoad(N0) &&
39474 ((Commute && !isa<ConstantSDNode>(N1)) ||
39475 (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
39484 bool X86TargetLowering::
39485 isDesirableToCombineBuildVectorToShuffleTruncate(
39486 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
39488 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
39489 "Element count mismatch");
39491 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
39492 "Shuffle Mask expected to be legal");
39494 // For 32-bit elements VPERMD is better than shuffle+truncate.
39495 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
39496 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
39499 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
39505 //===----------------------------------------------------------------------===//
39506 // X86 Inline Assembly Support
39507 //===----------------------------------------------------------------------===//
39509 // Helper to match a string separated by whitespace.
39510 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
39511 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
39513 for (StringRef Piece : Pieces) {
39514 if (!S.startswith(Piece)) // Check if the piece matches.
39517 S = S.substr(Piece.size());
39518 StringRef::size_type Pos = S.find_first_not_of(" \t");
39519 if (Pos == 0) // We matched a prefix.
39528 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
39530 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
39531 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
39532 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
39533 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
39535 if (AsmPieces.size() == 3)
39537 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
39544 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
39545 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
39547 const std::string &AsmStr = IA->getAsmString();
39549 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
39550 if (!Ty || Ty->getBitWidth() % 16 != 0)
39553 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
39554 SmallVector<StringRef, 4> AsmPieces;
39555 SplitString(AsmStr, AsmPieces, ";\n");
39557 switch (AsmPieces.size()) {
39558 default: return false;
39560 // FIXME: this should verify that we are targeting a 486 or better. If not,
39561 // we will turn this bswap into something that will be lowered to logical
39562 // ops instead of emitting the bswap asm. For now, we don't support 486 or
39563 // lower so don't worry about this.
39565 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
39566 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
39567 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
39568 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
39569 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
39570 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
39571 // No need to check constraints, nothing other than the equivalent of
39572 // "=r,0" would be valid here.
39573 return IntrinsicLowering::LowerToByteSwap(CI);
39576 // rorw $$8, ${0:w} --> llvm.bswap.i16
39577 if (CI->getType()->isIntegerTy(16) &&
39578 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39579 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
39580 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
39582 StringRef ConstraintsStr = IA->getConstraintString();
39583 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39584 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39585 if (clobbersFlagRegisters(AsmPieces))
39586 return IntrinsicLowering::LowerToByteSwap(CI);
39590 if (CI->getType()->isIntegerTy(32) &&
39591 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39592 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
39593 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
39594 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
39596 StringRef ConstraintsStr = IA->getConstraintString();
39597 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39598 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39599 if (clobbersFlagRegisters(AsmPieces))
39600 return IntrinsicLowering::LowerToByteSwap(CI);
39603 if (CI->getType()->isIntegerTy(64)) {
39604 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
39605 if (Constraints.size() >= 2 &&
39606 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
39607 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
39608 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
39609 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
39610 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
39611 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
39612 return IntrinsicLowering::LowerToByteSwap(CI);
39620 /// Given a constraint letter, return the type of constraint for this target.
39621 X86TargetLowering::ConstraintType
39622 X86TargetLowering::getConstraintType(StringRef Constraint) const {
39623 if (Constraint.size() == 1) {
39624 switch (Constraint[0]) {
39636 case 'k': // AVX512 masking registers.
39637 return C_RegisterClass;
39661 else if (Constraint.size() == 2) {
39662 switch (Constraint[0]) {
39666 switch (Constraint[1]) {
39677 return C_RegisterClass;
39681 return TargetLowering::getConstraintType(Constraint);
39684 /// Examine constraint type and operand type and determine a weight value.
39685 /// This object must already have been set up with the operand type
39686 /// and the current alternative constraint selected.
39687 TargetLowering::ConstraintWeight
39688 X86TargetLowering::getSingleConstraintMatchWeight(
39689 AsmOperandInfo &info, const char *constraint) const {
39690 ConstraintWeight weight = CW_Invalid;
39691 Value *CallOperandVal = info.CallOperandVal;
39692 // If we don't have a value, we can't do a match,
39693 // but allow it at the lowest weight.
39694 if (!CallOperandVal)
39696 Type *type = CallOperandVal->getType();
39697 // Look at the constraint type.
39698 switch (*constraint) {
39700 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
39712 if (CallOperandVal->getType()->isIntegerTy())
39713 weight = CW_SpecificReg;
39718 if (type->isFloatingPointTy())
39719 weight = CW_SpecificReg;
39722 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39723 weight = CW_SpecificReg;
39726 unsigned Size = StringRef(constraint).size();
39727 // Pick 'i' as the next char, as 'Yi' and 'Y' are synonymous when matching 'Y'.
39728 char NextChar = Size == 2 ? constraint[1] : 'i';
39731 switch (NextChar) {
39737 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
39738 return CW_SpecificReg;
39740 // Conditional OpMask regs (AVX512)
39742 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39743 return CW_Register;
39747 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39750 // Any SSE reg when ISA >= SSE2, same as 'Y'
39754 if (!Subtarget.hasSSE2())
39758 // Fall through (handle "Y" constraint).
39762 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
39763 weight = CW_Register;
39766 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
39767 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
39768 weight = CW_Register;
39771 // Enable conditional vector operations using %k<#> registers.
39772 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39773 weight = CW_Register;
39776 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
39777 if (C->getZExtValue() <= 31)
39778 weight = CW_Constant;
39782 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39783 if (C->getZExtValue() <= 63)
39784 weight = CW_Constant;
39788 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39789 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
39790 weight = CW_Constant;
39794 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39795 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
39796 weight = CW_Constant;
39800 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39801 if (C->getZExtValue() <= 3)
39802 weight = CW_Constant;
39806 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39807 if (C->getZExtValue() <= 0xff)
39808 weight = CW_Constant;
39813 if (isa<ConstantFP>(CallOperandVal)) {
39814 weight = CW_Constant;
39818 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39819 if ((C->getSExtValue() >= -0x80000000LL) &&
39820 (C->getSExtValue() <= 0x7fffffffLL))
39821 weight = CW_Constant;
39825 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39826 if (C->getZExtValue() <= 0xffffffff)
39827 weight = CW_Constant;
39834 /// Try to replace an X constraint, which matches anything, with another that
39835 /// has more specific requirements based on the type of the corresponding
39836 /// operand.
39837 const char *X86TargetLowering::
39838 LowerXConstraint(EVT ConstraintVT) const {
39839 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
39840 // 'f' like normal targets.
39841 if (ConstraintVT.isFloatingPoint()) {
39842 if (Subtarget.hasSSE2())
39844 if (Subtarget.hasSSE1())
39848 return TargetLowering::LowerXConstraint(ConstraintVT);
39851 /// Lower the specified operand into the Ops vector.
39852 /// If it is invalid, don't add anything to Ops.
39853 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
39854 std::string &Constraint,
39855 std::vector<SDValue>&Ops,
39856 SelectionDAG &DAG) const {
39857 SDValue Result;
39859 // Only support length 1 constraints for now.
39860 if (Constraint.length() > 1) return;
39862 char ConstraintLetter = Constraint[0];
39863 switch (ConstraintLetter) {
39866 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39867 if (C->getZExtValue() <= 31) {
39868 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39869 Op.getValueType());
39875 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39876 if (C->getZExtValue() <= 63) {
39877 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39878 Op.getValueType());
39884 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39885 if (isInt<8>(C->getSExtValue())) {
39886 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39887 Op.getValueType());
39893 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39894 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
39895 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
39896 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
39897 Op.getValueType());
39903 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39904 if (C->getZExtValue() <= 3) {
39905 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39906 Op.getValueType());
39912 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39913 if (C->getZExtValue() <= 255) {
39914 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39915 Op.getValueType());
39921 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39922 if (C->getZExtValue() <= 127) {
39923 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39924 Op.getValueType());
39930 // 32-bit signed value
39931 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39932 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
39933 C->getSExtValue())) {
39934 // Widen to 64 bits here to get it sign extended.
39935 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
39938 // FIXME gcc accepts some relocatable values here too, but only in certain
39939 // memory models; it's complicated.
39944 // 32-bit unsigned value
39945 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39946 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
39947 C->getZExtValue())) {
39948 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39949 Op.getValueType());
39953 // FIXME gcc accepts some relocatable values here too, but only in certain
39954 // memory models; it's complicated.
39958 // Literal immediates are always ok.
39959 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
39960 // Widen to 64 bits here to get it sign extended.
39961 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
39965 // In any sort of PIC mode addresses need to be computed at runtime by
39966 // adding in a register or some sort of table lookup. These can't
39967 // be used as immediates.
39968 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
39971 // If we are in non-pic codegen mode, we allow the address of a global (with
39972 // an optional displacement) to be used with 'i'.
39973 GlobalAddressSDNode *GA = nullptr;
39974 int64_t Offset = 0;
39976 // Match either (GA), (GA+C), (GA+C1+C2), etc.
39978 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
39979 Offset += GA->getOffset();
39981 } else if (Op.getOpcode() == ISD::ADD) {
39982 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
39983 Offset += C->getZExtValue();
39984 Op = Op.getOperand(0);
39987 } else if (Op.getOpcode() == ISD::SUB) {
39988 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
39989 Offset += -C->getZExtValue();
39990 Op = Op.getOperand(0);
39995 // Otherwise, this isn't something we can handle, reject it.
39999 const GlobalValue *GV = GA->getGlobal();
40000 // If we require an extra load to get this address, as in PIC mode, we
40001 // can't accept it.
40002 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
40005 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
40006 GA->getValueType(0), Offset);
40011 if (Result.getNode()) {
40012 Ops.push_back(Result);
40015 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
40018 /// Check if \p RC is a general purpose register class.
40019 /// I.e., GR* or one of their variants.
40020 static bool isGRClass(const TargetRegisterClass &RC) {
40021 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
40022 RC.hasSuperClassEq(&X86::GR16RegClass) ||
40023 RC.hasSuperClassEq(&X86::GR32RegClass) ||
40024 RC.hasSuperClassEq(&X86::GR64RegClass) ||
40025 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
40028 /// Check if \p RC is a vector register class.
40029 /// I.e., FR* / VR* or one of their variants.
40030 static bool isFRClass(const TargetRegisterClass &RC) {
40031 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
40032 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
40033 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
40034 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
40035 RC.hasSuperClassEq(&X86::VR512RegClass);
40038 std::pair<unsigned, const TargetRegisterClass *>
40039 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
40040 StringRef Constraint,
40042 // First, see if this is a constraint that directly corresponds to an LLVM
40044 if (Constraint.size() == 1) {
40045 // GCC Constraint Letters
40046 switch (Constraint[0]) {
40048 // TODO: Slight differences here in allocation order and leaving
40049 // RIP in the class. Do they matter any more here than they do
40050 // in the normal allocation?
40052 case 'k': if (Subtarget.hasAVX512()) {
40053 // Only supported in AVX512 or later.
40054 switch (VT.SimpleTy) {
40057 case MVT::i32: return std::make_pair(0U, &X86::VK32RegClass);
40059 case MVT::i16: return std::make_pair(0U, &X86::VK16RegClass);
40061 case MVT::i8:  return std::make_pair(0U, &X86::VK8RegClass);
40063 case MVT::i1:  return std::make_pair(0U, &X86::VK1RegClass);
40065 case MVT::i64: return std::make_pair(0U, &X86::VK64RegClass);
40069 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
40070 if (Subtarget.is64Bit()) {
40071 if (VT == MVT::i32 || VT == MVT::f32)
40072 return std::make_pair(0U, &X86::GR32RegClass);
40073 if (VT == MVT::i16)
40074 return std::make_pair(0U, &X86::GR16RegClass);
40075 if (VT == MVT::i8 || VT == MVT::i1)
40076 return std::make_pair(0U, &X86::GR8RegClass);
40077 if (VT == MVT::i64 || VT == MVT::f64)
40078 return std::make_pair(0U, &X86::GR64RegClass);
40082 // In 32-bit mode, fall through to the 'Q' (ABCD register) handling below.
40083 case 'Q': // Q_REGS
40084 if (VT == MVT::i32 || VT == MVT::f32)
40085 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
40086 if (VT == MVT::i16)
40087 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
40088 if (VT == MVT::i8 || VT == MVT::i1)
40089 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
40090 if (VT == MVT::i64)
40091 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
40093 case 'r': // GENERAL_REGS
40094 case 'l': // INDEX_REGS
40095 if (VT == MVT::i8 || VT == MVT::i1)
40096 return std::make_pair(0U, &X86::GR8RegClass);
40097 if (VT == MVT::i16)
40098 return std::make_pair(0U, &X86::GR16RegClass);
40099 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
40100 return std::make_pair(0U, &X86::GR32RegClass);
40101 return std::make_pair(0U, &X86::GR64RegClass);
40102 case 'R': // LEGACY_REGS
40103 if (VT == MVT::i8 || VT == MVT::i1)
40104 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
40105 if (VT == MVT::i16)
40106 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
40107 if (VT == MVT::i32 || !Subtarget.is64Bit())
40108 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
40109 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
40110 case 'f': // FP Stack registers.
40111 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
40112 // value to the correct fpstack register class.
40113 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
40114 return std::make_pair(0U, &X86::RFP32RegClass);
40115 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
40116 return std::make_pair(0U, &X86::RFP64RegClass);
40117 return std::make_pair(0U, &X86::RFP80RegClass);
40118 case 'y': // MMX_REGS if MMX allowed.
40119 if (!Subtarget.hasMMX()) break;
40120 return std::make_pair(0U, &X86::VR64RegClass);
40121 case 'Y': // SSE_REGS if SSE2 allowed
40122 if (!Subtarget.hasSSE2()) break;
40125 case 'v': case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
40126 if (!Subtarget.hasSSE1()) break;
40127 bool VConstraint = (Constraint[0] == 'v');
40129 switch (VT.SimpleTy) {
40131 // Scalar SSE types.
40134 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
40135 return std::make_pair(0U, &X86::FR32XRegClass);
40136 return std::make_pair(0U, &X86::FR32RegClass);
40139 if (VConstraint && Subtarget.hasVLX())
40140 return std::make_pair(0U, &X86::FR64XRegClass);
40141 return std::make_pair(0U, &X86::FR64RegClass);
40142 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
40150 if (VConstraint && Subtarget.hasVLX())
40151 return std::make_pair(0U, &X86::VR128XRegClass);
40152 return std::make_pair(0U, &X86::VR128RegClass);
40160 if (VConstraint && Subtarget.hasVLX())
40161 return std::make_pair(0U, &X86::VR256XRegClass);
40162 return std::make_pair(0U, &X86::VR256RegClass);
40167 return std::make_pair(0U, &X86::VR512RegClass);
40171 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
40172 switch (Constraint[1]) {
40178 return getRegForInlineAsmConstraint(TRI, "Y", VT);
40180 case 'm': if (!Subtarget.hasMMX()) break;
40181 return std::make_pair(0U, &X86::VR64RegClass);
40184 case 'z': case '0': if (!Subtarget.hasSSE1()) break;
40185 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
40187 case 'k': // This register class doesn't allocate k0 for masked vector operations.
40188 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
40189 switch (VT.SimpleTy) {
40192 case MVT::i32: return std::make_pair(0U, &X86::VK32WMRegClass);
40194 case MVT::i16: return std::make_pair(0U, &X86::VK16WMRegClass);
40196 case MVT::i8:  return std::make_pair(0U, &X86::VK8WMRegClass);
40198 case MVT::i1:  return std::make_pair(0U, &X86::VK1WMRegClass);
40200 case MVT::i64: return std::make_pair(0U, &X86::VK64WMRegClass);
40207 // Use the default implementation in TargetLowering to convert the register
40208 // constraint into a member of a register class.
40209 std::pair<unsigned, const TargetRegisterClass*> Res;
40210 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
40212 // Not found as a standard register?
40214 // Map st(0) -> st(7) -> ST0
40215 if (Constraint.size() == 7 && Constraint[0] == '{' &&
40216 tolower(Constraint[1]) == 's' &&
40217 tolower(Constraint[2]) == 't' &&
40218 Constraint[3] == '(' &&
40219 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
40220 Constraint[5] == ')' &&
40221 Constraint[6] == '}') {
40223 Res.first = X86::FP0+Constraint[4]-'0';
40224 Res.second = &X86::RFP80RegClass;
40228 // GCC allows "st(0)" to be called just plain "st".
40229 if (StringRef("{st}").equals_lower(Constraint)) {
40230 Res.first = X86::FP0;
40231 Res.second = &X86::RFP80RegClass;
40236 if (StringRef("{flags}").equals_lower(Constraint)) {
40237 Res.first = X86::EFLAGS;
40238 Res.second = &X86::CCRRegClass;
40242 // 'A' means [ER]AX + [ER]DX.
40243 if (Constraint == "A") {
40244 if (Subtarget.is64Bit()) {
40245 Res.first = X86::RAX;
40246 Res.second = &X86::GR64_ADRegClass;
40248 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
40249 "Expecting 64, 32 or 16 bit subtarget");
40250 Res.first = X86::EAX;
40251 Res.second = &X86::GR32_ADRegClass;
40258 // Make sure it isn't a register that requires 64-bit mode.
40259 if (!Subtarget.is64Bit() &&
40260 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
40261 TRI->getEncodingValue(Res.first) >= 8) {
40262 // Register requires REX prefix, but we're in 32-bit mode.
40264 Res.second = nullptr;
40268 // Make sure it isn't a register that requires AVX512.
40269 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
40270 TRI->getEncodingValue(Res.first) & 0x10) {
40271 // Register requires EVEX prefix.
40273 Res.second = nullptr;
40277 // Otherwise, check to see if this is a register class of the wrong value
40278 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
40279 // turn into {ax},{dx}.
40280 // MVT::Other is used to specify clobber names.
40281 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
40282 return Res; // Correct type already, nothing to do.
40284 // Get a matching integer of the correct size. E.g., "ax" with MVT::i32 should
40285 // return "eax". This should even work for things like getting 64-bit integer
40286 // registers when given an f64 type.
40287 const TargetRegisterClass *Class = Res.second;
40288 // The generic code will match the first register class that contains the
40289 // given register. Thus, based on the ordering of the tablegened file,
40290 // the "plain" GR classes might not come first.
40291 // Therefore, use a helper method.
40292 if (isGRClass(*Class)) {
40293 unsigned Size = VT.getSizeInBits();
40294 if (Size == 1) Size = 8;
40295 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
40297 bool is64Bit = Subtarget.is64Bit();
40298 const TargetRegisterClass *RC =
40299 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
40300 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
40301 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
40302 : &X86::GR64RegClass;
40303 if (RC->contains(DestReg))
40304 Res = std::make_pair(DestReg, RC);
40306 // No register found/type mismatch.
40308 Res.second = nullptr;
40310 } else if (isFRClass(*Class)) {
40311 // Handle references to XMM physical registers that got mapped into the
40312 // wrong class. This can happen with constraints like {xmm0} where the
40313 // target independent register mapper will just pick the first match it can
40314 // find, ignoring the required type.
40316 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
40317 if (VT == MVT::f32 || VT == MVT::i32)
40318 Res.second = &X86::FR32RegClass;
40319 else if (VT == MVT::f64 || VT == MVT::i64)
40320 Res.second = &X86::FR64RegClass;
40321 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
40322 Res.second = &X86::VR128RegClass;
40323 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
40324 Res.second = &X86::VR256RegClass;
40325 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
40326 Res.second = &X86::VR512RegClass;
40328 // Type mismatch and not a clobber: return an error.
40330 Res.second = nullptr;
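// Illustrative IR-level examples (hypothetical, not from this file) of
// constraint strings that exercise the special cases above:
//   call i32 asm "incl $0", "={ax},0"(i32 %x)    ; "{ax}" with i32 is resized
//                                                ; to EAX by the GR fix-up
//   call double asm sideeffect "fld1", "={st}"() ; "{st}" maps to FP0/RFP80
//   call i64 asm "rdtsc", "=A"()                 ; 'A' selects [ER]AX+[ER]DX
//                                                ; (classic i386 rdtsc idiom)
// These only show which paths are taken; the actual register-class choice is
// made by the code above.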
40337 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
40338 const AddrMode &AM, Type *Ty,
40339 unsigned AS) const {
40340 // Scaling factors are not free at all.
40341 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
40342 // will take 2 allocations in the out of order engine instead of 1
40343 // for plain addressing mode, i.e. inst (reg1).
40345 // vaddps (%rsi,%rdx), %ymm0, %ymm1
40346 // Requires two allocations (one for the load, one for the computation)
40348 // vaddps (%rsi), %ymm0, %ymm1
40349 // Requires just 1 allocation, i.e., freeing allocations for other operations
40350 // and having less micro operations to execute.
40352 // For some X86 architectures, this is even worse because for instance for
40353 // stores, the complex addressing mode forces the instruction to use the
40354 // "load" ports instead of the dedicated "store" port.
40355 // E.g., on Haswell:
40356 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
40357 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
40358 if (isLegalAddressingMode(DL, AM, Ty, AS))
40359 // Scale represents reg2 * scale, thus account for 1
40360 // as soon as we use a second register.
40361 return AM.Scale != 0;
40365 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
40366 // Integer division on x86 is expensive. However, when aggressively optimizing
40367 // for code size, we prefer to use a div instruction, as it is usually smaller
40368 // than the alternative sequence.
40369 // The exception to this is vector division. Since x86 doesn't have vector
40370 // integer division, leaving the division as-is is a loss even in terms of
40371 // size, because it will have to be scalarized, while the alternative code
40372 // sequence can be performed in vector form.
40374 bool OptSize = Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
40375 return OptSize && !VT.isVector();
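// For illustration only (approximate asm, not taken from this file): an
// unsigned divide by 10 is normally expanded to a multiply by a magic
// constant, roughly
//   movl $0xCCCCCCCD, %ecx ; mull %ecx ; shrl $3, %edx   ; quotient in %edx
// which is faster but larger than the straightforward
//   xorl %edx, %edx ; divl %ecx
// so under minsize we keep the div for scalar types, while vector divides
// are still expanded because they would otherwise have to be scalarized.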
40378 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
40379 if (!Subtarget.is64Bit()) return;
40382 // Update IsSplitCSR in X86MachineFunctionInfo.
40383 X86MachineFunctionInfo *AFI =
40384 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
40385 AFI->setIsSplitCSR(true);
40388 void X86TargetLowering::insertCopiesSplitCSR(
40389 MachineBasicBlock *Entry,
40390 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
40391 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
40392 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
40396 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
40397 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
40398 MachineBasicBlock::iterator MBBI = Entry->begin();
40399 for (const MCPhysReg *I = IStart; *I; ++I) {
40400 const TargetRegisterClass *RC = nullptr;
40401 if (X86::GR64RegClass.contains(*I))
40402 RC = &X86::GR64RegClass;
40404 else llvm_unreachable("Unexpected register class in CSRsViaCopy!");
40406 unsigned NewVR = MRI->createVirtualRegister(RC);
40407 // Create copy from CSR to a virtual register.
40408 // FIXME: this currently does not emit CFI pseudo-instructions, it works
40409 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
40410 // nounwind. If we want to generalize this later, we may need to emit
40411 // CFI pseudo-instructions.
40412 assert(Entry->getParent()->getFunction().hasFnAttribute(
40413 Attribute::NoUnwind) &&
40414 "Function should be nounwind in insertCopiesSplitCSR!");
40415 Entry->addLiveIn(*I);
40416 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR).addReg(*I);
40419 // Insert the copy-back instructions right before the terminator.
40420 for (auto *Exit : Exits)
40421 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
40422 TII->get(TargetOpcode::COPY), *I).addReg(NewVR);
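// Minimal illustration (hypothetical IR, not from this file): split-CSR
// copies are used for CXX_FAST_TLS access functions such as
//   define cxx_fast_tlscc i8* @_ZTW3var() nounwind { ... }
// For each callee-saved GPR the target reports via getCalleeSavedRegsViaCopy
// (e.g. a GR64 such as $rbx), the entry block gets
//   %vreg = COPY $rbx
// and every exit block gets the matching
//   $rbx = COPY %vreg
// in place of the usual prologue/epilogue spill and reload.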
40427 bool X86TargetLowering::supportSwiftError() const {
40428 return Subtarget.is64Bit();
40431 /// Returns the name of the symbol used to emit stack probes or the empty
40432 /// string if not applicable.
40433 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
40434 // If the function specifically requests stack probes, emit them.
40435 if (MF.getFunction().hasFnAttribute("probe-stack"))
40436 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
40438 // Generally, if we aren't on Windows, the platform ABI does not include
40439 // support for stack probes, so don't emit them.
40440 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
40441 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
40444 // We need a stack probe to conform to the Windows ABI. Choose the right symbol.
40446 if (Subtarget.is64Bit())
40447 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
40448 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
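// Illustrative mapping (hypothetical targets, not from this file) of the
// selection above:
//   x86_64 MSVC targets          -> "__chkstk"
//   x86_64 MinGW/Cygwin targets  -> "___chkstk_ms"
//   i686 MSVC targets            -> "_chkstk"
//   i686 MinGW/Cygwin targets    -> "_alloca"
// and a function can request a custom probe symbol with, e.g.
//   define void @f() "probe-stack"="__my_probe" { ... }
// where "__my_probe" is a placeholder name.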