//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//

#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"

using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}

X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
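
  // Throughout this constructor each (operation, type) pair is assigned one
  // of the TargetLowering legalize actions: Legal (selectable as-is), Promote
  // (performed in a wider type), Expand (rewritten by the legalizer in terms
  // of other operations or a libcall), or Custom (handled by this target's
  // LowerOperation/ReplaceNodeResults hooks).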

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
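
  // addBypassSlowDiv(32, 8) arranges for a run-time check so that a 32-bit
  // divide whose operands happen to fit in 8 bits is done with the much
  // cheaper 8-bit DIV; likewise 64-bit divides can fall back to 32-bit ones.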

  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
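
  // UCOMISS/UCOMISD and FUCOMI report "unordered" through PF, so an
  // ordered-equal or unordered-not-equal test needs two flag checks (roughly
  // SETE plus SETNP); marking these condition codes Expand lets the legalizer
  // split such compares into two setcc nodes.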

  // Integer absolute.
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  } else
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Expand);

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Expand);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
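
  // FLT_ROUNDS_ is custom lowered by reading the rounding-mode bits out of
  // the x87 control word (FNSTCW); see LowerFLT_ROUNDS_ further down in this
  // file.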

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
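
  // With the actions above, f16 is never operated on directly: half loads and
  // stores are expanded into an integer memory access plus an explicit
  // FP16_TO_FP/FP_TO_FP16 conversion, and without F16C that conversion in
  // turn becomes a library call.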

  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);

  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
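
  // Under the SysV x86-64 ABI a va_list is a struct (gp_offset, fp_offset,
  // overflow_arg_area, reg_save_area), so VAARG/VACOPY need custom lowering;
  // on 32-bit targets a va_list is just a pointer and the default expansion
  // is sufficient.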

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);

  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0));  // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0));  // FLD0
    addLegalFPImmediate(APFloat(+1.0));  // FLD1
    addLegalFPImmediate(APFloat(-0.0));  // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0));  // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
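
  // addLegalFPImmediate marks FP constants that can be materialized directly
  // (FLD0/FLD1, optionally negated with FCHS, or an XORPS/XORPD zero) rather
  // than being loaded from the constant pool.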

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }

  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }

  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }
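
    // SSE2 only provides PMAXSW/PMINSW (signed v8i16) and PMAXUB/PMINUB
    // (unsigned v16i8); every other element type/signedness combination is
    // emulated with a compare-and-select sequence, hence the Legal/Custom
    // split above.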

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Provide custom widening for v2f32 setcc. This is really for VLX when
    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    // type legalization changing the result type to v4i1 during widening.
    // It works fine for SSE2 and is probably faster so no need to qualify with
    // VLX support.
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
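
    // There is only one set of full-width vector logic/move instructions
    // (PAND/POR/PXOR and MOVDQA operate on the whole XMM register), so the
    // narrower integer element types are simply bitcast to v2i64 for these
    // operations.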

    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::ROTL, MVT::v8i32, Custom);
    setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
    setOperationAction(ISD::ROTL, MVT::v32i8, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);

      // The condition codes aren't legal in SSE/AVX and under AVX512 we use
      // setcc all the way to isel and prefer SETGT in some isel patterns.
      setCondCodeAction(ISD::SETLT, VT, Custom);
      setCondCodeAction(ISD::SETLE, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

    // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
    // when we have a 256bit-wide blend with immediate.
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

    // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX2()) {
    // Custom legalize 2x32 to get a little better code.
    setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
    setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MGATHER, VT, Custom);
  }

  // This block controls legalization of the mask vector sizes that are
  // available with AVX512. 512-bit vectors are in a separate block controlled
  // by useAVX512Regs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

    // There is no byte sized k-register load or store without AVX512DQ.
    if (!Subtarget.hasDQI()) {
      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
    }

    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }
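
  // Note: the vXi1 types registered above live in AVX-512 mask (k) registers.
  // Most arithmetic on them is marked Custom so it can be rewritten into
  // mask-friendly forms (for example, add/sub over i1 elements reduce to xor
  // and mul reduces to and).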

  // This block controls legalization for 512-bit operations with 32/64 bit
  // elements. 512-bits can be disabled based on prefer-vector-width and
  // required-vector-width function attributes.
  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
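
    // These truncating stores map directly onto the AVX-512 down-converting
    // store instructions (VPMOVQB/VPMOVQW/VPMOVQD and VPMOVDB/VPMOVDW).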

    if (!Subtarget.hasVLX()) {
      // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
      // to 512-bit rather than use the AVX2 instructions so that we can use
      // k-masks.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1340 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1341 setOperationAction(ISD::SMAX, VT, Legal);
1342 setOperationAction(ISD::UMAX, VT, Legal);
1343 setOperationAction(ISD::SMIN, VT, Legal);
1344 setOperationAction(ISD::UMIN, VT, Legal);
1345 setOperationAction(ISD::ABS, VT, Legal);
1346 setOperationAction(ISD::SRL, VT, Custom);
1347 setOperationAction(ISD::SHL, VT, Custom);
1348 setOperationAction(ISD::SRA, VT, Custom);
1349 setOperationAction(ISD::CTPOP, VT, Custom);
1350 setOperationAction(ISD::CTTZ, VT, Custom);
1351 setOperationAction(ISD::ROTL, VT, Custom);
1352 setOperationAction(ISD::ROTR, VT, Custom);
1353 setOperationAction(ISD::SETCC, VT, Custom);
1355 // The condition codes aren't legal in SSE/AVX and under AVX512 we use
1356 // setcc all the way to isel and prefer SETGT in some isel patterns.
1357 setCondCodeAction(ISD::SETLT, VT, Custom);
1358 setCondCodeAction(ISD::SETLE, VT, Custom);
1361 // Need to promote to 64-bit even though we have 32-bit masked instructions
1362 // because the IR optimizers rearrange bitcasts around logic ops leaving
1363 // too many variations to handle if we don't promote them.
1364 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1365 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1366 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1368 if (Subtarget.hasDQI()) {
1369 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1370 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1371 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1372 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1374 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1377 if (Subtarget.hasCDI()) {
1378 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1379 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1380 setOperationAction(ISD::CTLZ, VT, Legal);
1381 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1383 } // Subtarget.hasCDI()
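// AVX512VPOPCNTDQ supplies native population counts for 32/64-bit elements.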
1385 if (Subtarget.hasVPOPCNTDQ()) {
1386 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1387 setOperationAction(ISD::CTPOP, VT, Legal);
1390 // Extract subvector is special because the value type
1391 // (result) is 256-bit but the source is 512-bit wide.
1392 // 128-bit was made Legal under AVX1.
1393 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1394 MVT::v8f32, MVT::v4f64 })
1395 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1397 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1399 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1401 setOperationAction(ISD::VSELECT, VT, Custom);
1402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1403 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1404 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1405 setOperationAction(ISD::MLOAD, VT, Legal);
1406 setOperationAction(ISD::MSTORE, VT, Legal);
1407 setOperationAction(ISD::MGATHER, VT, Custom);
1408 setOperationAction(ISD::MSCATTER, VT, Custom);
1410 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1412 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1415 // Need to custom split v32i16/v64i8 bitcasts.
1416 if (!Subtarget.hasBWI()) {
1417 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1418 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1422 // This block controls legalization for operations that don't have
1423 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for
1424 // 128/256-bit types.
1425 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1426 // These operations are handled on non-VLX by artificially widening in
1427 // isel patterns.
1428 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1430 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1431 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1432 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1433 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1434 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1436 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1437 setOperationAction(ISD::SMAX, VT, Legal);
1438 setOperationAction(ISD::UMAX, VT, Legal);
1439 setOperationAction(ISD::SMIN, VT, Legal);
1440 setOperationAction(ISD::UMIN, VT, Legal);
1441 setOperationAction(ISD::ABS, VT, Legal);
1444 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1445 setOperationAction(ISD::ROTL, VT, Custom);
1446 setOperationAction(ISD::ROTR, VT, Custom);
1449 // Custom legalize 2x32 to get a little better code.
1450 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1451 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1453 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1454 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1455 setOperationAction(ISD::MSCATTER, VT, Custom);
1457 if (Subtarget.hasDQI()) {
1458 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1459 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1460 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1461 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1462 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1464 setOperationAction(ISD::MUL, VT, Legal);
1468 if (Subtarget.hasCDI()) {
1469 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1470 setOperationAction(ISD::CTLZ, VT, Legal);
1471 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1473 } // Subtarget.hasCDI()
1475 if (Subtarget.hasVPOPCNTDQ()) {
1476 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1477 setOperationAction(ISD::CTPOP, VT, Legal);
1481 // This block controls legalization of v32i1/v64i1, which are available with
1482 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with
1483 // useBWIRegs.
1484 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1485 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1486 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1488 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1489 setOperationAction(ISD::ADD, VT, Custom);
1490 setOperationAction(ISD::SUB, VT, Custom);
1491 setOperationAction(ISD::MUL, VT, Custom);
1492 setOperationAction(ISD::VSELECT, VT, Expand);
1494 setOperationAction(ISD::TRUNCATE, VT, Custom);
1495 setOperationAction(ISD::SETCC, VT, Custom);
1496 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1497 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1498 setOperationAction(ISD::SELECT, VT, Custom);
1499 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1500 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1503 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1505 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1507 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1508 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1510 // Extends from v32i1 masks to 256-bit vectors.
1511 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1512 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1513 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
1516 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1517 // disabled based on prefer-vector-width and required-vector-width function
1518 // attributes.
1519 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1520 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1521 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1523 // Extends from v64i1 masks to 512-bit vectors.
1524 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1525 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1526 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1528 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1529 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1530 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1531 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1532 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1533 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1534 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1535 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1536 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1537 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1538 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1539 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1540 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1541 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1542 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1543 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1544 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1545 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1546 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1547 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1548 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1549 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1550 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1552 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1554 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1556 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1557 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1558 setOperationAction(ISD::VSELECT, VT, Custom);
1559 setOperationAction(ISD::ABS, VT, Legal);
1560 setOperationAction(ISD::SRL, VT, Custom);
1561 setOperationAction(ISD::SHL, VT, Custom);
1562 setOperationAction(ISD::SRA, VT, Custom);
1563 setOperationAction(ISD::MLOAD, VT, Legal);
1564 setOperationAction(ISD::MSTORE, VT, Legal);
1565 setOperationAction(ISD::CTPOP, VT, Custom);
1566 setOperationAction(ISD::CTTZ, VT, Custom);
1567 setOperationAction(ISD::CTLZ, VT, Custom);
1568 setOperationAction(ISD::SMAX, VT, Legal);
1569 setOperationAction(ISD::UMAX, VT, Legal);
1570 setOperationAction(ISD::SMIN, VT, Legal);
1571 setOperationAction(ISD::UMIN, VT, Legal);
1572 setOperationAction(ISD::SETCC, VT, Custom);
1574 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1575 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1576 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1579 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1580 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
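// AVX512BITALG supplies VPOPCNTB/VPOPCNTW, so byte/word popcounts are legal.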
1583 if (Subtarget.hasBITALG()) {
1584 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1585 setOperationAction(ISD::CTPOP, VT, Legal);
1589 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1590 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1591 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1592 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1595 // These operations are handled on non-VLX by artificially widening in
1596 // isel patterns.
1597 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1599 if (Subtarget.hasBITALG()) {
1600 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1601 setOperationAction(ISD::CTPOP, VT, Legal);
1605 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1606 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1607 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1608 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1609 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1610 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1612 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1613 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1614 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1615 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1616 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1618 if (Subtarget.hasDQI()) {
1619 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1620 // v2f32 UINT_TO_FP is already custom under SSE2.
1621 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1622 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1623 "Unexpected operation action!");
1624 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1625 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1626 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1629 if (Subtarget.hasBWI()) {
1630 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1631 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1635 // We want to custom lower some of our intrinsics.
1636 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1637 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1638 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
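// On 32-bit targets, intrinsics that produce an i64 result need custom
// handling during type legalization.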
1639 if (!Subtarget.is64Bit()) {
1640 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1641 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1644 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1645 // handle type legalization for these operations here.
1647 // FIXME: We really should do custom legalization for addition and
1648 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1649 // than generic legalization for 64-bit multiplication-with-overflow, though.
1650 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1651 if (VT == MVT::i64 && !Subtarget.is64Bit())
1652   continue;
1653 // Add/Sub/Mul with overflow operations are custom lowered.
1654 setOperationAction(ISD::SADDO, VT, Custom);
1655 setOperationAction(ISD::UADDO, VT, Custom);
1656 setOperationAction(ISD::SSUBO, VT, Custom);
1657 setOperationAction(ISD::USUBO, VT, Custom);
1658 setOperationAction(ISD::SMULO, VT, Custom);
1659 setOperationAction(ISD::UMULO, VT, Custom);
1661 // Support carry in as value rather than glue.
1662 setOperationAction(ISD::ADDCARRY, VT, Custom);
1663 setOperationAction(ISD::SUBCARRY, VT, Custom);
1664 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1667 if (!Subtarget.is64Bit()) {
1668 // These libcalls are not available in 32-bit.
1669 setLibcallName(RTLIB::SHL_I128, nullptr);
1670 setLibcallName(RTLIB::SRL_I128, nullptr);
1671 setLibcallName(RTLIB::SRA_I128, nullptr);
1672 setLibcallName(RTLIB::MUL_I128, nullptr);
1675 // Combine sin / cos into _sincos_stret if it is available.
1676 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1677 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1678 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1679 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1682 if (Subtarget.isTargetWin64()) {
1683 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1684 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1685 setOperationAction(ISD::SREM, MVT::i128, Custom);
1686 setOperationAction(ISD::UREM, MVT::i128, Custom);
1687 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1688 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1691 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1692 // is. We should promote the value to 64-bits to solve this.
1693 // This is what the CRT headers do - `fmodf` is an inline header
1694 // function casting to f64 and calling `fmod`.
1695 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1696 Subtarget.isTargetWindowsItanium()))
1697 for (ISD::NodeType Op :
1698 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1699 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1700 if (isOperationExpand(Op, MVT::f32))
1701 setOperationAction(Op, MVT::f32, Promote);
1703 // We have target-specific dag combine patterns for the following nodes:
1704 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1705 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1706 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1707 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1708 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1709 setTargetDAGCombine(ISD::BITCAST);
1710 setTargetDAGCombine(ISD::VSELECT);
1711 setTargetDAGCombine(ISD::SELECT);
1712 setTargetDAGCombine(ISD::SHL);
1713 setTargetDAGCombine(ISD::SRA);
1714 setTargetDAGCombine(ISD::SRL);
1715 setTargetDAGCombine(ISD::OR);
1716 setTargetDAGCombine(ISD::AND);
1717 setTargetDAGCombine(ISD::ADD);
1718 setTargetDAGCombine(ISD::FADD);
1719 setTargetDAGCombine(ISD::FSUB);
1720 setTargetDAGCombine(ISD::FNEG);
1721 setTargetDAGCombine(ISD::FMA);
1722 setTargetDAGCombine(ISD::FMINNUM);
1723 setTargetDAGCombine(ISD::FMAXNUM);
1724 setTargetDAGCombine(ISD::SUB);
1725 setTargetDAGCombine(ISD::LOAD);
1726 setTargetDAGCombine(ISD::MLOAD);
1727 setTargetDAGCombine(ISD::STORE);
1728 setTargetDAGCombine(ISD::MSTORE);
1729 setTargetDAGCombine(ISD::TRUNCATE);
1730 setTargetDAGCombine(ISD::ZERO_EXTEND);
1731 setTargetDAGCombine(ISD::ANY_EXTEND);
1732 setTargetDAGCombine(ISD::SIGN_EXTEND);
1733 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1734 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1735 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1736 setTargetDAGCombine(ISD::SINT_TO_FP);
1737 setTargetDAGCombine(ISD::UINT_TO_FP);
1738 setTargetDAGCombine(ISD::SETCC);
1739 setTargetDAGCombine(ISD::MUL);
1740 setTargetDAGCombine(ISD::XOR);
1741 setTargetDAGCombine(ISD::MSCATTER);
1742 setTargetDAGCombine(ISD::MGATHER);
1744 computeRegisterProperties(Subtarget.getRegisterInfo());
1746 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1747 MaxStoresPerMemsetOptSize = 8;
1748 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1749 MaxStoresPerMemcpyOptSize = 4;
1750 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1751 MaxStoresPerMemmoveOptSize = 4;
1753 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1754 // that needs to be benchmarked and balanced with the potential use of vector
1755 // load/store types (PR33329, PR33914).
1756 MaxLoadsPerMemcmp = 2;
1757 MaxLoadsPerMemcmpOptSize = 2;
1759 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1760 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1762 // An out-of-order CPU can speculatively execute past a predictable branch,
1763 // but a conditional move could be stalled by an expensive earlier operation.
1764 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1765 EnableExtLdPromotion = true;
1766 setPrefFunctionAlignment(4); // 2^4 bytes.
1768 verifyIntrinsicTables();
1771 // This has so far only been implemented for 64-bit MachO.
1772 bool X86TargetLowering::useLoadStackGuardNode() const {
1773 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1776 bool X86TargetLowering::useStackGuardXorFP() const {
1777 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1778 return Subtarget.getTargetTriple().isOSMSVCRT();
1781 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1782 const SDLoc &DL) const {
1783 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1784 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1785 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1786 return SDValue(Node, 0);
1789 TargetLoweringBase::LegalizeTypeAction
1790 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1791 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1792 return TypeSplitVector;
1794 if (ExperimentalVectorWideningLegalization &&
1795 VT.getVectorNumElements() != 1 &&
1796 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1797 return TypeWidenVector;
1799 return TargetLoweringBase::getPreferredVectorAction(VT);
1802 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1804 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1806 return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
1809 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1811 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1813 return TargetLowering::getNumRegistersForCallingConv(Context, VT);
1816 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1817 LLVMContext& Context,
1822 if (Subtarget.hasAVX512()) {
1823 const unsigned NumElts = VT.getVectorNumElements();
1825 // Figure out what this type will be legalized to.
1827 while (getTypeAction(Context, LegalVT) != TypeLegal)
1828 LegalVT = getTypeToTransformTo(Context, LegalVT);
1830 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1831 if (LegalVT.getSimpleVT().is512BitVector())
1832 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1834 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1835 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1836 // compare for vXi32/vXi64 for sure. If we have BWI we will also support
1837 // vXi16/vXi8.
1838 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1839 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1840 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1844 return VT.changeVectorElementTypeToInteger();
1847 /// Helper for getByValTypeAlignment to determine
1848 /// the desired ByVal argument alignment.
1849 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1852 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1853 if (VTy->getBitWidth() == 128)
1855 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1856 unsigned EltAlign = 0;
1857 getMaxByValAlign(ATy->getElementType(), EltAlign);
1858 if (EltAlign > MaxAlign)
1859 MaxAlign = EltAlign;
1860 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1861 for (auto *EltTy : STy->elements()) {
1862 unsigned EltAlign = 0;
1863 getMaxByValAlign(EltTy, EltAlign);
1864 if (EltAlign > MaxAlign)
1865 MaxAlign = EltAlign;
1872 /// Return the desired alignment for ByVal aggregate
1873 /// function arguments in the caller parameter area. For X86, aggregates
1874 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1875 /// are at 4-byte boundaries.
1876 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1877 const DataLayout &DL) const {
1878 if (Subtarget.is64Bit()) {
1879 // Max of 8 and alignment of type.
1880 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1887 if (Subtarget.hasSSE1())
1888 getMaxByValAlign(Ty, Align);
1892 /// Returns the target-specific optimal type for load
1893 /// and store operations as a result of memset, memcpy, and memmove
1894 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1895 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1896 /// against an alignment requirement,
1897 /// probably because the source does not need to be loaded. If 'IsMemset' is
1898 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1899 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1900 /// source is constant so it does not need to be loaded.
1901 /// It returns EVT::Other if the type should be determined using generic
1902 /// target-independent logic.
1904 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1905 unsigned DstAlign, unsigned SrcAlign,
1906 bool IsMemset, bool ZeroMemset,
1908 MachineFunction &MF) const {
1909 const Function &F = MF.getFunction();
1910 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1912 (!Subtarget.isUnalignedMem16Slow() ||
1913 ((DstAlign == 0 || DstAlign >= 16) &&
1914 (SrcAlign == 0 || SrcAlign >= 16)))) {
1915 // FIXME: Check if unaligned 32-byte accesses are slow.
1916 if (Size >= 32 && Subtarget.hasAVX()) {
1917 // Although this isn't a well-supported type for AVX1, we'll let
1918 // legalization and shuffle lowering produce the optimal codegen. If we
1919 // choose an optimal type with a vector element larger than a byte,
1920 // getMemsetStores() may create an intermediate splat (using an integer
1921 // multiply) before we splat as a vector.
1924 if (Subtarget.hasSSE2())
1926 // TODO: Can SSE1 handle a byte vector?
1927 if (Subtarget.hasSSE1())
1929 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1930 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1931 // Do not use f64 to lower memcpy if the source is a string constant. It's
1932 // better to use i32 to avoid the loads.
1933 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1934 // The gymnastics of splatting a byte value into an XMM register and then
1935 // only using 8-byte stores (because this is a CPU with slow unaligned
1936 // 16-byte accesses) makes that a loser.
1940 // This is a compromise. If we reach here, unaligned accesses may be slow on
1941 // this target. However, creating smaller, aligned accesses could be even
1942 // slower and would certainly be a lot more code.
1943 if (Subtarget.is64Bit() && Size >= 8)
1948 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
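// f32/f64 are only safe memop types when the corresponding scalar SSE support
// is available; going through the x87 stack can change a value's bit pattern
// (signaling NaNs get quieted, for example).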
1949 if (VT == MVT::f32)
1950   return X86ScalarSSEf32;
1951 else if (VT == MVT::f64)
1952 return X86ScalarSSEf64;
1957 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1962 switch (VT.getSizeInBits()) {
1964 // 8-byte and under are always assumed to be fast.
1968 *Fast = !Subtarget.isUnalignedMem16Slow();
1971 *Fast = !Subtarget.isUnalignedMem32Slow();
1973 // TODO: What about AVX-512 (512-bit) accesses?
1976 // Misaligned accesses of any size are always allowed.
1980 /// Return the entry encoding for a jump table in the
1981 /// current function. The returned value is a member of the
1982 /// MachineJumpTableInfo::JTEntryKind enum.
1983 unsigned X86TargetLowering::getJumpTableEncoding() const {
1984 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1985 // symbol.
1986 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1987 return MachineJumpTableInfo::EK_Custom32;
1989 // Otherwise, use the normal jump table encoding heuristics.
1990 return TargetLowering::getJumpTableEncoding();
1993 bool X86TargetLowering::useSoftFloat() const {
1994 return Subtarget.useSoftFloat();
1997 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1998 ArgListTy &Args) const {
2000 // Only relabel X86-32 for C / Stdcall CCs.
2001 if (Subtarget.is64Bit())
2003 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
2005 unsigned ParamRegs = 0;
2006 if (auto *M = MF->getFunction().getParent())
2007 ParamRegs = M->getNumberRegisterParameters();
2009 // Mark the first N integer arguments as being passed in registers (inreg).
2010 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
2011 Type *T = Args[Idx].Ty;
2012 if (T->isIntOrPtrTy())
2013 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
2014 unsigned numRegs = 1;
2015 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
2017 if (ParamRegs < numRegs)
2019 ParamRegs -= numRegs;
2020 Args[Idx].IsInReg = true;
2026 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2027 const MachineBasicBlock *MBB,
2028 unsigned uid,MCContext &Ctx) const{
2029 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2030 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
2031 // entries.
2032 return MCSymbolRefExpr::create(MBB->getSymbol(),
2033 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2036 /// Returns relocation base for the given PIC jumptable.
2037 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2038 SelectionDAG &DAG) const {
2039 if (!Subtarget.is64Bit())
2040 // This doesn't have SDLoc associated with it, but is not really the
2041 // same as a Register.
2042 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2043 getPointerTy(DAG.getDataLayout()));
2047 /// This returns the relocation base for the given PIC jumptable,
2048 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2049 const MCExpr *X86TargetLowering::
2050 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2051 MCContext &Ctx) const {
2052 // X86-64 uses RIP relative addressing based on the jump table label.
2053 if (Subtarget.isPICStyleRIPRel())
2054 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2056 // Otherwise, the reference is relative to the PIC base.
2057 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2060 std::pair<const TargetRegisterClass *, uint8_t>
2061 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2063 const TargetRegisterClass *RRC = nullptr;
2065 switch (VT.SimpleTy) {
2067 return TargetLowering::findRepresentativeClass(TRI, VT);
2068 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2069 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2072 RRC = &X86::VR64RegClass;
2074 case MVT::f32: case MVT::f64:
2075 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2076 case MVT::v4f32: case MVT::v2f64:
2077 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2078 case MVT::v8f32: case MVT::v4f64:
2079 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2080 case MVT::v16f32: case MVT::v8f64:
2081 RRC = &X86::VR128XRegClass;
2084 return std::make_pair(RRC, Cost);
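// Address space 256 selects %gs-relative and 257 selects %fs-relative
// addressing; the stack protector is addressed via %gs under the kernel code
// model and via %fs otherwise on 64-bit targets.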
2087 unsigned X86TargetLowering::getAddressSpace() const {
2088 if (Subtarget.is64Bit())
2089 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
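// Targets where the stack protector guard lives in a fixed TLS slot rather
// than in a global variable.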
2093 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2094 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2095 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2098 static Constant* SegmentOffset(IRBuilder<> &IRB,
2099 unsigned Offset, unsigned AddressSpace) {
2100 return ConstantExpr::getIntToPtr(
2101 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2102 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2105 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2106 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2107 // tcbhead_t; use it instead of the usual global variable (see
2108 // sysdeps/{i386,x86_64}/nptl/tls.h)
2109 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2110 if (Subtarget.isTargetFuchsia()) {
2111 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2112 return SegmentOffset(IRB, 0x10, getAddressSpace());
2114 // %fs:0x28, unless we're using a Kernel code model, in which case
2115 // it's %gs:0x28. gs:0x14 on i386.
2116 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2117 return SegmentOffset(IRB, Offset, getAddressSpace());
2121 return TargetLowering::getIRStackGuard(IRB);
2124 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2125 // The MSVC CRT provides functionality for stack protection.
2126 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2127 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2128 // MSVC CRT has a global variable holding security cookie.
2129 M.getOrInsertGlobal("__security_cookie",
2130 Type::getInt8PtrTy(M.getContext()));
2132 // MSVC CRT has a function to validate security cookie.
2133 auto *SecurityCheckCookie = cast<Function>(
2134 M.getOrInsertFunction("__security_check_cookie",
2135 Type::getVoidTy(M.getContext()),
2136 Type::getInt8PtrTy(M.getContext())));
2137 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2138 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2141 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2142 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2144 TargetLowering::insertSSPDeclarations(M);
2147 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2148 // MSVC CRT has a global variable holding security cookie.
2149 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2150 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2151 return M.getGlobalVariable("__security_cookie");
2153 return TargetLowering::getSDagStackGuard(M);
2156 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2157 // MSVC CRT has a function to validate security cookie.
2158 if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() ||
2159 Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) {
2160 return M.getFunction("__security_check_cookie");
2162 return TargetLowering::getSSPStackGuardCheck(M);
2165 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2166 if (Subtarget.getTargetTriple().isOSContiki())
2167 return getDefaultSafeStackPointerLocation(IRB, false);
2169 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2170 // definition of TLS_SLOT_SAFESTACK in
2171 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2172 if (Subtarget.isTargetAndroid()) {
2173 // %fs:0x48, unless we're using a Kernel code model, in which case it's
2174 // %gs:0x48; %gs:0x24 on i386.
2175 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2176 return SegmentOffset(IRB, Offset, getAddressSpace());
2179 // Fuchsia is similar.
2180 if (Subtarget.isTargetFuchsia()) {
2181 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2182 return SegmentOffset(IRB, 0x18, getAddressSpace());
2185 return TargetLowering::getSafeStackPointerLocation(IRB);
2188 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2189 unsigned DestAS) const {
2190 assert(SrcAS != DestAS && "Expected different address spaces!");
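// Address spaces below 256 are the ordinary flat address space; 256 and above
// select segment-relative addressing (%gs/%fs/%ss), so only flat-to-flat casts
// are no-ops.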
2192 return SrcAS < 256 && DestAS < 256;
2195 //===----------------------------------------------------------------------===//
2196 // Return Value Calling Convention Implementation
2197 //===----------------------------------------------------------------------===//
2199 #include "X86GenCallingConv.inc"
2201 bool X86TargetLowering::CanLowerReturn(
2202 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2203 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2204 SmallVector<CCValAssign, 16> RVLocs;
2205 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2206 return CCInfo.CheckReturn(Outs, RetCC_X86);
2209 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
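// R11 is caller-saved and is not used for argument passing by the common C
// calling conventions, so it is safe to use as a scratch register around calls.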
2210 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2214 /// Lowers mask values (v*i1) to the local register values.
2215 /// \returns DAG node after lowering to register type.
2216 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2217 const SDLoc &Dl, SelectionDAG &DAG) {
2218 EVT ValVT = ValArg.getValueType();
2220 if (ValVT == MVT::v1i1)
2221 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2222 DAG.getIntPtrConstant(0, Dl));
2224 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2225 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2226 // Two stage lowering might be required
2227 // bitcast: v8i1 -> i8 / v16i1 -> i16
2228 // anyextend: i8 -> i32 / i16 -> i32
2229 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2230 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2231 if (ValLoc == MVT::i32)
2232 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2236 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2237 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2238 // One stage lowering is required
2239 // bitcast: v32i1 -> i32 / v64i1 -> i64
2240 return DAG.getBitcast(ValLoc, ValArg);
2243 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2246 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2247 static void Passv64i1ArgInRegs(
2248 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2249 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2250 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2251 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2252 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2253 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2254 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2255 "The value should reside in two registers");
2257 // Before splitting the value we cast it to i64
2258 Arg = DAG.getBitcast(MVT::i64, Arg);
2260 // Splitting the value into two i32 types
2262 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2263 DAG.getConstant(0, Dl, MVT::i32));
2264 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2265 DAG.getConstant(1, Dl, MVT::i32));
2267 // Attach the two i32 values to the corresponding registers.
2268 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2269 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2273 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2275 const SmallVectorImpl<ISD::OutputArg> &Outs,
2276 const SmallVectorImpl<SDValue> &OutVals,
2277 const SDLoc &dl, SelectionDAG &DAG) const {
2278 MachineFunction &MF = DAG.getMachineFunction();
2279 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2281 // In some cases we need to disable registers from the default CSR list.
2282 // For example, when they are used for argument passing.
2283 bool ShouldDisableCalleeSavedRegister =
2284 CallConv == CallingConv::X86_RegCall ||
2285 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2287 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2288 report_fatal_error("X86 interrupts may not return any value");
2290 SmallVector<CCValAssign, 16> RVLocs;
2291 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2292 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2295 SmallVector<SDValue, 6> RetOps;
2296 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2297 // Operand #1 = Bytes To Pop
2298 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2301 // Copy the result values into the output registers.
2302 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2304 CCValAssign &VA = RVLocs[I];
2305 assert(VA.isRegLoc() && "Can only return in registers!");
2307 // Add the register to the CalleeSaveDisableRegs list.
2308 if (ShouldDisableCalleeSavedRegister)
2309 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2311 SDValue ValToCopy = OutVals[OutsIndex];
2312 EVT ValVT = ValToCopy.getValueType();
2314 // Promote values to the appropriate types.
2315 if (VA.getLocInfo() == CCValAssign::SExt)
2316 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2317 else if (VA.getLocInfo() == CCValAssign::ZExt)
2318 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2319 else if (VA.getLocInfo() == CCValAssign::AExt) {
2320 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2321 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2323 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2325 else if (VA.getLocInfo() == CCValAssign::BCvt)
2326 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2328 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2329 "Unexpected FP-extend for return value.");
2331 // If this is x86-64, and we disabled SSE, we can't return FP values,
2332 // or SSE or MMX vectors.
2333 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2334 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2335 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2336 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2337 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2338 } else if (ValVT == MVT::f64 &&
2339 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2340 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2341 // llvm-gcc has never done it right and no one has noticed, so this
2342 // should be OK for now.
2343 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2344 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2347 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2348 // the RET instruction and handled by the FP Stackifier.
2349 if (VA.getLocReg() == X86::FP0 ||
2350 VA.getLocReg() == X86::FP1) {
2351 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2352 // change the value to the FP stack register class.
2353 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2354 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2355 RetOps.push_back(ValToCopy);
2356 // Don't emit a copytoreg.
2360 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2361 // which is returned in RAX / RDX.
2362 if (Subtarget.is64Bit()) {
2363 if (ValVT == MVT::x86mmx) {
2364 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2365 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2366 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2368 // If we don't have SSE2 available, convert to v4f32 so the generated
2369 // register is legal.
2370 if (!Subtarget.hasSSE2())
2371 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2376 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2378 if (VA.needsCustom()) {
2379 assert(VA.getValVT() == MVT::v64i1 &&
2380 "Currently the only custom case is when we split v64i1 to 2 regs");
2382 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2385 assert(2 == RegsToPass.size() &&
2386 "Expecting two registers after Pass64BitArgInRegs");
2388 // Add the second register to the CalleeSaveDisableRegs list.
2389 if (ShouldDisableCalleeSavedRegister)
2390 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2392 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2395 // Add nodes to the DAG and add the values into the RetOps list
2396 for (auto &Reg : RegsToPass) {
2397 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2398 Flag = Chain.getValue(1);
2399 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2403 // The Swift calling convention does not require us to copy the sret argument
2404 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2406 // All x86 ABIs require that for returning structs by value we copy
2407 // the sret argument into %rax/%eax (depending on ABI) for the return.
2408 // We saved the argument into a virtual register in the entry block,
2409 // so now we copy the value out and into %rax/%eax.
2411 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2412 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2413 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2414 // either case FuncInfo->setSRetReturnReg() will have been called.
2415 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2416 // When we have both sret and another return value, we should use the
2417 // original Chain stored in RetOps[0], instead of the current Chain updated
2418 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2420 // For the case of sret and another return value, we have
2421 // Chain_0 at the function entry
2422 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2423 // If we use Chain_1 in getCopyFromReg, we will have
2424 // Val = getCopyFromReg(Chain_1)
2425 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2427 // getCopyToReg(Chain_0) will be glued together with
2428 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2429 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2430 // Data dependency from Unit B to Unit A due to usage of Val in
2431 // getCopyToReg(Chain_1, Val)
2432 // Chain dependency from Unit A to Unit B
2434 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2435 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2436 getPointerTy(MF.getDataLayout()));
2439 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2440 X86::RAX : X86::EAX;
2441 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2442 Flag = Chain.getValue(1);
2444 // RAX/EAX now acts like a return value.
2446 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2448 // Add the returned register to the CalleeSaveDisableRegs list.
2449 if (ShouldDisableCalleeSavedRegister)
2450 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2453 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2454 const MCPhysReg *I =
2455 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2458 if (X86::GR64RegClass.contains(*I))
2459 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2461 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2465 RetOps[0] = Chain; // Update chain.
2467 // Add the flag if we have it.
2469 RetOps.push_back(Flag);
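// Interrupt handlers must return with IRET rather than the normal RET.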
2471 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2472 if (CallConv == CallingConv::X86_INTR)
2473 opcode = X86ISD::IRET;
2474 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2477 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2478 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2481 SDValue TCChain = Chain;
2482 SDNode *Copy = *N->use_begin();
2483 if (Copy->getOpcode() == ISD::CopyToReg) {
2484 // If the copy has a glue operand, we conservatively assume it isn't safe to
2485 // perform a tail call.
2486 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2488 TCChain = Copy->getOperand(0);
2489 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2492 bool HasRet = false;
2493 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2495 if (UI->getOpcode() != X86ISD::RET_FLAG)
2497 // If we are returning more than one value, we can definitely
2498 // not make a tail call; see PR19530.
2499 if (UI->getNumOperands() > 4)
2501 if (UI->getNumOperands() == 4 &&
2502 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2514 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2515 ISD::NodeType ExtendKind) const {
2516 MVT ReturnMVT = MVT::i32;
2518 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2519 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2520 // The ABI does not require i1, i8 or i16 to be extended.
2522 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2523 // always extending i8/i16 return values, so keep doing that for now.
2525 ReturnMVT = MVT::i8;
2528 EVT MinVT = getRegisterType(Context, ReturnMVT);
2529 return VT.bitsLT(MinVT) ? MinVT : VT;
2532 /// Reads two 32 bit registers and creates a 64 bit mask value.
2533 /// \param VA The current 32 bit value that needs to be assigned.
2534 /// \param NextVA The next 32 bit value that needs to be assigned.
2535 /// \param Root The parent DAG node.
2536 /// \param [in,out] InFlag Represents an SDValue in the parent DAG node for
2537 ///                        glue purposes. If the DAG already uses a physical
2538 ///                        register instead of a virtual one, we should glue
2539 ///                        our new SDValue to the InFlag SDValue.
2540 /// \return a new 64 bit SDValue.
2541 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2542 SDValue &Root, SelectionDAG &DAG,
2543 const SDLoc &Dl, const X86Subtarget &Subtarget,
2544 SDValue *InFlag = nullptr) {
2545 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2546 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2547 assert(VA.getValVT() == MVT::v64i1 &&
2548 "Expecting first location of 64 bit width type");
2549 assert(NextVA.getValVT() == VA.getValVT() &&
2550 "The locations should have the same type");
2551 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2552 "The values should reside in two registers");
2556 SDValue ArgValueLo, ArgValueHi;
2558 MachineFunction &MF = DAG.getMachineFunction();
2559 const TargetRegisterClass *RC = &X86::GR32RegClass;
2561 // Read a 32 bit value from the registers.
2562 if (nullptr == InFlag) {
2563 // When no physical register is present,
2564 // create an intermediate virtual register.
2565 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2566 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2567 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2568 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2570 // When a physical register is available read the value from it and glue
2571 // the reads together.
2573 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2574 *InFlag = ArgValueLo.getValue(2);
2576 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2577 *InFlag = ArgValueHi.getValue(2);
2580 // Convert the i32 type into v32i1 type.
2581 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2583 // Convert the i32 type into v32i1 type.
2584 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2586 // Concatenate the two values together.
2587 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2590 /// The function will lower a register of various sizes (8/16/32/64 bit)
2591 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2592 /// \returns a DAG node containing the operand after lowering to mask type.
2593 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2594 const EVT &ValLoc, const SDLoc &Dl,
2595 SelectionDAG &DAG) {
2596 SDValue ValReturned = ValArg;
2598 if (ValVT == MVT::v1i1)
2599 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2601 if (ValVT == MVT::v64i1) {
2602 // On a 32 bit machine, this case is handled by getv64i1Argument.
2603 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2604 // On a 64 bit machine, there is no need to truncate the value; only bitcast it.
2607 switch (ValVT.getSimpleVT().SimpleTy) {
2618 llvm_unreachable("Expecting a vector of i1 types");
2621 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2623 return DAG.getBitcast(ValVT, ValReturned);
2626 /// Lower the result values of a call into the
2627 /// appropriate copies out of appropriate physical registers.
2629 SDValue X86TargetLowering::LowerCallResult(
2630 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2631 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2632 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2633 uint32_t *RegMask) const {
2635 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2636 // Assign locations to each value returned by this call.
2637 SmallVector<CCValAssign, 16> RVLocs;
2638 bool Is64Bit = Subtarget.is64Bit();
2639 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2641 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2643 // Copy all of the result registers out of their specified physreg.
2644 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2646 CCValAssign &VA = RVLocs[I];
2647 EVT CopyVT = VA.getLocVT();
2649 // In some calling conventions we need to remove the used registers
2650 // from the register mask.
2652 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2653 SubRegs.isValid(); ++SubRegs)
2654 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2657 // If this is x86-64, and we disabled SSE, we can't return FP values
2658 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2659 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2660 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2661 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2664 // If we prefer to use the value in xmm registers, copy it out as f80 and
2665 // use a truncate to move it from fp stack reg to xmm reg.
2666 bool RoundAfterCopy = false;
2667 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2668 isScalarFPTypeInSSEReg(VA.getValVT())) {
2669 if (!Subtarget.hasX87())
2670 report_fatal_error("X87 register return with X87 disabled");
2672 RoundAfterCopy = (CopyVT != VA.getLocVT());
2676 if (VA.needsCustom()) {
2677 assert(VA.getValVT() == MVT::v64i1 &&
2678 "Currently the only custom case is when we split v64i1 to 2 regs");
2680 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2682 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2684 Val = Chain.getValue(0);
2685 InFlag = Chain.getValue(2);
2689 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2690 // This truncation won't change the value.
2691 DAG.getIntPtrConstant(1, dl));
2693 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2694 if (VA.getValVT().isVector() &&
2695 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2696 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2697 // The mask type (v*i1) was promoted into an i64/i32/i16/i8 register; lower it back to a mask.
2698 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2700 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2703 InVals.push_back(Val);
2709 //===----------------------------------------------------------------------===//
2710 // C & StdCall & Fast Calling Convention implementation
2711 //===----------------------------------------------------------------------===//
2712 // The StdCall calling convention is standard for many Windows API routines.
2713 // It differs from the C calling convention only slightly: the callee cleans up
2714 // the stack instead of the caller, and symbols are decorated in a particular
2715 // way. It doesn't support any vector arguments.
2716 // For info on the fast calling convention, see the Fast Calling Convention
2717 // (tail call) implementation in LowerX86_32FastCCCallTo.
2719 /// CallIsStructReturn - Determines whether a call uses struct return
2720 /// semantics.
2721 enum StructReturnType {
2726 static StructReturnType
2727 callIsStructReturn(ArrayRef<ISD::OutputArg> Outs, bool IsMCU) {
2729 return NotStructReturn;
2731 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2732 if (!Flags.isSRet())
2733 return NotStructReturn;
2734 if (Flags.isInReg() || IsMCU)
2735 return RegStructReturn;
2736 return StackStructReturn;
2739 /// Determines whether a function uses struct return semantics.
2740 static StructReturnType
2741 argsAreStructReturn(ArrayRef<ISD::InputArg> Ins, bool IsMCU) {
2743 return NotStructReturn;
2745 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2746 if (!Flags.isSRet())
2747 return NotStructReturn;
2748 if (Flags.isInReg() || IsMCU)
2749 return RegStructReturn;
2750 return StackStructReturn;
2753 /// Make a copy of an aggregate at address specified by "Src" to address
2754 /// "Dst" with size and alignment information specified by the specific
2755 /// parameter attribute. The copy will be passed as a byval function parameter.
2756 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2757 SDValue Chain, ISD::ArgFlagsTy Flags,
2758 SelectionDAG &DAG, const SDLoc &dl) {
2759 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2761 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2762 /*isVolatile*/false, /*AlwaysInline=*/true,
2763 /*isTailCall*/false,
2764 MachinePointerInfo(), MachinePointerInfo());
2767 /// Return true if the calling convention is one that we can guarantee TCO for.
2768 static bool canGuaranteeTCO(CallingConv::ID CC) {
2769 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2770 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2771 CC == CallingConv::HHVM);
2774 /// Return true if we might ever do TCO for calls with this calling convention.
2775 static bool mayTailCallThisCC(CallingConv::ID CC) {
2777 // C calling conventions:
2778 case CallingConv::C:
2779 case CallingConv::Win64:
2780 case CallingConv::X86_64_SysV:
2781 // Callee pop conventions:
2782 case CallingConv::X86_ThisCall:
2783 case CallingConv::X86_StdCall:
2784 case CallingConv::X86_VectorCall:
2785 case CallingConv::X86_FastCall:
2788 return canGuaranteeTCO(CC);
2792 /// Return true if the function is being made into a tailcall target by
2793 /// changing its ABI.
2794 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2795 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2798 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2800 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2801 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2804 ImmutableCallSite CS(CI);
2805 CallingConv::ID CalleeCC = CS.getCallingConv();
2806 if (!mayTailCallThisCC(CalleeCC))
2813 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2814 const SmallVectorImpl<ISD::InputArg> &Ins,
2815 const SDLoc &dl, SelectionDAG &DAG,
2816 const CCValAssign &VA,
2817 MachineFrameInfo &MFI, unsigned i) const {
2818 // Create the nodes corresponding to a load from this parameter slot.
2819 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2820 bool AlwaysUseMutable = shouldGuaranteeTCO(
2821 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2822 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2824 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2826 // If the value is passed by pointer, the address is passed instead of the value
2827 // itself. No need to extend if the mask value and location share the same width.
2829 bool ExtendedInMem =
2830 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2831 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2833 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2834 ValVT = VA.getLocVT();
2836 ValVT = VA.getValVT();
2838 // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2839 // normally taken by the return address.
2841 if (CallConv == CallingConv::X86_INTR) {
2842 // X86 interrupts may take one or two arguments.
2843 // On the stack there will be no return address as in a regular call.
2844 // The offset of the last argument needs to be set to -4/-8 bytes,
2845 // while the offset of the first of two arguments should be set to 0 bytes.
2846 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
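// For example, with two arguments on a 64-bit target this yields base offsets
// of 0 for the first argument and -8 for the second; a lone argument gets -8
// (-4 on 32-bit).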
2847 if (Subtarget.is64Bit() && Ins.size() == 2) {
2848 // The stack pointer needs to be realigned for 64 bit handlers with error
2849 // code, so the argument offset changes by 8 bytes.
2854 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2855 // changed with more analysis.
2856 // In the case of tail call optimization, mark all arguments mutable, since
2857 // they could be overwritten when the arguments of a tail call are lowered.
2858 if (Flags.isByVal()) {
2859 unsigned Bytes = Flags.getByValSize();
2860 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2862 // FIXME: For now, all byval parameter objects are marked as aliasing. This
2863 // can be improved with deeper analysis.
2864 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable,
2865 /*isAliased=*/true);
2866 // Adjust SP offset of interrupt parameter.
2867 if (CallConv == CallingConv::X86_INTR) {
2868 MFI.setObjectOffset(FI, Offset);
2870 return DAG.getFrameIndex(FI, PtrVT);
2873 // This is an argument in memory. We might be able to perform copy elision.
2874 if (Flags.isCopyElisionCandidate()) {
2875 EVT ArgVT = Ins[i].ArgVT;
2877 if (Ins[i].PartOffset == 0) {
2878 // If this is a one-part value or the first part of a multi-part value,
2879 // create a stack object for the entire argument value type and return a
2880 // load from our portion of it. This assumes that if the first part of an
2881 // argument is in memory, the rest will also be in memory.
2882 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2883 /*Immutable=*/false);
2884 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2886 ValVT, dl, Chain, PartAddr,
2887 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2889 // This is not the first piece of an argument in memory. See if there is
2890 // already a fixed stack object including this offset. If so, assume it
2891 // was created by the PartOffset == 0 branch above and create a load from
2892 // the appropriate offset into it.
2893 int64_t PartBegin = VA.getLocMemOffset();
2894 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2895 int FI = MFI.getObjectIndexBegin();
2896 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2897 int64_t ObjBegin = MFI.getObjectOffset(FI);
2898 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2899 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2902 if (MFI.isFixedObjectIndex(FI)) {
2904 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2905 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2907 ValVT, dl, Chain, Addr,
2908 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2909 Ins[i].PartOffset));
2914 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2915 VA.getLocMemOffset(), isImmutable);
2917 // Set SExt or ZExt flag.
2918 if (VA.getLocInfo() == CCValAssign::ZExt) {
2919 MFI.setObjectZExt(FI, true);
2920 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2921 MFI.setObjectSExt(FI, true);
2924 // Adjust SP offset of interrupt parameter.
2925 if (CallConv == CallingConv::X86_INTR) {
2926 MFI.setObjectOffset(FI, Offset);
2929 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2930 SDValue Val = DAG.getLoad(
2931 ValVT, dl, Chain, FIN,
2932 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
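// If the value was extended in memory, recover the original type: either wrap
// the loaded scalar back into a mask vector or truncate it to the value type.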
2933 return ExtendedInMem
2934 ? (VA.getValVT().isVector()
2935 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2936 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2940 // FIXME: Get this from tablegen.
2941 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2942 const X86Subtarget &Subtarget) {
2943 assert(Subtarget.is64Bit());
2945 if (Subtarget.isCallingConvWin64(CallConv)) {
2946 static const MCPhysReg GPR64ArgRegsWin64[] = {
2947 X86::RCX, X86::RDX, X86::R8, X86::R9
2949 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2952 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2953 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2955 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2958 // FIXME: Get this from tablegen.
2959 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2960 CallingConv::ID CallConv,
2961 const X86Subtarget &Subtarget) {
2962 assert(Subtarget.is64Bit());
2963 if (Subtarget.isCallingConvWin64(CallConv)) {
2964 // The XMM registers which might contain var arg parameters are shadowed
2965 // in their paired GPR. So we only need to save the GPR to their home
2967 // TODO: __vectorcall will change this.
2971 const Function &F = MF.getFunction();
2972 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2973 bool isSoftFloat = Subtarget.useSoftFloat();
2974 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2975 "SSE register cannot be used when SSE is disabled!");
2976 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2977 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2981 static const MCPhysReg XMMArgRegs64Bit[] = {
2982 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2983 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2985 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2989 static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
2990 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2991 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2992 return A.getValNo() < B.getValNo();
2997 SDValue X86TargetLowering::LowerFormalArguments(
2998 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2999 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
3000 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3001 MachineFunction &MF = DAG.getMachineFunction();
3002 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
3003 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3005 const Function &F = MF.getFunction();
3006 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
3007 F.getName() == "main")
3008 FuncInfo->setForceFramePointer(true);
3010 MachineFrameInfo &MFI = MF.getFrameInfo();
3011 bool Is64Bit = Subtarget.is64Bit();
3012 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3015 !(isVarArg && canGuaranteeTCO(CallConv)) &&
3016 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
3018 if (CallConv == CallingConv::X86_INTR) {
3019 bool isLegal = Ins.size() == 1 ||
3020 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
3021 (!Is64Bit && Ins[1].VT == MVT::i32)));
3023 report_fatal_error("X86 interrupts may take one or two arguments");
3026 // Assign locations to all of the incoming arguments.
3027 SmallVector<CCValAssign, 16> ArgLocs;
3028 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3030 // Allocate shadow area for Win64.
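// (The Win64 ABI requires the caller to reserve 32 bytes of "home" space for
// the first four parameters, even when they are passed in registers.)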
3032 CCInfo.AllocateStack(32, 8);
3034 CCInfo.AnalyzeArguments(Ins, CC_X86);
3036 // In vectorcall calling convention a second pass is required for the HVA
3038 if (CallingConv::X86_VectorCall == CallConv) {
3039 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3042 // The next loop assumes that the locations are in the same order as the Ins declaration.
3044 assert(isSortedByValueNo(ArgLocs) &&
3045 "Argument Location list must be sorted before lowering");
3048 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3050 assert(InsIndex < Ins.size() && "Invalid Ins index");
3051 CCValAssign &VA = ArgLocs[I];
3053 if (VA.isRegLoc()) {
3054 EVT RegVT = VA.getLocVT();
3055 if (VA.needsCustom()) {
3057 VA.getValVT() == MVT::v64i1 &&
3058 "Currently the only custom case is when we split v64i1 to 2 regs");
3060 // In the regcall calling convention, v64i1 values that are
3061 // compiled for a 32-bit arch are split up into two registers.
3063 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3065 const TargetRegisterClass *RC;
3066 if (RegVT == MVT::i8)
3067 RC = &X86::GR8RegClass;
3068 else if (RegVT == MVT::i16)
3069 RC = &X86::GR16RegClass;
3070 else if (RegVT == MVT::i32)
3071 RC = &X86::GR32RegClass;
3072 else if (Is64Bit && RegVT == MVT::i64)
3073 RC = &X86::GR64RegClass;
3074 else if (RegVT == MVT::f32)
3075 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3076 else if (RegVT == MVT::f64)
3077 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3078 else if (RegVT == MVT::f80)
3079 RC = &X86::RFP80RegClass;
3080 else if (RegVT == MVT::f128)
3081 RC = &X86::FR128RegClass;
3082 else if (RegVT.is512BitVector())
3083 RC = &X86::VR512RegClass;
3084 else if (RegVT.is256BitVector())
3085 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3086 else if (RegVT.is128BitVector())
3087 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3088 else if (RegVT == MVT::x86mmx)
3089 RC = &X86::VR64RegClass;
3090 else if (RegVT == MVT::v1i1)
3091 RC = &X86::VK1RegClass;
3092 else if (RegVT == MVT::v8i1)
3093 RC = &X86::VK8RegClass;
3094 else if (RegVT == MVT::v16i1)
3095 RC = &X86::VK16RegClass;
3096 else if (RegVT == MVT::v32i1)
3097 RC = &X86::VK32RegClass;
3098 else if (RegVT == MVT::v64i1)
3099 RC = &X86::VK64RegClass;
3101 llvm_unreachable("Unknown argument type!");
3103 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3104 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3107 // If this is an 8- or 16-bit value, it is really passed promoted to 32
3108 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
3110 if (VA.getLocInfo() == CCValAssign::SExt)
3111 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3112 DAG.getValueType(VA.getValVT()));
3113 else if (VA.getLocInfo() == CCValAssign::ZExt)
3114 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3115 DAG.getValueType(VA.getValVT()));
3116 else if (VA.getLocInfo() == CCValAssign::BCvt)
3117 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3119 if (VA.isExtInLoc()) {
3120 // Handle MMX values passed in XMM regs.
3121 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3122 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3123 else if (VA.getValVT().isVector() &&
3124 VA.getValVT().getScalarType() == MVT::i1 &&
3125 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3126 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3127 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3128 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3130 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3133 assert(VA.isMemLoc());
3135 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3138 // If the value is passed via a pointer, do a load.
3139 if (VA.getLocInfo() == CCValAssign::Indirect)
3141 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3143 InVals.push_back(ArgValue);
3146 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3147 // Swift calling convention does not require we copy the sret argument
3148 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3149 if (CallConv == CallingConv::Swift)
3152 // All x86 ABIs require that for returning structs by value we copy the
3153 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3154 // the argument into a virtual register so that we can access it from the return points.
3156 if (Ins[I].Flags.isSRet()) {
3157 unsigned Reg = FuncInfo->getSRetReturnReg();
3159 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3160 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3161 FuncInfo->setSRetReturnReg(Reg);
3163 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3164 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3169 unsigned StackSize = CCInfo.getNextStackOffset();
3170 // Align stack specially for tail calls.
3171 if (shouldGuaranteeTCO(CallConv,
3172 MF.getTarget().Options.GuaranteedTailCallOpt))
3173 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3175 // If the function takes a variable number of arguments, make a frame index for
3176 // the start of the first vararg value... for expansion of llvm.va_start. We
3177 // can skip this if there are no va_start calls.
3178 if (MFI.hasVAStart() &&
3179 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3180 CallConv != CallingConv::X86_ThisCall))) {
3181 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3184 // Figure out if XMM registers are in use.
3185 assert(!(Subtarget.useSoftFloat() &&
3186 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3187 "SSE register cannot be used when SSE is disabled!");
3189 // 64-bit calling conventions support varargs and register parameters, so we
3190 // have to do extra work to spill them in the prologue.
3191 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3192 // Find the first unallocated argument registers.
3193 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3194 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3195 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3196 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3197 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3198 "SSE register cannot be used when SSE is disabled!");
3200 // Gather all the live in physical registers.
3201 SmallVector<SDValue, 6> LiveGPRs;
3202 SmallVector<SDValue, 8> LiveXMMRegs;
3204 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3205 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3207 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3209 if (!ArgXMMs.empty()) {
3210 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3211 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3212 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3213 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3214 LiveXMMRegs.push_back(
3215 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3220 // Get to the caller-allocated home save location. Add 8 to account
3221 // for the return address.
3222 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3223 FuncInfo->setRegSaveFrameIndex(
3224 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3225 // Fixup to set vararg frame on shadow area (4 x i64).
3227 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3229 // For X86-64, if there are vararg parameters that are passed via
3230 // registers, then we must store them to their spots on the stack so
3231 // they may be loaded by dereferencing the result of va_next.
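// The register save area holds the GPR home slots (8 bytes each) followed by
// 16-byte XMM slots; these offsets record where the first unallocated GPR and
// XMM register will be spilled within it.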
3232 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3233 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3234 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3235 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3238 // Store the integer parameter registers.
3239 SmallVector<SDValue, 8> MemOps;
3240 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3241 getPointerTy(DAG.getDataLayout()));
3242 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3243 for (SDValue Val : LiveGPRs) {
3244 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3245 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3247 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3248 MachinePointerInfo::getFixedStack(
3249 DAG.getMachineFunction(),
3250 FuncInfo->getRegSaveFrameIndex(), Offset));
3251 MemOps.push_back(Store);
3255 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3256 // Now store the XMM (fp + vector) parameter registers.
3257 SmallVector<SDValue, 12> SaveXMMOps;
3258 SaveXMMOps.push_back(Chain);
3259 SaveXMMOps.push_back(ALVal);
3260 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3261 FuncInfo->getRegSaveFrameIndex(), dl));
3262 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3263 FuncInfo->getVarArgsFPOffset(), dl));
3264 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3266 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3267 MVT::Other, SaveXMMOps));
3270 if (!MemOps.empty())
3271 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3274 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3275 // Find the largest legal vector type.
3276 MVT VecVT = MVT::Other;
3277 // FIXME: Only some x86_32 calling conventions support AVX512.
3278 if (Subtarget.hasAVX512() &&
3279 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3280 CallConv == CallingConv::Intel_OCL_BI)))
3281 VecVT = MVT::v16f32;
3282 else if (Subtarget.hasAVX())
3284 else if (Subtarget.hasSSE2())
3287 // We forward some GPRs and some vector types.
3288 SmallVector<MVT, 2> RegParmTypes;
3289 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3290 RegParmTypes.push_back(IntVT);
3291 if (VecVT != MVT::Other)
3292 RegParmTypes.push_back(VecVT);
3294 // Compute the set of forwarded registers. The rest are scratch.
3295 SmallVectorImpl<ForwardedRegister> &Forwards =
3296 FuncInfo->getForwardedMustTailRegParms();
3297 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3299 // Conservatively forward AL on x86_64, since it might be used for varargs.
3300 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3301 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3302 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3305 // Copy all forwards from physical to virtual registers.
3306 for (ForwardedRegister &F : Forwards) {
3307 // FIXME: Can we use a less constrained schedule?
3308 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3309 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3310 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3314 // Some CCs need callee pop.
3315 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3316 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3317 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3318 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3319 // X86 interrupts must pop the error code (and the alignment padding) if present.
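// On 64-bit this is the 8-byte error code plus 8 bytes of realignment padding;
// on 32-bit it is just the 4-byte error code.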
3321 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3323 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3324 // If this is an sret function, the return should pop the hidden pointer.
3325 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3326 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3327 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3328 FuncInfo->setBytesToPopOnReturn(4);
3332 // RegSaveFrameIndex is X86-64 only.
3333 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3334 if (CallConv == CallingConv::X86_FastCall ||
3335 CallConv == CallingConv::X86_ThisCall)
3336 // fastcall and thiscall functions can't have varargs.
3337 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3340 FuncInfo->setArgumentStackSize(StackSize);
3342 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3343 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3344 if (Personality == EHPersonality::CoreCLR) {
3346 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3347 // that we'd prefer this slot be allocated towards the bottom of the frame
3348 // (i.e. near the stack pointer after allocating the frame). Every
3349 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3350 // offset from the bottom of this and each funclet's frame must be the
3351 // same, so the size of funclets' (mostly empty) frames is dictated by
3352 // how far this slot is from the bottom (since they allocate just enough
3353 // space to accommodate holding this slot at the correct offset).
3354 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3355 EHInfo->PSPSymFrameIdx = PSPSymFI;
3359 if (CallConv == CallingConv::X86_RegCall ||
3360 F.hasFnAttribute("no_caller_saved_registers")) {
3361 MachineRegisterInfo &MRI = MF.getRegInfo();
3362 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3363 MRI.disableCalleeSavedRegister(Pair.first);
3369 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3370 SDValue Arg, const SDLoc &dl,
3372 const CCValAssign &VA,
3373 ISD::ArgFlagsTy Flags) const {
3374 unsigned LocMemOffset = VA.getLocMemOffset();
3375 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3376 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3378 if (Flags.isByVal())
3379 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3381 return DAG.getStore(
3382 Chain, dl, Arg, PtrOff,
3383 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3386 /// Emit a load of return address if tail call
3387 /// optimization is performed and it is required.
3388 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3389 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3390 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3391 // Adjust the Return address stack slot.
3392 EVT VT = getPointerTy(DAG.getDataLayout());
3393 OutRetAddr = getReturnAddressFrameIndex(DAG);
3395 // Load the "old" Return address.
3396 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3397 return SDValue(OutRetAddr.getNode(), 1);
3400 /// Emit a store of the return address if tail call
3401 /// optimization is performed and it is required (FPDiff!=0).
3402 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3403 SDValue Chain, SDValue RetAddrFrIdx,
3404 EVT PtrVT, unsigned SlotSize,
3405 int FPDiff, const SDLoc &dl) {
3406 // Store the return address to the appropriate stack slot.
3407 if (!FPDiff) return Chain;
3408 // Calculate the new stack slot for the return address.
3409 int NewReturnAddrFI =
3410 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3412 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3413 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3414 MachinePointerInfo::getFixedStack(
3415 DAG.getMachineFunction(), NewReturnAddrFI));
3419 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3420 /// operation of the specified width.
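/// For example, for a 4-element type the mask is <4, 1, 2, 3>: element 0 is
/// taken from V2 and the remaining elements are taken from V1.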
3421 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3423 unsigned NumElems = VT.getVectorNumElements();
3424 SmallVector<int, 8> Mask;
3425 Mask.push_back(NumElems);
3426 for (unsigned i = 1; i != NumElems; ++i)
3428 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3432 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3433 SmallVectorImpl<SDValue> &InVals) const {
3434 SelectionDAG &DAG = CLI.DAG;
3436 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3437 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3438 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3439 SDValue Chain = CLI.Chain;
3440 SDValue Callee = CLI.Callee;
3441 CallingConv::ID CallConv = CLI.CallConv;
3442 bool &isTailCall = CLI.IsTailCall;
3443 bool isVarArg = CLI.IsVarArg;
3445 MachineFunction &MF = DAG.getMachineFunction();
3446 bool Is64Bit = Subtarget.is64Bit();
3447 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3448 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3449 bool IsSibcall = false;
3450 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3451 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3452 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3453 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3454 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3455 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3456 const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
3458 (CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
3459 const Module *M = MF.getMMI().getModule();
3460 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
3462 if (CallConv == CallingConv::X86_INTR)
3463 report_fatal_error("X86 interrupts may not be called directly");
3465 if (Attr.getValueAsString() == "true")
3468 if (Subtarget.isPICStyleGOT() &&
3469 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3470 // If we are using a GOT, disable tail calls to external symbols with
3471 // default visibility. Tail calling such a symbol requires using a GOT
3472 // relocation, which forces early binding of the symbol. This breaks code
3473 // that requires lazy function symbol resolution. Using musttail or
3474 // GuaranteedTailCallOpt will override this.
3475 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3476 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3477 G->getGlobal()->hasDefaultVisibility()))
3481 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3483 // Force this to be a tail call. The verifier rules are enough to ensure
3484 // that we can lower this successfully without moving the return address around.
3487 } else if (isTailCall) {
3488 // Check if it's really possible to do a tail call.
3489 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3490 isVarArg, SR != NotStructReturn,
3491 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3492 Outs, OutVals, Ins, DAG);
3494 // Sibcalls are automatically detected tailcalls which do not require
3496 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3503 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3504 "Var args not supported with calling convention fastcc, ghc or hipe");
3506 // Analyze operands of the call, assigning locations to each operand.
3507 SmallVector<CCValAssign, 16> ArgLocs;
3508 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3510 // Allocate shadow area for Win64.
3512 CCInfo.AllocateStack(32, 8);
3514 CCInfo.AnalyzeArguments(Outs, CC_X86);
3516 // In vectorcall calling convention a second pass is required for the HVA
3518 if (CallingConv::X86_VectorCall == CallConv) {
3519 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3522 // Get a count of how many bytes are to be pushed on the stack.
3523 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3525 // This is a sibcall. The memory operands are already available in the
3526 // caller's incoming argument area (which belongs to its own caller's frame).
3528 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3529 canGuaranteeTCO(CallConv))
3530 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3533 if (isTailCall && !IsSibcall && !IsMustTail) {
3534 // Lower arguments at fp - stackoffset + fpdiff.
3535 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3537 FPDiff = NumBytesCallerPushed - NumBytes;
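// A negative FPDiff means this call needs more argument stack than the
// caller's incoming area provides, so the return address slot has to be
// moved down to make room.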
3539 // Record the delta by which the return-address stack slot moves,
3540 // but only if it is larger (more negative) than any previously recorded delta.
3541 if (FPDiff < X86Info->getTCReturnAddrDelta())
3542 X86Info->setTCReturnAddrDelta(FPDiff);
3545 unsigned NumBytesToPush = NumBytes;
3546 unsigned NumBytesToPop = NumBytes;
3548 // If we have an inalloca argument, all stack space has already been allocated
3549 // for us and is right at the top of the stack. We don't support multiple
3550 // arguments passed in memory when using inalloca.
3551 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3553 if (!ArgLocs.back().isMemLoc())
3554 report_fatal_error("cannot use inalloca attribute on a register "
3556 if (ArgLocs.back().getLocMemOffset() != 0)
3557 report_fatal_error("any parameter with the inalloca attribute must be "
3558 "the only memory argument");
3562 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3563 NumBytes - NumBytesToPush, dl);
3565 SDValue RetAddrFrIdx;
3566 // Load return address for tail calls.
3567 if (isTailCall && FPDiff)
3568 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3569 Is64Bit, FPDiff, dl);
3571 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3572 SmallVector<SDValue, 8> MemOpChains;
3575 // The next loop assumes that the locations are in the same order as the Outs declaration.
3577 assert(isSortedByValueNo(ArgLocs) &&
3578 "Argument Location list must be sorted before lowering");
3580 // Walk the register/memloc assignments, inserting copies/loads. In the case
3581 // of tail call optimization, arguments are handled later.
3582 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3583 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3585 assert(OutIndex < Outs.size() && "Invalid Out index");
3586 // Skip inalloca arguments, they have already been written.
3587 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3588 if (Flags.isInAlloca())
3591 CCValAssign &VA = ArgLocs[I];
3592 EVT RegVT = VA.getLocVT();
3593 SDValue Arg = OutVals[OutIndex];
3594 bool isByVal = Flags.isByVal();
3596 // Promote the value if needed.
3597 switch (VA.getLocInfo()) {
3598 default: llvm_unreachable("Unknown loc info!");
3599 case CCValAssign::Full: break;
3600 case CCValAssign::SExt:
3601 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3603 case CCValAssign::ZExt:
3604 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3606 case CCValAssign::AExt:
3607 if (Arg.getValueType().isVector() &&
3608 Arg.getValueType().getVectorElementType() == MVT::i1)
3609 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3610 else if (RegVT.is128BitVector()) {
3611 // Special case: passing MMX values in XMM registers.
3612 Arg = DAG.getBitcast(MVT::i64, Arg);
3613 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3614 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3616 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3618 case CCValAssign::BCvt:
3619 Arg = DAG.getBitcast(RegVT, Arg);
3621 case CCValAssign::Indirect: {
3622 // Store the argument.
3623 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3624 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3625 Chain = DAG.getStore(
3626 Chain, dl, Arg, SpillSlot,
3627 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3633 if (VA.needsCustom()) {
3634 assert(VA.getValVT() == MVT::v64i1 &&
3635 "Currently the only custom case is when we split v64i1 to 2 regs");
3636 // Split v64i1 value into two registers
3637 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3639 } else if (VA.isRegLoc()) {
3640 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3641 if (isVarArg && IsWin64) {
3642 // The Win64 ABI requires an argument XMM reg to be copied to the corresponding
3643 // shadow GPR if the callee is a varargs function.
3644 unsigned ShadowReg = 0;
3645 switch (VA.getLocReg()) {
3646 case X86::XMM0: ShadowReg = X86::RCX; break;
3647 case X86::XMM1: ShadowReg = X86::RDX; break;
3648 case X86::XMM2: ShadowReg = X86::R8; break;
3649 case X86::XMM3: ShadowReg = X86::R9; break;
3652 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3654 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3655 assert(VA.isMemLoc());
3656 if (!StackPtr.getNode())
3657 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3658 getPointerTy(DAG.getDataLayout()));
3659 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3660 dl, DAG, VA, Flags));
3664 if (!MemOpChains.empty())
3665 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3667 if (Subtarget.isPICStyleGOT()) {
3668 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3671 RegsToPass.push_back(std::make_pair(
3672 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3673 getPointerTy(DAG.getDataLayout()))));
3675 // If we are tail calling and generating PIC/GOT style code load the
3676 // address of the callee into ECX. The value in ecx is used as target of
3677 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3678 // for tail calls on PIC/GOT architectures. Normally we would just put the
3679 // address of GOT into ebx and then call target@PLT. But for tail calls
3680 // ebx would be restored (since ebx is callee saved) before jumping to the
3683 // Note: The actual moving to ECX is done further down.
3684 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3685 if (G && !G->getGlobal()->hasLocalLinkage() &&
3686 G->getGlobal()->hasDefaultVisibility())
3687 Callee = LowerGlobalAddress(Callee, DAG);
3688 else if (isa<ExternalSymbolSDNode>(Callee))
3689 Callee = LowerExternalSymbol(Callee, DAG);
3693 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3694 // From AMD64 ABI document:
3695 // For calls that may call functions that use varargs or stdargs
3696 // (prototype-less calls or calls to functions containing ellipsis (...) in
3697 // the declaration) %al is used as a hidden argument to specify the number
3698 // of SSE registers used. The contents of %al do not need to match exactly
3699 // the number of registers, but must be an upper bound on the number of SSE
3700 // registers used and is in the range 0 - 8 inclusive.
3702 // Count the number of XMM registers allocated.
3703 static const MCPhysReg XMMArgRegs[] = {
3704 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3705 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3707 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3708 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3709 && "SSE registers cannot be used when SSE is disabled");
3711 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3712 DAG.getConstant(NumXMMRegs, dl,
3716 if (isVarArg && IsMustTail) {
3717 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3718 for (const auto &F : Forwards) {
3719 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3720 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3724 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3725 // don't need this because the eligibility check rejects calls that require
3726 // shuffling arguments passed in memory.
3727 if (!IsSibcall && isTailCall) {
3728 // Force all the incoming stack arguments to be loaded from the stack
3729 // before any new outgoing arguments are stored to the stack, because the
3730 // outgoing stack slots may alias the incoming argument stack slots, and
3731 // the alias isn't otherwise explicit. This is slightly more conservative
3732 // than necessary, because it means that each store effectively depends
3733 // on every argument instead of just those arguments it would clobber.
3734 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3736 SmallVector<SDValue, 8> MemOpChains2;
3739 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3741 CCValAssign &VA = ArgLocs[I];
3743 if (VA.isRegLoc()) {
3744 if (VA.needsCustom()) {
3745 assert((CallConv == CallingConv::X86_RegCall) &&
3746 "Expecting custom case only in regcall calling convention");
3747 // This means that we are in a special case where one argument was
3748 // passed through two register locations - skip the next location.
3755 assert(VA.isMemLoc());
3756 SDValue Arg = OutVals[OutsIndex];
3757 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3758 // Skip inalloca arguments. They don't require any work.
3759 if (Flags.isInAlloca())
3761 // Create frame index.
3762 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3763 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3764 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3765 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3767 if (Flags.isByVal()) {
3768 // Copy relative to framepointer.
3769 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3770 if (!StackPtr.getNode())
3771 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3772 getPointerTy(DAG.getDataLayout()));
3773 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3776 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3780 // Store relative to framepointer.
3781 MemOpChains2.push_back(DAG.getStore(
3782 ArgChain, dl, Arg, FIN,
3783 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3787 if (!MemOpChains2.empty())
3788 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3790 // Store the return address to the appropriate stack slot.
3791 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3792 getPointerTy(DAG.getDataLayout()),
3793 RegInfo->getSlotSize(), FPDiff, dl);
3796 // Build a sequence of copy-to-reg nodes chained together with token chain
3797 // and flag operands which copy the outgoing args into registers.
3799 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3800 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3801 RegsToPass[i].second, InFlag);
3802 InFlag = Chain.getValue(1);
3805 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3806 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3807 // In the 64-bit large code model, we have to make all calls
3808 // through a register, since the call instruction's 32-bit
3809 // pc-relative offset may not be large enough to hold the whole address.
3811 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3812 // If the callee is a GlobalAddress node (quite common, every direct call
3813 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3815 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3817 // We should use an extra load for direct calls to dllimported functions in non-JIT mode.
3819 const GlobalValue *GV = G->getGlobal();
3820 if (!GV->hasDLLImportStorageClass()) {
3821 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3823 Callee = DAG.getTargetGlobalAddress(
3824 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3826 if (OpFlags == X86II::MO_GOTPCREL) {
3828 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3829 getPointerTy(DAG.getDataLayout()), Callee);
3830 // Add extra indirection
3831 Callee = DAG.getLoad(
3832 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3833 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3836 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3837 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3838 unsigned char OpFlags =
3839 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3841 Callee = DAG.getTargetExternalSymbol(
3842 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3844 if (OpFlags == X86II::MO_GOTPCREL) {
3845 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3846 getPointerTy(DAG.getDataLayout()), Callee);
3847 Callee = DAG.getLoad(
3848 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3849 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3851 } else if (Subtarget.isTarget64BitILP32() &&
3852 Callee->getValueType(0) == MVT::i32) {
3853 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3854 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3857 // Returns a chain & a flag for retval copy to use.
3858 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3859 SmallVector<SDValue, 8> Ops;
3861 if (!IsSibcall && isTailCall) {
3862 Chain = DAG.getCALLSEQ_END(Chain,
3863 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3864 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3865 InFlag = Chain.getValue(1);
3868 Ops.push_back(Chain);
3869 Ops.push_back(Callee);
3872 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3874 // Add argument registers to the end of the list so that they are known live
3876 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3877 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3878 RegsToPass[i].second.getValueType()));
3880 // Add a register mask operand representing the call-preserved registers.
3881 // If HasNCSR is asserted (the NoCallerSavedRegisters attribute is present),
3882 // we use the X86_INTR calling convention because it has the same CSR mask
3883 // (same preserved registers).
3884 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3885 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3886 assert(Mask && "Missing call preserved mask for calling convention");
3888 // If this is an invoke in a 32-bit function using a funclet-based
3889 // personality, assume the function clobbers all registers. If an exception
3890 // is thrown, the runtime will not restore CSRs.
3891 // FIXME: Model this more precisely so that we can register allocate across
3892 // the normal edge and spill and fill across the exceptional edge.
3893 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3894 const Function &CallerFn = MF.getFunction();
3895 EHPersonality Pers =
3896 CallerFn.hasPersonalityFn()
3897 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3898 : EHPersonality::Unknown;
3899 if (isFuncletEHPersonality(Pers))
3900 Mask = RegInfo->getNoPreservedMask();
3903 // Define a new register mask from the existing mask.
3904 uint32_t *RegMask = nullptr;
3906 // In some calling conventions we need to remove the used physical registers
3907 // from the reg mask.
3908 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3909 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3911 // Allocate a new Reg Mask and copy Mask.
3912 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3913 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3914 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3916 // Make sure all sub registers of the argument registers are reset in the RegMask.
3918 for (auto const &RegPair : RegsToPass)
3919 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3920 SubRegs.isValid(); ++SubRegs)
3921 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
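// (Bit N of the mask lives in word N/32 at bit position N%32; a set bit means
// the register is preserved across the call, so clearing it marks the argument
// register and its subregisters as clobbered.)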
3923 // Create the RegMask Operand according to our updated mask.
3924 Ops.push_back(DAG.getRegisterMask(RegMask));
3926 // Create the RegMask Operand according to the static mask.
3927 Ops.push_back(DAG.getRegisterMask(Mask));
3930 if (InFlag.getNode())
3931 Ops.push_back(InFlag);
3935 //// If this is the first return lowered for this function, add the regs
3936 //// to the liveout set for the function.
3937 // This isn't right, although it's probably harmless on x86; liveouts
3938 // should be computed from returns not tail calls. Consider a void
3939 // function making a tail call to a function returning int.
3940 MF.getFrameInfo().setHasTailCall();
3941 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3944 if (HasNoCfCheck && IsCFProtectionSupported) {
3945 Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops);
3947 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3949 InFlag = Chain.getValue(1);
3951 // Create the CALLSEQ_END node.
3952 unsigned NumBytesForCalleeToPop;
3953 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3954 DAG.getTarget().Options.GuaranteedTailCallOpt))
3955 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3956 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3957 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3958 SR == StackStructReturn)
3959 // If this is a call to a struct-return function, the callee
3960 // pops the hidden struct pointer, so we have to push it back.
3961 // This is common for Darwin/X86, Linux & Mingw32 targets.
3962 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3963 NumBytesForCalleeToPop = 4;
3965 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3967 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3968 // No need to reset the stack after the call if the call doesn't return. To
3969 // make the MI verify, we'll pretend the callee does it for us.
3970 NumBytesForCalleeToPop = NumBytes;
3973 // Returns a flag for retval copy to use.
3975 Chain = DAG.getCALLSEQ_END(Chain,
3976 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3977 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3980 InFlag = Chain.getValue(1);
3983 // Handle result values, copying them out of physregs into vregs that we
3985 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3989 //===----------------------------------------------------------------------===//
3990 // Fast Calling Convention (tail call) implementation
3991 //===----------------------------------------------------------------------===//
3993 // Like stdcall, the callee cleans up the arguments, except that ECX is
3994 // reserved for storing the tail-called function address. Only 2 registers are
3995 // free for argument passing (inreg). Tail call optimization is performed
3997 // * tailcallopt is enabled
3998 // * caller/callee are fastcc
3999 // On X86_64 architecture with GOT-style position independent code only local
4000 // (within module) calls are supported at the moment.
4001 // To keep the stack aligned according to the platform ABI, the function
4002 // GetAlignedArgumentStackSize ensures that the argument delta is always a
4003 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
4004 // If a tail-called callee has more arguments than the caller, the caller
4005 // needs to make sure that there is room to move the RETADDR to. This is
4006 // achieved by reserving an area the size of the argument delta right after the
4007 // original RETADDR, but before the saved frame pointer or the spilled registers,
4008 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
4020 /// Round up the stack size so it satisfies the alignment requirement, e.g. to 16n + 12 for a 16-byte stack alignment with 4-byte return-address slots.
4023 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
4024 SelectionDAG& DAG) const {
4025 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4026 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
4027 unsigned StackAlignment = TFI.getStackAlignment();
4028 uint64_t AlignMask = StackAlignment - 1;
4029 int64_t Offset = StackSize;
4030 unsigned SlotSize = RegInfo->getSlotSize();
4031 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
4032 // The misaligned part is no more than (alignment - slot size), so just add the difference.
4033 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
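// e.g. with a 16-byte alignment and 4-byte slots, an offset of 20
// (20 & 15 = 4 <= 12) becomes 20 + (12 - 4) = 28, i.e. 16n + 12.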
4035 // Mask out the lower bits, then add one full stack alignment plus (alignment - slot size).
4036 Offset = ((~AlignMask) & Offset) + StackAlignment +
4037 (StackAlignment-SlotSize);
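// e.g. with a 16-byte alignment and 4-byte slots, an offset of 30
// (30 & 15 = 14 > 12) becomes (30 & ~15) + 16 + 12 = 44, i.e. 16n + 12.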
4042 /// Return true if the given stack call argument is already available in the
4043 /// same (relative) position in the caller's incoming argument stack.
4045 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
4046 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4047 const X86InstrInfo *TII, const CCValAssign &VA) {
4048 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4051 // Look through nodes that don't alter the bits of the incoming value.
4052 unsigned Op = Arg.getOpcode();
4053 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4054 Arg = Arg.getOperand(0);
4057 if (Op == ISD::TRUNCATE) {
4058 const SDValue &TruncInput = Arg.getOperand(0);
4059 if (TruncInput.getOpcode() == ISD::AssertZext &&
4060 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4061 Arg.getValueType()) {
4062 Arg = TruncInput.getOperand(0);
4070 if (Arg.getOpcode() == ISD::CopyFromReg) {
4071 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4072 if (!TargetRegisterInfo::isVirtualRegister(VR))
4074 MachineInstr *Def = MRI->getVRegDef(VR);
4077 if (!Flags.isByVal()) {
4078 if (!TII->isLoadFromStackSlot(*Def, FI))
4081 unsigned Opcode = Def->getOpcode();
4082 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4083 Opcode == X86::LEA64_32r) &&
4084 Def->getOperand(1).isFI()) {
4085 FI = Def->getOperand(1).getIndex();
4086 Bytes = Flags.getByValSize();
4090 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4091 if (Flags.isByVal())
4092 // ByVal argument is passed in as a pointer but it's now being
4093 // dereferenced. e.g.
4094 // define @foo(%struct.X* %A) {
4095 // tail call @bar(%struct.X* byval %A)
4098 SDValue Ptr = Ld->getBasePtr();
4099 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4102 FI = FINode->getIndex();
4103 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4104 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4105 FI = FINode->getIndex();
4106 Bytes = Flags.getByValSize();
4110 assert(FI != INT_MAX);
4111 if (!MFI.isFixedObjectIndex(FI))
4114 if (Offset != MFI.getObjectOffset(FI))
4117 // If this is not byval, check that the argument stack object is immutable.
4118 // inalloca and argument copy elision can create mutable argument stack
4119 // objects. Byval objects can be mutated, but a byval call intends to pass the
4121 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4124 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4125 // If the argument location is wider than the argument type, check that any
4126 // extension flags match.
4127 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4128 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4133 return Bytes == MFI.getObjectSize(FI);
4136 /// Check whether the call is eligible for tail call optimization. Targets
4137 /// that want to do tail call optimization should implement this function.
4138 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4139 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4140 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4141 const SmallVectorImpl<ISD::OutputArg> &Outs,
4142 const SmallVectorImpl<SDValue> &OutVals,
4143 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4144 if (!mayTailCallThisCC(CalleeCC))
4147 // If -tailcallopt is specified, make fastcc functions tail-callable.
4148 MachineFunction &MF = DAG.getMachineFunction();
4149 const Function &CallerF = MF.getFunction();
4151 // If the function return type is x86_fp80 and the callee return type is not,
4152 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4153 // perform a tailcall optimization here.
4154 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4157 CallingConv::ID CallerCC = CallerF.getCallingConv();
4158 bool CCMatch = CallerCC == CalleeCC;
4159 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4160 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4162 // Win64 functions have extra shadow space for argument homing. Don't do the
4163 // sibcall if the caller and callee have mismatched expectations for this space.
4165 if (IsCalleeWin64 != IsCallerWin64)
4168 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4169 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4174 // Look for obvious safe cases to perform tail call optimization that do not
4175 // require ABI changes. This is what gcc calls sibcall.
4177 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4178 // emit a special epilogue.
4179 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4180 if (RegInfo->needsStackRealignment(MF))
4183 // Also avoid sibcall optimization if either caller or callee uses struct
4184 // return semantics.
4185 if (isCalleeStructRet || isCallerStructRet)
4188 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4190 LLVMContext &C = *DAG.getContext();
4191 if (isVarArg && !Outs.empty()) {
4192 // Optimizing for varargs on Win64 is unlikely to be safe without
4193 // additional testing.
4194 if (IsCalleeWin64 || IsCallerWin64)
4197 SmallVector<CCValAssign, 16> ArgLocs;
4198 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4200 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4201 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4202 if (!ArgLocs[i].isRegLoc())
4206 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4207 // stack. Therefore, if it's not used by the call it is not safe to optimize
4208 // this into a sibcall.
4209 bool Unused = false;
4210 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4217 SmallVector<CCValAssign, 16> RVLocs;
4218 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4219 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4220 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4221 CCValAssign &VA = RVLocs[i];
4222 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4227 // Check that the call results are passed in the same way.
4228 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4229 RetCC_X86, RetCC_X86))
4231 // The callee has to preserve all registers the caller needs to preserve.
4232 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4233 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4235 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4236 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4240 unsigned StackArgsSize = 0;
4242 // If the callee takes no arguments then go on to check the results of the call.
4244 if (!Outs.empty()) {
4245 // Check if stack adjustment is needed. For now, do not do this if any
4246 // argument is passed on the stack.
4247 SmallVector<CCValAssign, 16> ArgLocs;
4248 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4250 // Allocate shadow area for Win64
4252 CCInfo.AllocateStack(32, 8);
4254 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4255 StackArgsSize = CCInfo.getNextStackOffset();
4257 if (CCInfo.getNextStackOffset()) {
4258 // Check if the arguments are already laid out in the right way as
4259 // the caller's fixed stack objects.
4260 MachineFrameInfo &MFI = MF.getFrameInfo();
4261 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4262 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4263 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4264 CCValAssign &VA = ArgLocs[i];
4265 SDValue Arg = OutVals[i];
4266 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4267 if (VA.getLocInfo() == CCValAssign::Indirect)
4269 if (!VA.isRegLoc()) {
4270 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4277 bool PositionIndependent = isPositionIndependent();
4278 // If the tailcall address may be in a register, then make sure it's
4279 // possible to register allocate for it. In 32-bit, the call address can
4280 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4281 // callee-saved registers are restored. These happen to be the same
4282 // registers used to pass 'inreg' arguments so watch out for those.
4283 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4284 !isa<ExternalSymbolSDNode>(Callee)) ||
4285 PositionIndependent)) {
4286 unsigned NumInRegs = 0;
4287 // In PIC we need an extra register to formulate the address computation for the callee.
4289 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4291 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4292 CCValAssign &VA = ArgLocs[i];
4295 unsigned Reg = VA.getLocReg();
4298 case X86::EAX: case X86::EDX: case X86::ECX:
4299 if (++NumInRegs == MaxInRegs)
4306 const MachineRegisterInfo &MRI = MF.getRegInfo();
4307 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4311 bool CalleeWillPop =
4312 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4313 MF.getTarget().Options.GuaranteedTailCallOpt);
4315 if (unsigned BytesToPop =
4316 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4317 // If we have bytes to pop, the callee must pop them.
4318 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4319 if (!CalleePopMatches)
4321 } else if (CalleeWillPop && StackArgsSize > 0) {
4322 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4330 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4331 const TargetLibraryInfo *libInfo) const {
4332 return X86::createFastISel(funcInfo, libInfo);
4335 //===----------------------------------------------------------------------===//
4336 // Other Lowering Hooks
4337 //===----------------------------------------------------------------------===//
4339 static bool MayFoldLoad(SDValue Op) {
4340 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4343 static bool MayFoldIntoStore(SDValue Op) {
4344 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4347 static bool MayFoldIntoZeroExtend(SDValue Op) {
4348 if (Op.hasOneUse()) {
4349 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4350 return (ISD::ZERO_EXTEND == Opcode);
4355 static bool isTargetShuffle(unsigned Opcode) {
4357 default: return false;
4358 case X86ISD::BLENDI:
4359 case X86ISD::PSHUFB:
4360 case X86ISD::PSHUFD:
4361 case X86ISD::PSHUFHW:
4362 case X86ISD::PSHUFLW:
4364 case X86ISD::INSERTPS:
4365 case X86ISD::EXTRQI:
4366 case X86ISD::INSERTQI:
4367 case X86ISD::PALIGNR:
4368 case X86ISD::VSHLDQ:
4369 case X86ISD::VSRLDQ:
4370 case X86ISD::MOVLHPS:
4371 case X86ISD::MOVHLPS:
4372 case X86ISD::MOVSHDUP:
4373 case X86ISD::MOVSLDUP:
4374 case X86ISD::MOVDDUP:
4377 case X86ISD::UNPCKL:
4378 case X86ISD::UNPCKH:
4379 case X86ISD::VBROADCAST:
4380 case X86ISD::VPERMILPI:
4381 case X86ISD::VPERMILPV:
4382 case X86ISD::VPERM2X128:
4383 case X86ISD::SHUF128:
4384 case X86ISD::VPERMIL2:
4385 case X86ISD::VPERMI:
4386 case X86ISD::VPPERM:
4387 case X86ISD::VPERMV:
4388 case X86ISD::VPERMV3:
4389 case X86ISD::VZEXT_MOVL:
4394 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4396 default: return false;
4398 case X86ISD::PSHUFB:
4399 case X86ISD::VPERMILPV:
4400 case X86ISD::VPERMIL2:
4401 case X86ISD::VPPERM:
4402 case X86ISD::VPERMV:
4403 case X86ISD::VPERMV3:
4405 // 'Faux' Target Shuffles.
4412 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4413 MachineFunction &MF = DAG.getMachineFunction();
4414 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4415 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4416 int ReturnAddrIndex = FuncInfo->getRAIndex();
4418 if (ReturnAddrIndex == 0) {
4419 // Set up a frame object for the return address.
4420 unsigned SlotSize = RegInfo->getSlotSize();
4421 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4424 FuncInfo->setRAIndex(ReturnAddrIndex);
4427 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4430 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4431 bool hasSymbolicDisplacement) {
4432 // Offset should fit into a 32-bit immediate field.
4433 if (!isInt<32>(Offset))
return false;
4436 // If we don't have a symbolic displacement - we don't have any extra
// restrictions.
4438 if (!hasSymbolicDisplacement)
return true;
4441 // FIXME: Some tweaks might be needed for medium code model.
4442 if (M != CodeModel::Small && M != CodeModel::Kernel)
return false;
4445 // For the small code model we assume that the last object is 16MB before the
4446 // end of the 31-bit boundary. We may also accept pretty large negative
4447 // constants knowing that all objects are in the positive half of the address space.
4448 if (M == CodeModel::Small && Offset < 16*1024*1024)
return true;
4451 // For the kernel code model we know that all objects reside in the negative
4452 // half of the 32-bit address space. We must not accept negative offsets, since
4453 // they may push the address out of that range, but we may accept pretty large positive ones.
4454 if (M == CodeModel::Kernel && Offset >= 0)
return true;
return false;
}
4460 /// Determines whether the callee is required to pop its own arguments.
4461 /// Callee pop is necessary to support tail calls.
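/// For example, the 32-bit stdcall, fastcall, thiscall and vectorcall
/// conventions pop their own stack arguments on return (ret imm16), which is
/// why the cases below return !is64Bit.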
4462 bool X86::isCalleePop(CallingConv::ID CallingConv,
4463 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4464 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4465 // can guarantee TCO.
4466 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
return true;
4469 switch (CallingConv) {
default:
return false;
4472 case CallingConv::X86_StdCall:
4473 case CallingConv::X86_FastCall:
4474 case CallingConv::X86_ThisCall:
4475 case CallingConv::X86_VectorCall:
return !is64Bit;
}
}
4480 /// Return true if the condition is an unsigned comparison operation.
4481 static bool isX86CCUnsigned(unsigned X86CC) {
4484 llvm_unreachable("Invalid integer condition!");
4500 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4501 switch (SetCCOpcode) {
4502 default: llvm_unreachable("Invalid integer condition!");
4503 case ISD::SETEQ: return X86::COND_E;
4504 case ISD::SETGT: return X86::COND_G;
4505 case ISD::SETGE: return X86::COND_GE;
4506 case ISD::SETLT: return X86::COND_L;
4507 case ISD::SETLE: return X86::COND_LE;
4508 case ISD::SETNE: return X86::COND_NE;
4509 case ISD::SETULT: return X86::COND_B;
4510 case ISD::SETUGT: return X86::COND_A;
4511 case ISD::SETULE: return X86::COND_BE;
4512 case ISD::SETUGE: return X86::COND_AE;
4516 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4517 /// condition code, returning the condition code and the LHS/RHS of the
4518 /// comparison to make.
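/// For example, an integer 'X > -1' (SETGT with an all-ones RHS) is rewritten
/// below as a compare of X against zero that uses COND_NS (sign flag clear).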
4519 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4520 bool isFP, SDValue &LHS, SDValue &RHS,
4521 SelectionDAG &DAG) {
4523 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4524 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4525 // X > -1 -> X == 0, jump !sign.
4526 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4527 return X86::COND_NS;
4529 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4530 // X < 0 -> X == 0, jump on sign.
return X86::COND_S;
}
4533 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
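// X < 1 -> X <= 0: compare against zero and use COND_LE.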
4535 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4536 return X86::COND_LE;
4540 return TranslateIntegerX86CC(SetCCOpcode);
4543 // First determine if it is required or is profitable to flip the operands.
4545 // If LHS is a foldable load, but RHS is not, flip the condition.
4546 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4547 !ISD::isNON_EXTLoad(RHS.getNode())) {
4548 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4549 std::swap(LHS, RHS);
4552 switch (SetCCOpcode) {
4558 std::swap(LHS, RHS);
4562 // On a floating point condition, the flags are set as follows:
// ZF  PF  CF   op
4564 // 0 | 0 | 0 | X > Y
4565 // 0 | 0 | 1 | X < Y
4566 // 1 | 0 | 0 | X == Y
4567 // 1 | 1 | 1 | unordered
4568 switch (SetCCOpcode) {
4569 default: llvm_unreachable("Condcode should be pre-legalized away");
4571 case ISD::SETEQ: return X86::COND_E;
4572 case ISD::SETOLT: // flipped
4574 case ISD::SETGT: return X86::COND_A;
4575 case ISD::SETOLE: // flipped
4577 case ISD::SETGE: return X86::COND_AE;
4578 case ISD::SETUGT: // flipped
4580 case ISD::SETLT: return X86::COND_B;
4581 case ISD::SETUGE: // flipped
4583 case ISD::SETLE: return X86::COND_BE;
4585 case ISD::SETNE: return X86::COND_NE;
4586 case ISD::SETUO: return X86::COND_P;
4587 case ISD::SETO: return X86::COND_NP;
4589 case ISD::SETUNE: return X86::COND_INVALID;
4593 /// Is there a floating point cmov for the specific X86 condition code?
4594 /// Current x86 isa includes the following FP cmov instructions:
4595 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4596 static bool hasFPCMov(unsigned X86CC) {
4613 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
4615 MachineFunction &MF,
4616 unsigned Intrinsic) const {
4618 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4622 Info.opc = ISD::INTRINSIC_W_CHAIN;
4623 Info.flags = MachineMemOperand::MONone;
4626 switch (IntrData->Type) {
4627 case TRUNCATE_TO_MEM_VI8:
4628 case TRUNCATE_TO_MEM_VI16:
4629 case TRUNCATE_TO_MEM_VI32: {
4630 Info.ptrVal = I.getArgOperand(0);
4631 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4632 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4633 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
ScalarVT = MVT::i8;
4635 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4636 ScalarVT = MVT::i16;
4637 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4638 ScalarVT = MVT::i32;
4640 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4642 Info.flags |= MachineMemOperand::MOStore;
4652 /// Returns true if the target can instruction select the
4653 /// specified FP immediate natively. If false, the legalizer will
4654 /// materialize the FP immediate as a load from a constant pool.
4655 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4656 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4657 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4663 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4664 ISD::LoadExtType ExtTy,
EVT NewVT) const {
4666 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4667 // relocations must target a movq or addq instruction: don't let the load shrink.
4668 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4669 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4670 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4671 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4675 /// Returns true if it is beneficial to convert a load of a constant
4676 /// to just the constant itself.
4677 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4679 assert(Ty->isIntegerTy());
4681 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4682 if (BitSize == 0 || BitSize > 64)
4687 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4688 // TODO: It might be a win to ease or lift this restriction, but the generic
4689 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4690 if (VT.isVector() && Subtarget.hasAVX512())
4696 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4697 unsigned Index) const {
4698 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4701 // Mask vectors support all subregister combinations and operations that
4702 // extract half of vector.
4703 if (ResVT.getVectorElementType() == MVT::i1)
4704 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4705 (Index == ResVT.getVectorNumElements()));
4707 return (Index % ResVT.getVectorNumElements()) == 0;
4710 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4711 // Speculate cttz only if we can directly use TZCNT.
4712 return Subtarget.hasBMI();
4715 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4716 // Speculate ctlz only if we can directly use LZCNT.
4717 return Subtarget.hasLZCNT();
4720 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4721 EVT BitcastVT) const {
4722 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4725 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4728 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4729 const SelectionDAG &DAG) const {
4730 // Do not merge to a float/vector value size (128 bits) if the
4731 // NoImplicitFloat function attribute is set.
4732 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4733 Attribute::NoImplicitFloat);
if (NoFloat) {
4736 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4737 return (MemVT.getSizeInBits() <= MaxIntSize);
}
return true;
}
4742 bool X86TargetLowering::isCtlzFast() const {
4743 return Subtarget.hasFastLZCNT();
4746 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4747 const Instruction &AndI) const {
4751 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4752 EVT VT = Y.getValueType();
4757 if (!Subtarget.hasBMI())
4760 // There are only 32-bit and 64-bit forms for 'andn'.
4761 if (VT != MVT::i32 && VT != MVT::i64)
4764 // A mask and compare against constant is ok for an 'andn' too
4765 // even though the BMI instruction doesn't have an immediate form.
4770 bool X86TargetLowering::hasAndNot(SDValue Y) const {
4771 EVT VT = Y.getValueType();
4773 if (!VT.isVector()) // x86 can't form 'andn' with an immediate.
4774 return !isa<ConstantSDNode>(Y) && hasAndNotCompare(Y);
4778 if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128)
4781 if (VT == MVT::v4i32)
4784 return Subtarget.hasSSE2();
4787 bool X86TargetLowering::preferShiftsToClearExtremeBits(SDValue Y) const {
4788 EVT VT = Y.getValueType();
4790 // For vectors, we don't have a preference, but we probably want a mask.
4794 // 64-bit shifts on 32-bit targets produce really bad bloated code.
4795 if (VT == MVT::i64 && !Subtarget.is64Bit())
4801 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4802 MVT VT = MVT::getIntegerVT(NumBits);
4803 if (isTypeLegal(VT))
4806 // PMOVMSKB can handle this.
4807 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4810 // VPMOVMSKB can handle this.
4811 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4814 // TODO: Allow 64-bit type for 32-bit target.
4815 // TODO: 512-bit types should be allowed, but make sure that those
4816 // cases are handled in combineVectorSizedSetCCEquality().
4818 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4821 /// Val is the undef sentinel value or equal to the specified value.
4822 static bool isUndefOrEqual(int Val, int CmpVal) {
4823 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4826 /// Val is either the undef or zero sentinel value.
4827 static bool isUndefOrZero(int Val) {
4828 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4831 /// Return true if every element in Mask, beginning
4832 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4833 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4834 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4835 if (Mask[i] != SM_SentinelUndef)
return false;
return true;
}
4840 /// Return true if Val falls within the specified range [Low, Hi).
4841 static bool isInRange(int Val, int Low, int Hi) {
4842 return (Val >= Low && Val < Hi);
4845 /// Return true if the value of any element in Mask falls within the specified
/// range [Low, Hi).
4847 static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
for (int M : Mask)
4849 if (isInRange(M, Low, Hi))
return true;
return false;
}
4854 /// Return true if Val is undef or if its value falls within the
4855 /// specified range [Low, Hi).
4856 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4857 return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi);
4860 /// Return true if every element in Mask is undef or if its value
4861 /// falls within the specified range [Low, Hi).
4862 static bool isUndefOrInRange(ArrayRef<int> Mask,
int Low, int Hi) {
for (int M : Mask)
4865 if (!isUndefOrInRange(M, Low, Hi))
return false;
return true;
}
4870 /// Return true if Val is undef, zero or if its value falls within the
4871 /// specified range [Low, Hi).
4872 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4873 return isUndefOrZero(Val) || isInRange(Val, Low, Hi);
4876 /// Return true if every element in Mask is undef, zero or if its value
4877 /// falls within the specified range [Low, Hi).
4878 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
for (int M : Mask)
4880 if (!isUndefOrZeroOrInRange(M, Low, Hi))
return false;
return true;
}
4885 /// Return true if every element in Mask, beginning
4886 /// from position Pos and ending in Pos + Size, falls within the specified
4887 /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef.
4888 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos,
4889 unsigned Size, int Low, int Step = 1) {
4890 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step)
4891 if (!isUndefOrEqual(Mask[i], Low))
return false;
return true;
}
4896 /// Return true if every element in Mask, beginning
4897 /// from position Pos and ending in Pos+Size, falls within the specified
4898 /// sequential range [Low, Low + Size), or is undef or is zero.
4899 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4900 unsigned Size, int Low) {
4901 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4902 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
return false;
return true;
}
4907 /// Return true if every element in Mask, beginning
4908 /// from position Pos and ending in Pos+Size is undef or is zero.
4909 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
unsigned Size) {
4911 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4912 if (!isUndefOrZero(Mask[i]))
return false;
return true;
}
4917 /// Helper function to test whether a shuffle mask could be
4918 /// simplified by widening the elements being shuffled.
4920 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4921 /// leaves it in an unspecified state.
4923 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4924 /// shuffle masks. The latter have the special property of a '-2' representing
4925 /// a zero-ed lane of a vector.
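/// For example, the v4i32 mask <0, 1, 6, 7> widens to the v2i64 mask <0, 3>,
/// while <0, 2, 4, 6> cannot be widened because no element pair is adjacent.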
4926 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4927 SmallVectorImpl<int> &WidenedMask) {
4928 WidenedMask.assign(Mask.size() / 2, 0);
4929 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4931 int M1 = Mask[i + 1];
4933 // If both elements are undef, it's trivial.
4934 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4935 WidenedMask[i / 2] = SM_SentinelUndef;
4939 // Check for an undef mask and a mask value properly aligned to fit with
4940 // a pair of values. If we find such a case, use the non-undef mask's value.
4941 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4942 WidenedMask[i / 2] = M1 / 2;
4945 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4946 WidenedMask[i / 2] = M0 / 2;
4950 // When zeroing, we need to spread the zeroing across both lanes to widen.
4951 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4952 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4953 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4954 WidenedMask[i / 2] = SM_SentinelZero;
4960 // Finally check if the two mask values are adjacent and aligned so that they form a pair.
4962 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4963 WidenedMask[i / 2] = M0 / 2;
4967 // Otherwise we can't safely widen the elements used in this shuffle.
4970 assert(WidenedMask.size() == Mask.size() / 2 &&
4971 "Incorrect size of mask after widening the elements!");
4976 static bool canWidenShuffleElements(ArrayRef<int> Mask) {
4977 SmallVector<int, 32> WidenedMask;
4978 return canWidenShuffleElements(Mask, WidenedMask);
4981 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4982 bool X86::isZeroNode(SDValue Elt) {
4983 return isNullConstant(Elt) || isNullFPConstant(Elt);
4986 // Build a vector of constants.
4987 // Use an UNDEF node if MaskElt == -1.
4988 // In 32-bit mode, split 64-bit constants into pairs of 32-bit constants.
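// For example, on a 32-bit target a v2i64 constant is built as a v4i32
// constant and bitcast back to v2i64.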
4989 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4990 const SDLoc &dl, bool IsMask = false) {
4992 SmallVector<SDValue, 32> Ops;
4995 MVT ConstVecVT = VT;
4996 unsigned NumElts = VT.getVectorNumElements();
4997 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4998 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4999 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5003 MVT EltVT = ConstVecVT.getVectorElementType();
5004 for (unsigned i = 0; i < NumElts; ++i) {
5005 bool IsUndef = Values[i] < 0 && IsMask;
5006 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
5007 DAG.getConstant(Values[i], dl, EltVT);
5008 Ops.push_back(OpNode);
5010 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
5011 DAG.getConstant(0, dl, EltVT));
5013 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5015 ConstsNode = DAG.getBitcast(VT, ConstsNode);
5019 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
5020 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5021 assert(Bits.size() == Undefs.getBitWidth() &&
5022 "Unequal constant and undef arrays");
5023 SmallVector<SDValue, 32> Ops;
5026 MVT ConstVecVT = VT;
5027 unsigned NumElts = VT.getVectorNumElements();
5028 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
5029 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
5030 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
5034 MVT EltVT = ConstVecVT.getVectorElementType();
5035 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
5037 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
5040 const APInt &V = Bits[i];
5041 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
5043 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
5044 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
5045 } else if (EltVT == MVT::f32) {
5046 APFloat FV(APFloat::IEEEsingle(), V);
5047 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5048 } else if (EltVT == MVT::f64) {
5049 APFloat FV(APFloat::IEEEdouble(), V);
5050 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
5052 Ops.push_back(DAG.getConstant(V, dl, EltVT));
5056 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
5057 return DAG.getBitcast(VT, ConstsNode);
5060 /// Returns a vector of specified type with all zero elements.
5061 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
5062 SelectionDAG &DAG, const SDLoc &dl) {
5063 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
5064 VT.getVectorElementType() == MVT::i1) &&
5065 "Unexpected vector type");
5067 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
5068 // type. This ensures they get CSE'd. But if the integer type is not
5069 // available, use a floating-point +0.0 instead.
5071 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
5072 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
5073 } else if (VT.getVectorElementType() == MVT::i1) {
5074 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
5075 "Unexpected vector type");
5076 Vec = DAG.getConstant(0, dl, VT);
5078 unsigned Num32BitElts = VT.getSizeInBits() / 32;
5079 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5081 return DAG.getBitcast(VT, Vec);
5084 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5085 const SDLoc &dl, unsigned vectorWidth) {
5086 EVT VT = Vec.getValueType();
5087 EVT ElVT = VT.getVectorElementType();
5088 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5089 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5090 VT.getVectorNumElements()/Factor);
5092 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5093 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5094 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5096 // This is the index of the first element of the vectorWidth-bit chunk
5097 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
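// For example, with 128-bit chunks of a v8i32 vector (4 elements per chunk),
// an IdxVal of 6 is rounded down to 4.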
5098 IdxVal &= ~(ElemsPerChunk - 1);
5100 // If the input is a buildvector just emit a smaller one.
5101 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5102 return DAG.getBuildVector(ResultVT, dl,
5103 Vec->ops().slice(IdxVal, ElemsPerChunk));
5105 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5106 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5109 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5110 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5111 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5112 /// instructions or a simple subregister reference. Idx is an index in the
5113 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5114 /// lowering EXTRACT_VECTOR_ELT operations easier.
5115 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5116 SelectionDAG &DAG, const SDLoc &dl) {
5117 assert((Vec.getValueType().is256BitVector() ||
5118 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5119 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5122 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5123 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5124 SelectionDAG &DAG, const SDLoc &dl) {
5125 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5126 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5129 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5130 SelectionDAG &DAG, const SDLoc &dl,
5131 unsigned vectorWidth) {
5132 assert((vectorWidth == 128 || vectorWidth == 256) &&
5133 "Unsupported vector width");
5134 // Inserting an UNDEF subvector just returns the original Result.
if (Vec.isUndef())
return Result;
5137 EVT VT = Vec.getValueType();
5138 EVT ElVT = VT.getVectorElementType();
5139 EVT ResultVT = Result.getValueType();
5141 // Insert the relevant vectorWidth bits.
5142 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5143 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5145 // This is the index of the first element of the vectorWidth-bit chunk
5146 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5147 IdxVal &= ~(ElemsPerChunk - 1);
5149 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5150 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5153 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5154 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5155 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5156 /// simple superregister reference. Idx is an index in the 128 bits
5157 /// we want. It need not be aligned to a 128-bit boundary. That makes
5158 /// lowering INSERT_VECTOR_ELT operations easier.
5159 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5160 SelectionDAG &DAG, const SDLoc &dl) {
5161 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5162 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5165 /// Widen a vector to a larger size with the same scalar type, with the new
5166 /// elements either zero or undef.
5167 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5168 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5170 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5171 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5172 "Unsupported vector widening type");
5173 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5175 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5176 DAG.getIntPtrConstant(0, dl));
5179 // Helper for splitting operands of an operation to legal target size and
5180 // apply a function on each part.
5181 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5182 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5183 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5184 // The argument Builder is a function that will be applied on each split part:
5185 // SDValue Builder(SelectionDAG&G, SDLoc, ArrayRef<SDValue>)
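// For example, a v64i8 operation on an AVX2 target without BWI is split into
// two v32i8 parts and the partial results are concatenated back together.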
5186 template <typename F>
5187 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5188 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5189 F Builder, bool CheckBWI = true) {
5190 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5191 unsigned NumSubs = 1;
5192 if ((CheckBWI && Subtarget.useBWIRegs()) ||
5193 (!CheckBWI && Subtarget.useAVX512Regs())) {
5194 if (VT.getSizeInBits() > 512) {
5195 NumSubs = VT.getSizeInBits() / 512;
5196 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5198 } else if (Subtarget.hasAVX2()) {
5199 if (VT.getSizeInBits() > 256) {
5200 NumSubs = VT.getSizeInBits() / 256;
5201 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5204 if (VT.getSizeInBits() > 128) {
5205 NumSubs = VT.getSizeInBits() / 128;
5206 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
if (NumSubs == 1)
5211 return Builder(DAG, DL, Ops);
5213 SmallVector<SDValue, 4> Subs;
5214 for (unsigned i = 0; i != NumSubs; ++i) {
5215 SmallVector<SDValue, 2> SubOps;
5216 for (SDValue Op : Ops) {
5217 EVT OpVT = Op.getValueType();
5218 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5219 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5220 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5222 Subs.push_back(Builder(DAG, DL, SubOps));
5224 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
5227 // Return true if the instruction zeroes the unused upper part of the
5228 // destination and accepts a mask.
5229 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5234 case X86ISD::CMPM_RND:
5240 /// Insert i1-subvector to i1-vector.
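/// The subvector is widened to a natively supported mask type (v8i1/v16i1)
/// when needed, positioned with KSHIFTL/KSHIFTR, merged with OR/XOR, and the
/// result is truncated back to the original type with EXTRACT_SUBVECTOR.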
5241 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5242 const X86Subtarget &Subtarget) {
5245 SDValue Vec = Op.getOperand(0);
5246 SDValue SubVec = Op.getOperand(1);
5247 SDValue Idx = Op.getOperand(2);
5249 if (!isa<ConstantSDNode>(Idx))
5252 // Inserting undef is a nop. We can just return the original vector.
5253 if (SubVec.isUndef())
5256 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5257 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
5260 MVT OpVT = Op.getSimpleValueType();
5261 unsigned NumElems = OpVT.getVectorNumElements();
5263 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5265 // Extend to natively supported kshift.
5266 MVT WideOpVT = OpVT;
5267 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5268 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5270 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts if needed.
5272 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5273 // May need to promote to a legal type.
5274 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5275 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5277 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5280 MVT SubVecVT = SubVec.getSimpleValueType();
5281 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5283 assert(IdxVal + SubVecNumElems <= NumElems &&
5284 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5285 "Unexpected index value in INSERT_SUBVECTOR");
5287 SDValue Undef = DAG.getUNDEF(WideOpVT);
5290 // Zero lower bits of the Vec
5291 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5292 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5294 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5295 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5296 // Merge them together, SubVec should be zero extended.
5297 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5298 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5300 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5301 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5304 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5305 Undef, SubVec, ZeroIdx);
5307 if (Vec.isUndef()) {
5308 assert(IdxVal != 0 && "Unexpected index");
5309 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5310 DAG.getConstant(IdxVal, dl, MVT::i8));
5311 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5314 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5315 assert(IdxVal != 0 && "Unexpected index");
5316 NumElems = WideOpVT.getVectorNumElements();
5317 unsigned ShiftLeft = NumElems - SubVecNumElems;
5318 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5319 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5320 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5321 if (ShiftRight != 0)
5322 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5323 DAG.getConstant(ShiftRight, dl, MVT::i8));
5324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5327 // Simple case when we put subvector in the upper part
5328 if (IdxVal + SubVecNumElems == NumElems) {
5329 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5330 DAG.getConstant(IdxVal, dl, MVT::i8));
5331 if (SubVecNumElems * 2 == NumElems) {
5332 // Special case, use legal zero extending insert_subvector. This allows
5333 // isel to optimize when bits are known zero.
5334 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5335 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5336 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5339 // Otherwise use explicit shifts to zero the bits.
5340 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5341 Undef, Vec, ZeroIdx);
5342 NumElems = WideOpVT.getVectorNumElements();
5343 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5344 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5345 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5347 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5348 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5351 // Inserting into the middle is more complicated.
5353 NumElems = WideOpVT.getVectorNumElements();
5355 // Widen the vector if needed.
5356 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5357 // Move the current value of the bit to be replaced to the lsbs.
5358 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5359 DAG.getConstant(IdxVal, dl, MVT::i8));
5360 // Xor with the new bit.
5361 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5362 // Shift to MSB, filling bottom bits with 0.
5363 unsigned ShiftLeft = NumElems - SubVecNumElems;
5364 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5365 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5366 // Shift to the final position, filling upper bits with 0.
5367 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5368 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5369 DAG.getConstant(ShiftRight, dl, MVT::i8));
5370 // Xor with original vector leaving the new value.
5371 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5372 // Reduce to original width if needed.
5373 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5376 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5377 unsigned NumElems, SelectionDAG &DAG,
5378 const SDLoc &dl, unsigned VectorWidth) {
5379 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5380 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5383 /// Returns a vector of specified type with all bits set.
5384 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5385 /// Then bitcast to their original type, ensuring they get CSE'd.
5386 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5387 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5388 "Expected a 128/256/512-bit vector type");
5390 APInt Ones = APInt::getAllOnesValue(32);
5391 unsigned NumElts = VT.getSizeInBits() / 32;
5392 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5393 return DAG.getBitcast(VT, Vec);
5396 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5397 SelectionDAG &DAG) {
5398 EVT InVT = In.getValueType();
5399 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5401 if (VT.is128BitVector() && InVT.is128BitVector())
5402 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5403 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5405 // For 256-bit vectors, we only need the lower (128-bit) input half.
5406 // For 512-bit vectors, we only need the lower input half or quarter.
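// For example, extending to v16i32 from a 512-bit v64i8 source only needs the
// low 128 bits (v16i8) of the input.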
5407 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5408 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5409 In = extractSubVector(In, 0, DAG, DL,
5410 std::max(128, (int)VT.getSizeInBits() / Scale));
5413 return DAG.getNode(Opc, DL, VT, In);
5416 /// Returns a vector_shuffle node for an unpackl operation.
5417 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5418 SDValue V1, SDValue V2) {
5419 SmallVector<int, 8> Mask;
5420 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5421 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5424 /// Returns a vector_shuffle node for an unpackh operation.
5425 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5426 SDValue V1, SDValue V2) {
5427 SmallVector<int, 8> Mask;
5428 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5429 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5432 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5433 /// This produces a shuffle where the low element of V2 is swizzled into the
5434 /// zero/undef vector, landing at element Idx.
5435 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5436 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
bool IsZero,
5438 const X86Subtarget &Subtarget,
5439 SelectionDAG &DAG) {
5440 MVT VT = V2.getSimpleValueType();
SDValue V1 = IsZero
5442 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5443 int NumElems = VT.getVectorNumElements();
5444 SmallVector<int, 16> MaskVec(NumElems);
5445 for (int i = 0; i != NumElems; ++i)
5446 // If this is the insertion idx, put the low elt of V2 here.
5447 MaskVec[i] = (i == Idx) ? NumElems : i;
5448 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5451 static SDValue peekThroughBitcasts(SDValue V) {
5452 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5453 V = V.getOperand(0);
5457 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5458 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5459 V.getOperand(0).hasOneUse())
5460 V = V.getOperand(0);
5464 // Peek through EXTRACT_SUBVECTORs - typically used for AVX1 256-bit intops.
5465 static SDValue peekThroughEXTRACT_SUBVECTORs(SDValue V) {
5466 while (V.getOpcode() == ISD::EXTRACT_SUBVECTOR)
5467 V = V.getOperand(0);
5471 static const Constant *getTargetConstantFromNode(SDValue Op) {
5472 Op = peekThroughBitcasts(Op);
5474 auto *Load = dyn_cast<LoadSDNode>(Op);
5478 SDValue Ptr = Load->getBasePtr();
5479 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5480 Ptr->getOpcode() == X86ISD::WrapperRIP)
5481 Ptr = Ptr->getOperand(0);
5483 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5484 if (!CNode || CNode->isMachineConstantPoolEntry())
5487 return dyn_cast<Constant>(CNode->getConstVal());
5490 // Extract raw constant bits from constant pools.
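// The bits are returned as NumElts = SizeInBits / EltSizeInBits elements of
// EltSizeInBits each, re-packed from the source element width if necessary;
// UndefElts marks the result elements whose bits are entirely undef.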
5491 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5493 SmallVectorImpl<APInt> &EltBits,
5494 bool AllowWholeUndefs = true,
5495 bool AllowPartialUndefs = true) {
5496 assert(EltBits.empty() && "Expected an empty EltBits vector");
5498 Op = peekThroughBitcasts(Op);
5500 EVT VT = Op.getValueType();
5501 unsigned SizeInBits = VT.getSizeInBits();
5502 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5503 unsigned NumElts = SizeInBits / EltSizeInBits;
5505 // Bitcast a source array of element bits to the target size.
5506 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5507 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5508 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5509 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5510 "Constant bit sizes don't match");
5512 // Don't split if we don't allow undef bits.
5513 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5514 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5517 // If we're already the right size, don't bother bitcasting.
5518 if (NumSrcElts == NumElts) {
5519 UndefElts = UndefSrcElts;
5520 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5524 // Extract all the undef/constant element data and pack into single bitsets.
5525 APInt UndefBits(SizeInBits, 0);
5526 APInt MaskBits(SizeInBits, 0);
5528 for (unsigned i = 0; i != NumSrcElts; ++i) {
5529 unsigned BitOffset = i * SrcEltSizeInBits;
5530 if (UndefSrcElts[i])
5531 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5532 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5535 // Split the undef/constant single bitset data into the target elements.
5536 UndefElts = APInt(NumElts, 0);
5537 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5539 for (unsigned i = 0; i != NumElts; ++i) {
5540 unsigned BitOffset = i * EltSizeInBits;
5541 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5543 // Only treat an element as UNDEF if all bits are UNDEF.
5544 if (UndefEltBits.isAllOnesValue()) {
5545 if (!AllowWholeUndefs)
5547 UndefElts.setBit(i);
5551 // If only some bits are UNDEF then treat them as zero (or bail if not supported).
5553 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5556 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5557 EltBits[i] = Bits.getZExtValue();
5562 // Collect constant bits and insert into mask/undef bit masks.
5563 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5564 unsigned UndefBitIndex) {
5567 if (isa<UndefValue>(Cst)) {
5568 Undefs.setBit(UndefBitIndex);
5571 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5572 Mask = CInt->getValue();
5575 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5576 Mask = CFP->getValueAPF().bitcastToAPInt();
// Handle a fully-undef operand.
if (Op.isUndef()) {
5584 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5585 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5586 return CastBitData(UndefSrcElts, SrcEltBits);
}
5589 // Extract scalar constant bits.
5590 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5591 APInt UndefSrcElts = APInt::getNullValue(1);
5592 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5593 return CastBitData(UndefSrcElts, SrcEltBits);
5595 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5596 APInt UndefSrcElts = APInt::getNullValue(1);
5597 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5598 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5599 return CastBitData(UndefSrcElts, SrcEltBits);
5602 // Extract constant bits from build vector.
5603 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5604 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5605 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5607 APInt UndefSrcElts(NumSrcElts, 0);
5608 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5609 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5610 const SDValue &Src = Op.getOperand(i);
5611 if (Src.isUndef()) {
5612 UndefSrcElts.setBit(i);
5615 auto *Cst = cast<ConstantSDNode>(Src);
5616 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5618 return CastBitData(UndefSrcElts, SrcEltBits);
5621 // Extract constant bits from constant pool vector.
5622 if (auto *Cst = getTargetConstantFromNode(Op)) {
5623 Type *CstTy = Cst->getType();
5624 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5627 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5628 unsigned NumSrcElts = CstTy->getVectorNumElements();
5630 APInt UndefSrcElts(NumSrcElts, 0);
5631 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5632 for (unsigned i = 0; i != NumSrcElts; ++i)
5633 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5637 return CastBitData(UndefSrcElts, SrcEltBits);
5640 // Extract constant bits from a broadcasted constant pool scalar.
5641 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5642 EltSizeInBits <= VT.getScalarSizeInBits()) {
5643 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5644 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5645 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5647 APInt UndefSrcElts(NumSrcElts, 0);
5648 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5649 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5650 if (UndefSrcElts[0])
5651 UndefSrcElts.setBits(0, NumSrcElts);
5652 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5653 return CastBitData(UndefSrcElts, SrcEltBits);
5658 // Extract a rematerialized scalar constant insertion.
5659 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5660 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5661 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5662 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5663 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5665 APInt UndefSrcElts(NumSrcElts, 0);
5666 SmallVector<APInt, 64> SrcEltBits;
5667 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5668 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5669 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5670 return CastBitData(UndefSrcElts, SrcEltBits);
5676 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5677 unsigned MaskEltSizeInBits,
5678 SmallVectorImpl<uint64_t> &RawMask) {
APInt UndefElts;
5680 SmallVector<APInt, 64> EltBits;
5682 // Extract the raw target constant bits.
5683 // FIXME: We currently don't support UNDEF bits or mask entries.
5684 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5685 EltBits, /* AllowWholeUndefs */ false,
5686 /* AllowPartialUndefs */ false))
5689 // Insert the extracted elements into the mask.
5690 for (APInt Elt : EltBits)
5691 RawMask.push_back(Elt.getZExtValue());
5696 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5697 /// Note: This ignores saturation, so inputs must be checked first.
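/// For example, a binary v16i8 pack produces the mask
/// <0,2,...,14, 16,18,...,30>, i.e. the low byte of every i16 element of both
/// pre-truncation inputs.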
5698 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5700 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5701 unsigned NumElts = VT.getVectorNumElements();
5702 unsigned NumLanes = VT.getSizeInBits() / 128;
5703 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5704 unsigned Offset = Unary ? 0 : NumElts;
5706 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5707 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5708 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5709 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5710 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5714 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5715 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5716 /// operands in \p Ops, and returns true.
5717 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5718 /// IsUnary for shuffles which use a single input multiple times, and in those
5719 /// cases it will adjust the mask to only have indices within that single input.
5720 /// It is an error to call this with non-empty Mask/Ops vectors.
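/// For example, a v4f32 X86ISD::MOVHLPS decodes to the mask <6, 7, 2, 3>.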
5721 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5722 SmallVectorImpl<SDValue> &Ops,
5723 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5724 unsigned NumElems = VT.getVectorNumElements();
5727 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5728 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5731 bool IsFakeUnary = false;
5732 switch(N->getOpcode()) {
5733 case X86ISD::BLENDI:
5734 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5735 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5736 ImmN = N->getOperand(N->getNumOperands()-1);
5737 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5738 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5741 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5742 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5743 ImmN = N->getOperand(N->getNumOperands()-1);
5744 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5745 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5746 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5748 case X86ISD::INSERTPS:
5749 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5750 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5751 ImmN = N->getOperand(N->getNumOperands()-1);
5752 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5753 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5755 case X86ISD::EXTRQI:
5756 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5757 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5758 isa<ConstantSDNode>(N->getOperand(2))) {
5759 int BitLen = N->getConstantOperandVal(1);
5760 int BitIdx = N->getConstantOperandVal(2);
5761 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5766 case X86ISD::INSERTQI:
5767 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5768 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5769 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5770 isa<ConstantSDNode>(N->getOperand(3))) {
5771 int BitLen = N->getConstantOperandVal(2);
5772 int BitIdx = N->getConstantOperandVal(3);
5773 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5775 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5778 case X86ISD::UNPCKH:
5779 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5780 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5781 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5782 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5784 case X86ISD::UNPCKL:
5785 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5786 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5787 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5788 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5790 case X86ISD::MOVHLPS:
5791 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5792 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5793 DecodeMOVHLPSMask(NumElems, Mask);
5794 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5796 case X86ISD::MOVLHPS:
5797 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5798 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5799 DecodeMOVLHPSMask(NumElems, Mask);
5800 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5802 case X86ISD::PALIGNR:
5803 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5804 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5805 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5806 ImmN = N->getOperand(N->getNumOperands()-1);
5807 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5809 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5810 Ops.push_back(N->getOperand(1));
5811 Ops.push_back(N->getOperand(0));
5813 case X86ISD::VSHLDQ:
5814 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5815 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5816 ImmN = N->getOperand(N->getNumOperands() - 1);
5817 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5821 case X86ISD::VSRLDQ:
5822 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5823 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5824 ImmN = N->getOperand(N->getNumOperands() - 1);
5825 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5829 case X86ISD::PSHUFD:
5830 case X86ISD::VPERMILPI:
5831 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5832 ImmN = N->getOperand(N->getNumOperands()-1);
5833 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5834 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5837 case X86ISD::PSHUFHW:
5838 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5839 ImmN = N->getOperand(N->getNumOperands()-1);
5840 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5844 case X86ISD::PSHUFLW:
5845 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5846 ImmN = N->getOperand(N->getNumOperands()-1);
5847 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5851 case X86ISD::VZEXT_MOVL:
5852 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5853 DecodeZeroMoveLowMask(NumElems, Mask);
5856 case X86ISD::VBROADCAST: {
5857 SDValue N0 = N->getOperand(0);
5858 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5859 // add the pre-extracted value to the Ops vector.
5860 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5861 N0.getOperand(0).getValueType() == VT &&
5862 N0.getConstantOperandVal(1) == 0)
5863 Ops.push_back(N0.getOperand(0));
5865 // We only decode broadcasts of same-sized vectors, unless the broadcast
5866 // came from an extract from the original width. If we found one, we
5867 // pushed it onto the Ops vector above.
5868 if (N0.getValueType() == VT || !Ops.empty()) {
5869 DecodeVectorBroadcast(NumElems, Mask);
5875 case X86ISD::VPERMILPV: {
5876 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5878 SDValue MaskNode = N->getOperand(1);
5879 unsigned MaskEltSize = VT.getScalarSizeInBits();
5880 SmallVector<uint64_t, 32> RawMask;
5881 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5882 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5885 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5886 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5891 case X86ISD::PSHUFB: {
5892 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5893 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5894 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5896 SDValue MaskNode = N->getOperand(1);
5897 SmallVector<uint64_t, 32> RawMask;
5898 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5899 DecodePSHUFBMask(RawMask, Mask);
5902 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5903 DecodePSHUFBMask(C, Mask);
5908 case X86ISD::VPERMI:
5909 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5910 ImmN = N->getOperand(N->getNumOperands()-1);
5911 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5916 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5917 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5918 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5920 case X86ISD::VPERM2X128:
5921 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5922 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5923 ImmN = N->getOperand(N->getNumOperands()-1);
5924 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5926 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5928 case X86ISD::SHUF128:
5929 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5930 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5931 ImmN = N->getOperand(N->getNumOperands()-1);
5932 decodeVSHUF64x2FamilyMask(NumElems, VT.getScalarSizeInBits(),
5933 cast<ConstantSDNode>(ImmN)->getZExtValue(),
5935 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5937 case X86ISD::MOVSLDUP:
5938 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5939 DecodeMOVSLDUPMask(NumElems, Mask);
5942 case X86ISD::MOVSHDUP:
5943 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5944 DecodeMOVSHDUPMask(NumElems, Mask);
5947 case X86ISD::MOVDDUP:
5948 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5949 DecodeMOVDDUPMask(NumElems, Mask);
5952 case X86ISD::VPERMIL2: {
5953 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5954 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5955 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5956 unsigned MaskEltSize = VT.getScalarSizeInBits();
5957 SDValue MaskNode = N->getOperand(2);
5958 SDValue CtrlNode = N->getOperand(3);
5959 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5960 unsigned CtrlImm = CtrlOp->getZExtValue();
5961 SmallVector<uint64_t, 32> RawMask;
5962 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5963 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5967 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5968 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5974 case X86ISD::VPPERM: {
5975 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5976 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5977 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5978 SDValue MaskNode = N->getOperand(2);
5979 SmallVector<uint64_t, 32> RawMask;
5980 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5981 DecodeVPPERMMask(RawMask, Mask);
5984 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5985 DecodeVPPERMMask(C, Mask);
5990 case X86ISD::VPERMV: {
5991 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5993 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5994 Ops.push_back(N->getOperand(1));
5995 SDValue MaskNode = N->getOperand(0);
5996 SmallVector<uint64_t, 32> RawMask;
5997 unsigned MaskEltSize = VT.getScalarSizeInBits();
5998 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5999 DecodeVPERMVMask(RawMask, Mask);
6002 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6003 DecodeVPERMVMask(C, MaskEltSize, Mask);
6008 case X86ISD::VPERMV3: {
6009 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
6010 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
6011 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
6012 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
6013 Ops.push_back(N->getOperand(0));
6014 Ops.push_back(N->getOperand(2));
6015 SDValue MaskNode = N->getOperand(1);
6016 unsigned MaskEltSize = VT.getScalarSizeInBits();
6017 if (auto *C = getTargetConstantFromNode(MaskNode)) {
6018 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
6023 default: llvm_unreachable("unknown target shuffle node");
6026 // Empty mask indicates the decode failed.
6030 // Check if we're getting a shuffle mask with zero'd elements.
6031 if (!AllowSentinelZero)
6032 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
6035 // If we have a fake unary shuffle, the shuffle mask is spread across two
6036 // inputs that are actually the same node. Re-map the mask to always point
6037 // into the first input.
6040 if (M >= (int)Mask.size())
6043 // If we didn't already add operands in the opcode-specific code, default to
6044 // adding 1 or 2 operands starting at 0.
6046 Ops.push_back(N->getOperand(0));
6047 if (!IsUnary || IsFakeUnary)
6048 Ops.push_back(N->getOperand(1));
6054 /// Check a target shuffle mask's inputs to see if we can set any values to
6055 /// SM_SentinelZero - this is for elements that are known to be zero
6056 /// (not just zeroable) from their inputs.
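/// As an illustration: if a mask element refers to a lane of an input
/// build_vector that is the constant zero, it is set to SM_SentinelZero;
/// if that lane is undef, it becomes SM_SentinelUndef.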
6057 /// Returns true if the target shuffle mask was decoded.
6058 static bool setTargetShuffleZeroElements(SDValue N,
6059 SmallVectorImpl<int> &Mask,
6060 SmallVectorImpl<SDValue> &Ops) {
6062 if (!isTargetShuffle(N.getOpcode()))
6065 MVT VT = N.getSimpleValueType();
6066 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
6069 SDValue V1 = Ops[0];
6070 SDValue V2 = IsUnary ? V1 : Ops[1];
6072 V1 = peekThroughBitcasts(V1);
6073 V2 = peekThroughBitcasts(V2);
6075 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
6076 "Illegal split of shuffle value type");
6077 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
6079 // Extract known constant input data.
6080 APInt UndefSrcElts[2];
6081 SmallVector<APInt, 32> SrcEltBits[2];
6082 bool IsSrcConstant[2] = {
6083 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
6084 SrcEltBits[0], true, false),
6085 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6086 SrcEltBits[1], true, false)};
6088 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6091 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6095 // Determine shuffle input and normalize the mask.
6096 unsigned SrcIdx = M / Size;
6097 SDValue V = M < Size ? V1 : V2;
6100 // We are referencing an UNDEF input.
6102 Mask[i] = SM_SentinelUndef;
6106 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6107 // TODO: We currently only set UNDEF for integer types - floats use the same
6108 // registers as vectors and many of the scalar folded loads rely on the
6109 // SCALAR_TO_VECTOR pattern.
6110 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6111 (Size % V.getValueType().getVectorNumElements()) == 0) {
6112 int Scale = Size / V.getValueType().getVectorNumElements();
6113 int Idx = M / Scale;
6114 if (Idx != 0 && !VT.isFloatingPoint())
6115 Mask[i] = SM_SentinelUndef;
6116 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6117 Mask[i] = SM_SentinelZero;
6121 // Attempt to extract from the source's constant bits.
6122 if (IsSrcConstant[SrcIdx]) {
6123 if (UndefSrcElts[SrcIdx][M])
6124 Mask[i] = SM_SentinelUndef;
6125 else if (SrcEltBits[SrcIdx][M] == 0)
6126 Mask[i] = SM_SentinelZero;
6130 assert(VT.getVectorNumElements() == Mask.size() &&
6131 "Different mask size from vector size!");
6135 // Attempt to decode ops that could be represented as a shuffle mask.
6136 // The decoded shuffle mask may contain a different number of elements to the
6137 // destination value type.
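// For example (illustrative values), a v2i64 VSRLI by 32 bits decodes to the
// v16i8 byte-shuffle mask <4,5,6,7,Z,Z,Z,Z,12,13,14,15,Z,Z,Z,Z> with
// Z = SM_SentinelZero - 16 mask elements for a 2-element destination type.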
6138 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6139 SmallVectorImpl<SDValue> &Ops,
6140 const SelectionDAG &DAG) {
6144 MVT VT = N.getSimpleValueType();
6145 unsigned NumElts = VT.getVectorNumElements();
6146 unsigned NumSizeInBits = VT.getSizeInBits();
6147 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6148 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6149 "Expected byte aligned value types");
6151 unsigned Opcode = N.getOpcode();
6153 case ISD::VECTOR_SHUFFLE: {
6154 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6155 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6156 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6157 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6158 Ops.push_back(N.getOperand(0));
6159 Ops.push_back(N.getOperand(1));
6165 case X86ISD::ANDNP: {
6166 // Attempt to decode as a per-byte mask.
6168 SmallVector<APInt, 32> EltBits;
6169 SDValue N0 = N.getOperand(0);
6170 SDValue N1 = N.getOperand(1);
6171 bool IsAndN = (X86ISD::ANDNP == Opcode);
6172 uint64_t ZeroMask = IsAndN ? 255 : 0;
6173 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6175 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6177 Mask.push_back(SM_SentinelUndef);
6180 uint64_t ByteBits = EltBits[i].getZExtValue();
6181 if (ByteBits != 0 && ByteBits != 255)
6183 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6185 Ops.push_back(IsAndN ? N1 : N0);
6188 case ISD::SCALAR_TO_VECTOR: {
6189 // Match against a scalar_to_vector of an extract from a vector;
6190 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6191 SDValue N0 = N.getOperand(0);
6194 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6195 N0.getOperand(0).getValueType() == VT) ||
6196 (N0.getOpcode() == X86ISD::PEXTRW &&
6197 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6198 (N0.getOpcode() == X86ISD::PEXTRB &&
6199 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6203 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6206 SDValue SrcVec = SrcExtract.getOperand(0);
6207 EVT SrcVT = SrcVec.getValueType();
6208 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6209 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6211 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6212 if (NumSrcElts <= SrcIdx)
6215 Ops.push_back(SrcVec);
6216 Mask.push_back(SrcIdx);
6217 Mask.append(NumZeros, SM_SentinelZero);
6218 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6221 case X86ISD::PINSRB:
6222 case X86ISD::PINSRW: {
6223 SDValue InVec = N.getOperand(0);
6224 SDValue InScl = N.getOperand(1);
6225 SDValue InIndex = N.getOperand(2);
6226 if (!isa<ConstantSDNode>(InIndex) ||
6227 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6229 uint64_t InIdx = N.getConstantOperandVal(2);
6231 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
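// E.g. (illustrative) v8i16 PINSRW(V, 0, 2) decodes to the unary mask
// <0,1,Z,3,4,5,6,7> with Z = SM_SentinelZero.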
6232 if (X86::isZeroNode(InScl)) {
6233 Ops.push_back(InVec);
6234 for (unsigned i = 0; i != NumElts; ++i)
6235 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6239 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6240 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6242 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6243 if (InScl.getOpcode() != ExOp)
6246 SDValue ExVec = InScl.getOperand(0);
6247 SDValue ExIndex = InScl.getOperand(1);
6248 if (!isa<ConstantSDNode>(ExIndex) ||
6249 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6251 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6253 Ops.push_back(InVec);
6254 Ops.push_back(ExVec);
6255 for (unsigned i = 0; i != NumElts; ++i)
6256 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6259 case X86ISD::PACKSS:
6260 case X86ISD::PACKUS: {
6261 SDValue N0 = N.getOperand(0);
6262 SDValue N1 = N.getOperand(1);
6263 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6264 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6265 "Unexpected input value type");
6267 // If we know input saturation won't happen we can treat this
6268 // as a truncation shuffle.
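// E.g. (illustrative) a v16i8 PACKUS of two v8i16 inputs whose high bytes
// are known zero behaves as the byte shuffle <0,2,4,...,14,16,18,...,30> of
// the concatenated inputs.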
6269 if (Opcode == X86ISD::PACKSS) {
6270 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6271 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6274 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6275 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6276 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6280 bool IsUnary = (N0 == N1);
6286 createPackShuffleMask(VT, Mask, IsUnary);
6290 case X86ISD::VSRLI: {
6291 uint64_t ShiftVal = N.getConstantOperandVal(1);
6292 // Out of range bit shifts are guaranteed to be zero.
6293 if (NumBitsPerElt <= ShiftVal) {
6294 Mask.append(NumElts, SM_SentinelZero);
6298 // We can only decode 'whole byte' bit shifts as shuffles.
6299 if ((ShiftVal % 8) != 0)
6302 uint64_t ByteShift = ShiftVal / 8;
6303 unsigned NumBytes = NumSizeInBits / 8;
6304 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6305 Ops.push_back(N.getOperand(0));
6307 // Clear mask to all zeros and insert the shifted byte indices.
6308 Mask.append(NumBytes, SM_SentinelZero);
6310 if (X86ISD::VSHLI == Opcode) {
6311 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6312 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6313 Mask[i + j] = i + j - ByteShift;
6315 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6316 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6317 Mask[i + j - ByteShift] = i + j;
6321 case ISD::ZERO_EXTEND_VECTOR_INREG:
6322 case X86ISD::VZEXT: {
6323 // TODO - add support for VPMOVZX with smaller input vector types.
6324 SDValue Src = N.getOperand(0);
6325 MVT SrcVT = Src.getSimpleValueType();
6326 if (NumSizeInBits != SrcVT.getSizeInBits())
6328 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6329 VT.getVectorNumElements(), Mask);
6338 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
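/// For example (X and Y denoting arbitrary inputs): with Inputs = {X, Y} and
/// a mask width of 4, the mask <4,5,6,7> only references Y, so X is removed
/// and the mask is rebased to <0,1,2,3>.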
6339 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6340 SmallVectorImpl<int> &Mask) {
6341 int MaskWidth = Mask.size();
6342 SmallVector<SDValue, 16> UsedInputs;
6343 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6344 int lo = UsedInputs.size() * MaskWidth;
6345 int hi = lo + MaskWidth;
6347 // Strip UNDEF input usage.
6348 if (Inputs[i].isUndef())
6350 if ((lo <= M) && (M < hi))
6351 M = SM_SentinelUndef;
6353 // Check for unused inputs.
6354 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6355 UsedInputs.push_back(Inputs[i]);
6362 Inputs = UsedInputs;
6365 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6366 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6367 /// remaining input indices in case we now have a unary shuffle and adjusts
6368 /// the inputs accordingly.
6369 /// Returns true if the target shuffle mask was decoded.
6370 static bool resolveTargetShuffleInputs(SDValue Op,
6371 SmallVectorImpl<SDValue> &Inputs,
6372 SmallVectorImpl<int> &Mask,
6373 const SelectionDAG &DAG) {
6374 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6375 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6378 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6382 /// Returns the scalar element that will make up the ith
6383 /// element of the result of the vector shuffle.
6384 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6387 return SDValue(); // Limit search depth.
6389 SDValue V = SDValue(N, 0);
6390 EVT VT = V.getValueType();
6391 unsigned Opcode = V.getOpcode();
6393 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6394 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6395 int Elt = SV->getMaskElt(Index);
6398 return DAG.getUNDEF(VT.getVectorElementType());
6400 unsigned NumElems = VT.getVectorNumElements();
6401 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6402 : SV->getOperand(1);
6403 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6406 // Recurse into target specific vector shuffles to find scalars.
6407 if (isTargetShuffle(Opcode)) {
6408 MVT ShufVT = V.getSimpleValueType();
6409 MVT ShufSVT = ShufVT.getVectorElementType();
6410 int NumElems = (int)ShufVT.getVectorNumElements();
6411 SmallVector<int, 16> ShuffleMask;
6412 SmallVector<SDValue, 16> ShuffleOps;
6415 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6418 int Elt = ShuffleMask[Index];
6419 if (Elt == SM_SentinelZero)
6420 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6421 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6422 if (Elt == SM_SentinelUndef)
6423 return DAG.getUNDEF(ShufSVT);
6425 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6426 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6427 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6431 // Actual nodes that may contain scalar elements
6432 if (Opcode == ISD::BITCAST) {
6433 V = V.getOperand(0);
6434 EVT SrcVT = V.getValueType();
6435 unsigned NumElems = VT.getVectorNumElements();
6437 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6441 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6442 return (Index == 0) ? V.getOperand(0)
6443 : DAG.getUNDEF(VT.getVectorElementType());
6445 if (V.getOpcode() == ISD::BUILD_VECTOR)
6446 return V.getOperand(Index);
6451 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6452 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6453 unsigned NumNonZero, unsigned NumZero,
6455 const X86Subtarget &Subtarget) {
6456 MVT VT = Op.getSimpleValueType();
6457 unsigned NumElts = VT.getVectorNumElements();
6458 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6459 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6460 "Illegal vector insertion");
6466 for (unsigned i = 0; i < NumElts; ++i) {
6467 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6471 // If the build vector contains zeros or our first insertion is not the
6472 // first index, then insert into a zero vector to break any register
6473 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6476 if (NumZero || 0 != i)
6477 V = getZeroVector(VT, Subtarget, DAG, dl);
6479 assert(0 == i && "Expected insertion into zero-index");
6480 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6481 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6482 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6483 V = DAG.getBitcast(VT, V);
6487 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6488 DAG.getIntPtrConstant(i, dl));
6494 /// Custom lower build_vector of v16i8.
6495 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6496 unsigned NumNonZero, unsigned NumZero,
6498 const X86Subtarget &Subtarget) {
6499 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6502 // SSE4.1 - use PINSRB to insert each byte directly.
6503 if (Subtarget.hasSSE41())
6504 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6511 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
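// E.g. byte operands (b0, b1) are merged as ((zext b1) << 8) | (zext b0) and
// inserted as the i16 element at word index 0.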
6512 for (unsigned i = 0; i < 16; ++i) {
6513 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6514 if (ThisIsNonZero && First) {
6516 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6518 V = DAG.getUNDEF(MVT::v8i16);
6523 // FIXME: Investigate extending to i32 instead of just i16.
6524 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6525 SDValue ThisElt, LastElt;
6526 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6527 if (LastIsNonZero) {
6529 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6531 if (ThisIsNonZero) {
6532 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6533 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6534 DAG.getConstant(8, dl, MVT::i8));
6536 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6542 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6543 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6544 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6545 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6546 V = DAG.getBitcast(MVT::v8i16, V);
6548 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6549 DAG.getIntPtrConstant(i / 2, dl));
6555 return DAG.getBitcast(MVT::v16i8, V);
6558 /// Custom lower build_vector of v8i16.
6559 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6560 unsigned NumNonZero, unsigned NumZero,
6562 const X86Subtarget &Subtarget) {
6563 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6566 // Use PINSRW to insert each element directly.
6567 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6571 /// Custom lower build_vector of v4i32 or v4f32.
6572 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6573 const X86Subtarget &Subtarget) {
6574 // Find all zeroable elements.
6575 std::bitset<4> Zeroable;
6576 for (int i=0; i < 4; ++i) {
6577 SDValue Elt = Op->getOperand(i);
6578 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6580 assert(Zeroable.size() - Zeroable.count() > 1 &&
6581 "We expect at least two non-zero elements!");
6583 // We only know how to deal with build_vector nodes where elements are either
6584 // zeroable or extract_vector_elt with constant index.
6585 SDValue FirstNonZero;
6586 unsigned FirstNonZeroIdx;
6587 for (unsigned i=0; i < 4; ++i) {
6590 SDValue Elt = Op->getOperand(i);
6591 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6592 !isa<ConstantSDNode>(Elt.getOperand(1)))
6594 // Make sure that this node is extracting from a 128-bit vector.
6595 MVT VT = Elt.getOperand(0).getSimpleValueType();
6596 if (!VT.is128BitVector())
6598 if (!FirstNonZero.getNode()) {
6600 FirstNonZeroIdx = i;
6604 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6605 SDValue V1 = FirstNonZero.getOperand(0);
6606 MVT VT = V1.getSimpleValueType();
6608 // See if this build_vector can be lowered as a blend with zero.
6610 unsigned EltMaskIdx, EltIdx;
6612 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6613 if (Zeroable[EltIdx]) {
6614 // The zero vector will be on the right hand side.
6615 Mask[EltIdx] = EltIdx+4;
6619 Elt = Op->getOperand(EltIdx);
6620 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6621 EltMaskIdx = Elt.getConstantOperandVal(1);
6622 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6624 Mask[EltIdx] = EltIdx;
6628 // Let the shuffle legalizer deal with blend operations.
6629 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6630 if (V1.getSimpleValueType() != VT)
6631 V1 = DAG.getBitcast(VT, V1);
6632 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6635 // See if we can lower this build_vector to an INSERTPS.
6636 if (!Subtarget.hasSSE41())
6639 SDValue V2 = Elt.getOperand(0);
6640 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6643 bool CanFold = true;
6644 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6648 SDValue Current = Op->getOperand(i);
6649 SDValue SrcVector = Current->getOperand(0);
6652 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6658 assert(V1.getNode() && "Expected at least two non-zero elements!");
6659 if (V1.getSimpleValueType() != MVT::v4f32)
6660 V1 = DAG.getBitcast(MVT::v4f32, V1);
6661 if (V2.getSimpleValueType() != MVT::v4f32)
6662 V2 = DAG.getBitcast(MVT::v4f32, V2);
6664 // Ok, we can emit an INSERTPS instruction.
6665 unsigned ZMask = Zeroable.to_ulong();
6667 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6668 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6670 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6671 DAG.getIntPtrConstant(InsertPSMask, DL));
6672 return DAG.getBitcast(VT, Result);
6675 /// Return a vector logical shift node.
6676 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6677 SelectionDAG &DAG, const TargetLowering &TLI,
6679 assert(VT.is128BitVector() && "Unknown type for VShift");
6680 MVT ShVT = MVT::v16i8;
6681 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6682 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6683 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6684 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, MVT::i8);
6685 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6688 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6689 SelectionDAG &DAG) {
6691 // Check if the scalar load can be widened into a vector load, and if
6692 // the address is "base + cst", see if the cst can be "absorbed" into
6693 // the shuffle mask.
6694 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6695 SDValue Ptr = LD->getBasePtr();
6696 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6698 EVT PVT = LD->getValueType(0);
6699 if (PVT != MVT::i32 && PVT != MVT::f32)
6704 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6705 FI = FINode->getIndex();
6707 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6708 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6709 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6710 Offset = Ptr.getConstantOperandVal(1);
6711 Ptr = Ptr.getOperand(0);
6716 // FIXME: 256-bit vector instructions don't require a strict alignment,
6717 // improve this code to support it better.
6718 unsigned RequiredAlign = VT.getSizeInBits()/8;
6719 SDValue Chain = LD->getChain();
6720 // Make sure the stack object alignment is at least 16 or 32.
6721 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6722 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6723 if (MFI.isFixedObjectIndex(FI)) {
6724 // Can't change the alignment. FIXME: It's possible to compute
6725 // the exact stack offset and reference FI + adjust offset instead,
6726 // if someone *really* cares about this; that's the way to implement it.
6729 MFI.setObjectAlignment(FI, RequiredAlign);
6733 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6734 // Ptr + (Offset & ~15).
6737 if ((Offset % RequiredAlign) & 3)
6739 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6742 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6743 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6746 int EltNo = (Offset - StartOffset) >> 2;
6747 unsigned NumElems = VT.getVectorNumElements();
6749 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6750 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6751 LD->getPointerInfo().getWithOffset(StartOffset));
6753 SmallVector<int, 8> Mask(NumElems, EltNo);
6755 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6761 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6762 /// elements can be replaced by a single large load which has the same value as
6763 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6765 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6766 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6767 const SDLoc &DL, SelectionDAG &DAG,
6768 const X86Subtarget &Subtarget,
6769 bool isAfterLegalize) {
6770 unsigned NumElems = Elts.size();
6772 int LastLoadedElt = -1;
6773 SmallBitVector LoadMask(NumElems, false);
6774 SmallBitVector ZeroMask(NumElems, false);
6775 SmallBitVector UndefMask(NumElems, false);
6777 // For each element in the initializer, see if we've found a load, zero or an undef.
6779 for (unsigned i = 0; i < NumElems; ++i) {
6780 SDValue Elt = peekThroughBitcasts(Elts[i]);
6785 UndefMask[i] = true;
6786 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6788 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6791 // Each loaded element must be the correct fractional portion of the
6792 // requested vector load.
6793 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6798 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6799 "Incomplete element masks");
6801 // Handle Special Cases - all undef or undef/zero.
6802 if (UndefMask.count() == NumElems)
6803 return DAG.getUNDEF(VT);
6805 // FIXME: Should we return this as a BUILD_VECTOR instead?
6806 if ((ZeroMask | UndefMask).count() == NumElems)
6807 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6808 : DAG.getConstantFP(0.0, DL, VT);
6810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6811 int FirstLoadedElt = LoadMask.find_first();
6812 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6813 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6814 EVT LDBaseVT = EltBase.getValueType();
6816 // Consecutive loads can contain UNDEFs but not ZERO elements.
6817 // Consecutive loads with UNDEFs and ZERO elements require
6818 // an additional shuffle stage to clear the ZERO elements.
6819 bool IsConsecutiveLoad = true;
6820 bool IsConsecutiveLoadWithZeros = true;
6821 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6823 SDValue Elt = peekThroughBitcasts(Elts[i]);
6824 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6825 if (!DAG.areNonVolatileConsecutiveLoads(
6826 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6827 i - FirstLoadedElt)) {
6828 IsConsecutiveLoad = false;
6829 IsConsecutiveLoadWithZeros = false;
6832 } else if (ZeroMask[i]) {
6833 IsConsecutiveLoad = false;
6837 SmallVector<LoadSDNode *, 8> Loads;
6838 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6840 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6842 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6843 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6844 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6845 "Cannot merge volatile loads.");
6847 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6848 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6849 for (auto *LD : Loads)
6850 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6854 // LOAD - all consecutive load/undefs (must start/end with a load).
6855 // If we have found an entire vector of loads and undefs, then return a large
6856 // load of the entire vector width starting at the base pointer.
6857 // If the vector contains zeros, then attempt to shuffle those elements.
6858 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6859 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6860 assert(LDBase && "Did not find base load for merging consecutive loads");
6861 EVT EltVT = LDBase->getValueType(0);
6862 // Ensure that the input vector size for the merged loads matches the
6863 // cumulative size of the input elements.
6864 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6867 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6870 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6871 // will lower to regular temporal loads and use the cache.
6872 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6873 VT.is256BitVector() && !Subtarget.hasInt256())
6876 if (IsConsecutiveLoad)
6877 return CreateLoad(VT, LDBase);
6879 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6880 // vector and a zero vector to clear out the zero elements.
6881 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6882 SmallVector<int, 4> ClearMask(NumElems, -1);
6883 for (unsigned i = 0; i < NumElems; ++i) {
6885 ClearMask[i] = i + NumElems;
6886 else if (LoadMask[i])
6889 SDValue V = CreateLoad(VT, LDBase);
6890 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6891 : DAG.getConstantFP(0.0, DL, VT);
6892 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6897 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6899 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6900 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6901 (LoadSize == 32 || LoadSize == 64) &&
6902 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6903 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6904 : MVT::getIntegerVT(LoadSize);
6905 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6906 if (TLI.isTypeLegal(VecVT)) {
6907 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6908 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6910 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6911 LDBase->getPointerInfo(),
6912 LDBase->getAlignment(),
6913 MachineMemOperand::MOLoad);
6914 for (auto *LD : Loads)
6915 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6916 return DAG.getBitcast(VT, ResNode);
6923 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6924 unsigned SplatBitSize, LLVMContext &C) {
6925 unsigned ScalarSize = VT.getScalarSizeInBits();
6926 unsigned NumElm = SplatBitSize / ScalarSize;
6928 SmallVector<Constant *, 32> ConstantVec;
6929 for (unsigned i = 0; i < NumElm; i++) {
6930 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6932 if (VT.isFloatingPoint()) {
6933 if (ScalarSize == 32) {
6934 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6936 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6937 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6940 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6941 ConstantVec.push_back(Const);
6943 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6946 static bool isUseOfShuffle(SDNode *N) {
6947 for (auto *U : N->uses()) {
6948 if (isTargetShuffle(U->getOpcode()))
6950 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6951 return isUseOfShuffle(U);
6956 // Check if the current node of a build vector is a zero-extended vector.
6957 // If so, return the value extended.
6958 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6959 // NumElt - return the number of zero-extended identical values.
6960 // EltType - return the type of the value, including the zero extend.
6961 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6962 unsigned &NumElt, MVT &EltType) {
6963 SDValue ExtValue = Op->getOperand(0);
6964 unsigned NumElts = Op->getNumOperands();
6965 unsigned Delta = NumElts;
6967 for (unsigned i = 1; i < NumElts; i++) {
6968 if (Op->getOperand(i) == ExtValue) {
6972 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6975 if (!isPowerOf2_32(Delta) || Delta == 1)
6978 for (unsigned i = Delta; i < NumElts; i++) {
6979 if (i % Delta == 0) {
6980 if (Op->getOperand(i) != ExtValue)
6982 } else if (!(isNullConstant(Op->getOperand(i)) ||
6983 Op->getOperand(i).isUndef()))
6986 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6987 unsigned ExtVTSize = EltSize * Delta;
6988 EltType = MVT::getIntegerVT(ExtVTSize);
6989 NumElt = NumElts / Delta;
6993 /// Attempt to use the vbroadcast instruction to generate a splat value
6994 /// from a splat BUILD_VECTOR which uses:
6995 /// a. A single scalar load, or a constant.
6996 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6998 /// The VBROADCAST node is returned when a pattern is found,
6999 /// or SDValue() otherwise.
7000 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
7001 const X86Subtarget &Subtarget,
7002 SelectionDAG &DAG) {
7003 // VBROADCAST requires AVX.
7004 // TODO: Splats could be generated for non-AVX CPUs using SSE
7005 // instructions, but there's less potential gain for only 128-bit vectors.
7006 if (!Subtarget.hasAVX())
7009 MVT VT = BVOp->getSimpleValueType(0);
7012 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
7013 "Unsupported vector type for broadcast.");
7015 BitVector UndefElements;
7016 SDValue Ld = BVOp->getSplatValue(&UndefElements);
7018 // Attempt to use VBROADCASTM
7019 // From this pattern:
7020 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
7021 // b. t1 = (build_vector t0 t0)
7023 // Create (VBROADCASTM v2i1 X)
7024 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
7025 MVT EltType = VT.getScalarType();
7026 unsigned NumElts = VT.getVectorNumElements();
7028 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
7029 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
7030 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
7031 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
7033 BOperand = ZeroExtended.getOperand(0);
7035 BOperand = Ld.getOperand(0).getOperand(0);
7036 MVT MaskVT = BOperand.getSimpleValueType();
7037 if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) || // for broadcastmb2q
7038 (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
7040 DAG.getNode(X86ISD::VBROADCASTM, dl,
7041 MVT::getVectorVT(EltType, NumElts), BOperand);
7042 return DAG.getBitcast(VT, Brdcst);
7047 // We need a splat of a single value to use broadcast, and it doesn't
7048 // make any sense if the value is only in one element of the vector.
7049 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
7050 APInt SplatValue, Undef;
7051 unsigned SplatBitSize;
7053 // Check if this is a repeated constant pattern suitable for broadcasting.
7054 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
7055 SplatBitSize > VT.getScalarSizeInBits() &&
7056 SplatBitSize < VT.getSizeInBits()) {
7057 // Avoid replacing with broadcast when it's a use of a shuffle
7058 // instruction to preserve the present custom lowering of shuffles.
7059 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
7061 // replace BUILD_VECTOR with broadcast of the repeated constants.
7062 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7063 LLVMContext *Ctx = DAG.getContext();
7064 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
7065 if (Subtarget.hasAVX()) {
7066 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
7067 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
7068 // Splatted value can fit in one INTEGER constant in constant pool.
7069 // Load the constant and broadcast it.
7070 MVT CVT = MVT::getIntegerVT(SplatBitSize);
7071 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
7072 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
7073 SDValue CP = DAG.getConstantPool(C, PVT);
7074 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7076 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7078 CVT, dl, DAG.getEntryNode(), CP,
7079 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7081 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7082 MVT::getVectorVT(CVT, Repeat), Ld);
7083 return DAG.getBitcast(VT, Brdcst);
7084 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7085 // Splatted value can fit in one FLOAT constant in constant pool.
7086 // Load the constant and broadcast it.
7087 // AVX has support for 32- and 64-bit broadcasts for floats only.
7088 // There is no 64-bit integer broadcast on a 32-bit subtarget.
7089 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7090 // Lower the splat via APFloat directly, to avoid any conversion.
7093 ? ConstantFP::get(*Ctx,
7094 APFloat(APFloat::IEEEsingle(), SplatValue))
7095 : ConstantFP::get(*Ctx,
7096 APFloat(APFloat::IEEEdouble(), SplatValue));
7097 SDValue CP = DAG.getConstantPool(C, PVT);
7098 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7100 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7102 CVT, dl, DAG.getEntryNode(), CP,
7103 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7105 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7106 MVT::getVectorVT(CVT, Repeat), Ld);
7107 return DAG.getBitcast(VT, Brdcst);
7108 } else if (SplatBitSize > 64) {
7109 // Load the vector of constants and broadcast it.
7110 MVT CVT = VT.getScalarType();
7111 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7113 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7114 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7115 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7117 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7118 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7120 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7121 return DAG.getBitcast(VT, Brdcst);
7128 bool ConstSplatVal =
7129 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7131 // Make sure that all of the users of a non-constant load are from the
7132 // BUILD_VECTOR node.
7133 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7136 unsigned ScalarSize = Ld.getValueSizeInBits();
7137 bool IsGE256 = (VT.getSizeInBits() >= 256);
7139 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7140 // instruction to save 8 or more bytes of constant pool data.
7141 // TODO: If multiple splats are generated to load the same constant,
7142 // it may be detrimental to overall size. There needs to be a way to detect
7143 // that condition to know if this is truly a size win.
7144 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7146 // Handle broadcasting a single constant scalar from the constant pool
7148 // On Sandybridge (no AVX2), it is still better to load a constant vector
7149 // from the constant pool and not to broadcast it from a scalar.
7150 // But override that restriction when optimizing for size.
7151 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7152 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7153 EVT CVT = Ld.getValueType();
7154 assert(!CVT.isVector() && "Must not broadcast a vector type");
7156 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7157 // For size optimization, also splat v2f64 and v2i64, and for size opt
7158 // with AVX2, also splat i8 and i16.
7159 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7160 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7161 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7162 const Constant *C = nullptr;
7163 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7164 C = CI->getConstantIntValue();
7165 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7166 C = CF->getConstantFPValue();
7168 assert(C && "Invalid constant type");
7170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7172 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7173 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7175 CVT, dl, DAG.getEntryNode(), CP,
7176 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7179 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7183 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7185 // Handle AVX2 in-register broadcasts.
7186 if (!IsLoad && Subtarget.hasInt256() &&
7187 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7188 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7190 // The scalar source must be a normal load.
7194 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7195 (Subtarget.hasVLX() && ScalarSize == 64))
7196 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7198 // The integer check is needed for the 64-bit into 128-bit case so that it
7199 // doesn't match double, since there is no vbroadcastsd xmm.
7200 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7201 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7202 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7205 // Unsupported broadcast.
7209 /// For an EXTRACT_VECTOR_ELT with a constant index return the real
7210 /// underlying vector and index.
7212 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7214 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7216 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7217 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7220 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7222 // (extract_vector_elt (v8f32 %1), Constant<6>)
7224 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7225 // (extract_subvector (v8f32 %0), Constant<4>),
7228 // In this case the vector is the extract_subvector expression and the index
7229 // is 2, as specified by the shuffle.
7230 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7231 SDValue ShuffleVec = SVOp->getOperand(0);
7232 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7233 assert(ShuffleVecVT.getVectorElementType() ==
7234 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7236 int ShuffleIdx = SVOp->getMaskElt(Idx);
7237 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7238 ExtractedFromVec = ShuffleVec;
7244 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7245 MVT VT = Op.getSimpleValueType();
7247 // Skip if insert_vec_elt is not supported.
7248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7249 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7253 unsigned NumElems = Op.getNumOperands();
7257 SmallVector<unsigned, 4> InsertIndices;
7258 SmallVector<int, 8> Mask(NumElems, -1);
7260 for (unsigned i = 0; i != NumElems; ++i) {
7261 unsigned Opc = Op.getOperand(i).getOpcode();
7263 if (Opc == ISD::UNDEF)
7266 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7267 // Quit if more than 1 element needs inserting.
7268 if (InsertIndices.size() > 1)
7271 InsertIndices.push_back(i);
7275 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7276 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7278 // Quit if non-constant index.
7279 if (!isa<ConstantSDNode>(ExtIdx))
7281 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7283 // Quit if extracted from vector of different type.
7284 if (ExtractedFromVec.getValueType() != VT)
7287 if (!VecIn1.getNode())
7288 VecIn1 = ExtractedFromVec;
7289 else if (VecIn1 != ExtractedFromVec) {
7290 if (!VecIn2.getNode())
7291 VecIn2 = ExtractedFromVec;
7292 else if (VecIn2 != ExtractedFromVec)
7293 // Quit if more than 2 vectors to shuffle
7297 if (ExtractedFromVec == VecIn1)
7299 else if (ExtractedFromVec == VecIn2)
7300 Mask[i] = Idx + NumElems;
7303 if (!VecIn1.getNode())
7306 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7307 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7309 for (unsigned Idx : InsertIndices)
7310 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7311 DAG.getIntPtrConstant(Idx, DL));
7316 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7317 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7318 Op.getScalarValueSizeInBits() == 1 &&
7319 "Can not convert non-constant vector");
7320 uint64_t Immediate = 0;
7321 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7322 SDValue In = Op.getOperand(idx);
7324 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7327 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7328 return DAG.getConstant(Immediate, dl, VT);
7330 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7331 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7332 const X86Subtarget &Subtarget) {
7334 MVT VT = Op.getSimpleValueType();
7335 assert((VT.getVectorElementType() == MVT::i1) &&
7336 "Unexpected type in LowerBUILD_VECTORvXi1!");
7339 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7342 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7345 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7346 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7347 // Split the pieces.
7349 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7351 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7352 // We have to manually lower both halves so getNode doesn't try to
7353 // reassemble the build_vector.
7354 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7355 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7356 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7358 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7359 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7360 return DAG.getBitcast(VT, Imm);
7361 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7362 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7363 DAG.getIntPtrConstant(0, dl));
7366 // Vector has one or more non-const elements
7367 uint64_t Immediate = 0;
7368 SmallVector<unsigned, 16> NonConstIdx;
7369 bool IsSplat = true;
7370 bool HasConstElts = false;
7372 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7373 SDValue In = Op.getOperand(idx);
7376 if (!isa<ConstantSDNode>(In))
7377 NonConstIdx.push_back(idx);
7379 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7380 HasConstElts = true;
7384 else if (In != Op.getOperand(SplatIdx))
7388 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7390 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7391 DAG.getConstant(1, dl, VT),
7392 DAG.getConstant(0, dl, VT));
7394 // insert elements one by one
7398 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7399 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7401 else if (HasConstElts)
7402 Imm = DAG.getConstant(0, dl, VT);
7404 Imm = DAG.getUNDEF(VT);
7405 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7406 DstVec = DAG.getBitcast(VT, Imm);
7408 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7409 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7410 DAG.getIntPtrConstant(0, dl));
7413 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7414 unsigned InsertIdx = NonConstIdx[i];
7415 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7416 Op.getOperand(InsertIdx),
7417 DAG.getIntPtrConstant(InsertIdx, dl));
7422 /// Return true if \p N implements a horizontal binop and return the
7423 /// operands for the horizontal binop into V0 and V1.
7425 /// This is a helper function of LowerToHorizontalOp().
7426 /// This function checks that the build_vector \p N in input implements a
7427 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7428 /// operation to match.
7429 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7430 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7431 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7434 /// This function only analyzes elements of \p N whose indices are
7435 /// in range [BaseIdx, LastIdx).
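///
/// As an illustration (A and B denote arbitrary vectors), with \p Opcode equal
/// to ISD::ADD, a v4i32 build_vector of
///   (add (extract_vector_elt A, 0), (extract_vector_elt A, 1)),
///   (add (extract_vector_elt A, 2), (extract_vector_elt A, 3)),
///   (add (extract_vector_elt B, 0), (extract_vector_elt B, 1)),
///   (add (extract_vector_elt B, 2), (extract_vector_elt B, 3))
/// matches with V0 = A and V1 = B, which is the HADD element order.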
7436 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7438 unsigned BaseIdx, unsigned LastIdx,
7439 SDValue &V0, SDValue &V1) {
7440 EVT VT = N->getValueType(0);
7442 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7443 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7444 "Invalid Vector in input!");
7446 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7447 bool CanFold = true;
7448 unsigned ExpectedVExtractIdx = BaseIdx;
7449 unsigned NumElts = LastIdx - BaseIdx;
7450 V0 = DAG.getUNDEF(VT);
7451 V1 = DAG.getUNDEF(VT);
7453 // Check if N implements a horizontal binop.
7454 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7455 SDValue Op = N->getOperand(i + BaseIdx);
7458 if (Op->isUndef()) {
7459 // Update the expected vector extract index.
7460 if (i * 2 == NumElts)
7461 ExpectedVExtractIdx = BaseIdx;
7462 ExpectedVExtractIdx += 2;
7466 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7471 SDValue Op0 = Op.getOperand(0);
7472 SDValue Op1 = Op.getOperand(1);
7474 // Try to match the following pattern:
7475 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7476 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7477 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7478 Op0.getOperand(0) == Op1.getOperand(0) &&
7479 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7480 isa<ConstantSDNode>(Op1.getOperand(1)));
7484 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7485 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7487 if (i * 2 < NumElts) {
7489 V0 = Op0.getOperand(0);
7490 if (V0.getValueType() != VT)
7495 V1 = Op0.getOperand(0);
7496 if (V1.getValueType() != VT)
7499 if (i * 2 == NumElts)
7500 ExpectedVExtractIdx = BaseIdx;
7503 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7504 if (I0 == ExpectedVExtractIdx)
7505 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7506 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7507 // Try to match the following dag sequence:
7508 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7509 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7513 ExpectedVExtractIdx += 2;
7519 /// Emit a sequence of two 128-bit horizontal add/sub followed by
7520 /// a concat_vector.
7522 /// This is a helper function of LowerToHorizontalOp().
7523 /// This function expects two 256-bit vectors called V0 and V1.
7524 /// At first, each vector is split into two separate 128-bit vectors.
7525 /// Then, the resulting 128-bit vectors are used to implement two
7526 /// horizontal binary operations.
7528 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7530 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7531 /// the two new horizontal binops.
7532 /// When Mode is set, the first horizontal binop dag node would take as input
7533 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7534 /// horizontal binop dag node would take as input the lower 128-bit of V1
7535 /// and the upper 128-bit of V1.
7537 /// HADD V0_LO, V0_HI
7538 /// HADD V1_LO, V1_HI
7540 /// Otherwise, the first horizontal binop dag node takes as input the lower
7541 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7542 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7544 /// HADD V0_LO, V1_LO
7545 /// HADD V0_HI, V1_HI
7547 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7548 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7549 /// the upper 128-bits of the result.
7550 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7551 const SDLoc &DL, SelectionDAG &DAG,
7552 unsigned X86Opcode, bool Mode,
7553 bool isUndefLO, bool isUndefHI) {
7554 MVT VT = V0.getSimpleValueType();
7555 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7556 "Invalid nodes in input!");
7558 unsigned NumElts = VT.getVectorNumElements();
7559 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7560 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7561 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7562 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7563 MVT NewVT = V0_LO.getSimpleValueType();
7565 SDValue LO = DAG.getUNDEF(NewVT);
7566 SDValue HI = DAG.getUNDEF(NewVT);
7569 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7570 if (!isUndefLO && !V0->isUndef())
7571 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7572 if (!isUndefHI && !V1->isUndef())
7573 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7575 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7576 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7577 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7579 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7580 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7583 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7586 /// Returns true iff \p BV builds a vector with the result equivalent to
7587 /// the result of ADDSUB/SUBADD operation.
7588 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7589 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7590 /// \p Opnd0 and \p Opnd1.
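///
/// As an illustration (A and B denote arbitrary vectors), a v4f32 build_vector
/// of
///   (fsub (extract_vector_elt A, 0), (extract_vector_elt B, 0)),
///   (fadd (extract_vector_elt A, 1), (extract_vector_elt B, 1)),
///   (fsub (extract_vector_elt A, 2), (extract_vector_elt B, 2)),
///   (fadd (extract_vector_elt A, 3), (extract_vector_elt B, 3))
/// is recognized with Opnd0 = A, Opnd1 = B and IsSubAdd = false (ADDSUB).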
7591 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7592 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7593 SDValue &Opnd0, SDValue &Opnd1,
7594 unsigned &NumExtracts,
7597 MVT VT = BV->getSimpleValueType(0);
7598 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7601 unsigned NumElts = VT.getVectorNumElements();
7602 SDValue InVec0 = DAG.getUNDEF(VT);
7603 SDValue InVec1 = DAG.getUNDEF(VT);
7607 // Odd-numbered elements in the input build vector are obtained from
7608 // adding/subtracting two integer/float elements.
7609 // Even-numbered elements in the input build vector are obtained from
7610 // subtracting/adding two integer/float elements.
7611 unsigned Opc[2] {0, 0};
7612 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7613 SDValue Op = BV->getOperand(i);
7615 // Skip 'undef' values.
7616 unsigned Opcode = Op.getOpcode();
7617 if (Opcode == ISD::UNDEF)
7620 // Early exit if we found an unexpected opcode.
7621 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7624 SDValue Op0 = Op.getOperand(0);
7625 SDValue Op1 = Op.getOperand(1);
7627 // Try to match the following pattern:
7628 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7629 // Early exit if we cannot match that sequence.
7630 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7631 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7632 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7633 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7634 Op0.getOperand(1) != Op1.getOperand(1))
7637 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7641 // We found a valid add/sub node; make sure it's the same opcode as previous
7642 // elements for this parity.
7643 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7645 Opc[i % 2] = Opcode;
7647 // Update InVec0 and InVec1.
7648 if (InVec0.isUndef()) {
7649 InVec0 = Op0.getOperand(0);
7650 if (InVec0.getSimpleValueType() != VT)
7653 if (InVec1.isUndef()) {
7654 InVec1 = Op1.getOperand(0);
7655 if (InVec1.getSimpleValueType() != VT)
7659 // Make sure that operands in input to each add/sub node always
7660 // come from the same pair of vectors.
7661 if (InVec0 != Op0.getOperand(0)) {
7662 if (Opcode == ISD::FSUB)
7665 // FADD is commutable. Try to commute the operands
7666 // and then test again.
7667 std::swap(Op0, Op1);
7668 if (InVec0 != Op0.getOperand(0))
7672 if (InVec1 != Op1.getOperand(0))
7675 // Increment the number of extractions done.
7679 // Ensure we have found an opcode for both parities and that they are
7680 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7681 // inputs are undef.
7682 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7683 InVec0.isUndef() || InVec1.isUndef())
7686 IsSubAdd = Opc[0] == ISD::FADD;
7693 /// Returns true if it is possible to fold MUL and an idiom that has already been
7694 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7695 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7696 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7698 /// Prior to calling this function it should be known that there is some
7699 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7700 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7701 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7702 /// of \p Opnd0 uses is expected to be equal to 2.
7703 /// For example, this function may be called for the following IR:
7704 /// %AB = fmul fast <2 x double> %A, %B
7705 /// %Sub = fsub fast <2 x double> %AB, %C
7706 /// %Add = fadd fast <2 x double> %AB, %C
7707 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7708 /// <2 x i32> <i32 0, i32 3>
7709 /// There is a def for %Addsub here, which potentially can be replaced by
7710 /// X86ISD::ADDSUB operation:
7711 /// %Addsub = X86ISD::ADDSUB %AB, %C
7712 /// and such ADDSUB can further be replaced with FMADDSUB:
7713 /// %Addsub = FMADDSUB %A, %B, %C.
7715 /// The main reason why this method is called before the replacement of the
7716 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7717 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7719 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7721 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7722 unsigned ExpectedUses) {
7723 if (Opnd0.getOpcode() != ISD::FMUL ||
7724 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7727 // FIXME: These checks must match the similar ones in
7728 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7729 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7730 // or MUL + ADDSUB to FMADDSUB.
7731 const TargetOptions &Options = DAG.getTarget().Options;
7733 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7738 Opnd1 = Opnd0.getOperand(1);
7739 Opnd0 = Opnd0.getOperand(0);
7744 /// Try to fold a build_vector that performs an 'addsub', 'fmaddsub', or
7745 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB, or
7746 /// X86ISD::FMSUBADD node accordingly.
7747 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7748 const X86Subtarget &Subtarget,
7749 SelectionDAG &DAG) {
7750 SDValue Opnd0, Opnd1;
7751 unsigned NumExtracts;
7753 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7757 MVT VT = BV->getSimpleValueType(0);
7760 // Try to generate X86ISD::FMADDSUB node here.
7762 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7763 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7764 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7767 // We only support ADDSUB.
7771 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7772 // the ADDSUB idiom has been successfully recognized. There are no known
7773 // X86 targets with 512-bit ADDSUB instructions!
7774 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7775 // recognition.
7776 if (VT.is512BitVector())
7779 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
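// Illustrative example (not from the original source): a v4f32 build_vector
// computing <a0-b0, a1+b1, a2-b2, a3+b3> reaches this point as
// (X86ISD::ADDSUB a, b), which instruction selection turns into addsubps.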
7782 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7783 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7784 const X86Subtarget &Subtarget,
7785 SelectionDAG &DAG) {
7786 MVT VT = BV->getSimpleValueType(0);
7787 unsigned NumElts = VT.getVectorNumElements();
7788 unsigned NumUndefsLO = 0;
7789 unsigned NumUndefsHI = 0;
7790 unsigned Half = NumElts/2;
7792 // Count the number of UNDEF operands in the input build_vector.
7793 for (unsigned i = 0, e = Half; i != e; ++i)
7794 if (BV->getOperand(i)->isUndef())
7797 for (unsigned i = Half, e = NumElts; i != e; ++i)
7798 if (BV->getOperand(i)->isUndef())
7801 // Early exit if this is either a build_vector of all UNDEFs or all the
7802 // operands but one are UNDEF.
7803 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7807 SDValue InVec0, InVec1;
7808 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7809 // Try to match an SSE3 float HADD/HSUB.
7810 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7811 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7813 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7814 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7815 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7816 // Try to match an SSSE3 integer HADD/HSUB.
7817 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7818 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7820 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7821 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
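// Illustrative example: a v4i32 build_vector of
//   <a0+a1, a2+a3, b0+b1, b2+b3>
// matches the ISD::ADD case above and becomes a single X86ISD::HADD (phaddd).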
7824 if (!Subtarget.hasAVX())
7827 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7828 // Try to match an AVX horizontal add/sub of packed single/double
7829 // precision floating point values from 256-bit vectors.
7830 SDValue InVec2, InVec3;
7831 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7832 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7833 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7834 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7835 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7837 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7838 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7839 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7840 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7841 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7842 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7843 // Try to match an AVX2 horizontal add/sub of signed integers.
7844 SDValue InVec2, InVec3;
7846 bool CanFold = true;
7848 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7849 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7850 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7851 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7852 X86Opcode = X86ISD::HADD;
7853 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7854 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7855 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7856 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7857 X86Opcode = X86ISD::HSUB;
7862 // Fold this build_vector into a single horizontal add/sub.
7863 // Do this only if the target has AVX2.
7864 if (Subtarget.hasAVX2())
7865 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7867 // Do not try to expand this build_vector into a pair of horizontal
7868 // add/sub if we can emit a pair of scalar add/sub.
7869 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7872 // Convert this build_vector into a pair of horizontal binops followed by
7873 // a concat vector.
7874 bool isUndefLO = NumUndefsLO == Half;
7875 bool isUndefHI = NumUndefsHI == Half;
7876 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7877 isUndefLO, isUndefHI);
7881 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7882 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7884 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7885 X86Opcode = X86ISD::HADD;
7886 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7887 X86Opcode = X86ISD::HSUB;
7888 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7889 X86Opcode = X86ISD::FHADD;
7890 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7891 X86Opcode = X86ISD::FHSUB;
7895 // Don't try to expand this build_vector into a pair of horizontal add/sub
7896 // if we can simply emit a pair of scalar add/sub.
7897 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7900 // Convert this build_vector into two horizontal add/sub followed by
7901 // a concat vector.
7902 bool isUndefLO = NumUndefsLO == Half;
7903 bool isUndefHI = NumUndefsHI == Half;
7904 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7905 isUndefLO, isUndefHI);
7911 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7912 /// one of their operands is constant, lower to a pair of BUILD_VECTORs and
7913 /// just apply the bit operation to the vectors.
7914 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
7915 /// from this, but enough scalar bit operations are created from the later
7916 /// legalization + scalarization stages to need basic support.
7917 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7918 SelectionDAG &DAG) {
7920 MVT VT = Op->getSimpleValueType(0);
7921 unsigned NumElems = VT.getVectorNumElements();
7922 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7924 // Check that all elements have the same opcode.
7925 // TODO: Should we allow UNDEFS and if so how many?
7926 unsigned Opcode = Op->getOperand(0).getOpcode();
7927 for (unsigned i = 1; i < NumElems; ++i)
7928 if (Opcode != Op->getOperand(i).getOpcode())
7931 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7938 // Don't do this if the buildvector is a splat - we'd replace one
7939 // constant with an entire vector.
7940 if (Op->getSplatValue())
7941 return SDValue();
7942 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7943 return SDValue();
7947 SmallVector<SDValue, 4> LHSElts, RHSElts;
7948 for (SDValue Elt : Op->ops()) {
7949 SDValue LHS = Elt.getOperand(0);
7950 SDValue RHS = Elt.getOperand(1);
7952 // We expect the canonicalized RHS operand to be the constant.
7953 if (!isa<ConstantSDNode>(RHS))
7955 LHSElts.push_back(LHS);
7956 RHSElts.push_back(RHS);
7959 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7960 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7961 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
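// Illustrative example of the transform above, assuming AND is one of the
// supported opcodes: the build_vector
//   ((and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8))
// becomes (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)).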
7964 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7965 /// functionality to do this, so it's all zeros, all ones, or some derivation
7966 /// that is cheap to calculate.
7967 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7968 const X86Subtarget &Subtarget) {
7970 MVT VT = Op.getSimpleValueType();
7972 // Vectors containing all zeros can be matched by pxor and xorps.
7973 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7974 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7975 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7976 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7979 return getZeroVector(VT, Subtarget, DAG, DL);
7982 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7983 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7984 // vpcmpeqd on 256-bit vectors.
7985 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7986 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7987 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7990 return getOnesVector(VT, DAG, DL);
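// Note: pcmpeqd of a register with itself produces all-ones bits, so neither
// the all-zeros nor the all-ones case needs a constant-pool load.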
7996 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
7997 /// from a vector of source values and a vector of extraction indices.
7998 /// The vectors might be manipulated to match the type of the permute op.
7999 static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
8000 SDLoc &DL, SelectionDAG &DAG,
8001 const X86Subtarget &Subtarget) {
8003 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8004 unsigned NumElts = VT.getVectorNumElements();
8005 unsigned SizeInBits = VT.getSizeInBits();
8007 // Adjust IndicesVec to match VT size.
8008 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
8009 "Illegal variable permute mask size");
8010 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
8011 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
8012 NumElts * VT.getScalarSizeInBits());
8013 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
8015 // Handle a SrcVec that doesn't match the VT type.
8016 if (SrcVec.getValueSizeInBits() != SizeInBits) {
8017 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
8018 // Handle larger SrcVec by treating it as a larger permute.
8019 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
8020 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
8021 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
8022 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
8023 Subtarget, DAG, SDLoc(IndicesVec));
8024 return extractSubVector(
8025 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
8026 DAG, DL, SizeInBits);
8027 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
8028 // Widen smaller SrcVec to match VT.
8029 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
8034 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
8035 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
8036 EVT SrcVT = Idx.getValueType();
8037 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
8038 uint64_t IndexScale = 0;
8039 uint64_t IndexOffset = 0;
8041 // If we're scaling a smaller permute op, then we need to repeat the
8042 // indices, scaling and offsetting them as well.
8043 // e.g. v4i32 -> v16i8 (Scale = 4)
8044 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
8045 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
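// Another worked example: v8i16 -> v16i8 (Scale = 2)
//      IndexScale = v8i16 Splat(2 << 8 | 2)
//      IndexOffset = v8i16 Splat(1 << 8 | 0)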
8046 for (uint64_t i = 0; i != Scale; ++i) {
8047 IndexScale |= Scale << (i * NumDstBits);
8048 IndexOffset |= i << (i * NumDstBits);
8051 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
8052 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
8053 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
8054 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
8058 unsigned Opcode = 0;
8059 switch (VT.SimpleTy) {
8063 if (Subtarget.hasSSSE3())
8064 Opcode = X86ISD::PSHUFB;
8067 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8068 Opcode = X86ISD::VPERMV;
8069 else if (Subtarget.hasSSSE3()) {
8070 Opcode = X86ISD::PSHUFB;
8071 ShuffleVT = MVT::v16i8;
8076 if (Subtarget.hasAVX()) {
8077 Opcode = X86ISD::VPERMILPV;
8078 ShuffleVT = MVT::v4f32;
8079 } else if (Subtarget.hasSSSE3()) {
8080 Opcode = X86ISD::PSHUFB;
8081 ShuffleVT = MVT::v16i8;
8086 if (Subtarget.hasAVX()) {
8087 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
8088 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8089 Opcode = X86ISD::VPERMILPV;
8090 ShuffleVT = MVT::v2f64;
8091 } else if (Subtarget.hasSSE41()) {
8092 // SSE41 can compare v2i64 - select between indices 0 and 1.
8093 return DAG.getSelectCC(
8095 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8096 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8097 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8098 ISD::CondCode::SETEQ);
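// With only two 64-bit elements, each result lane is either SrcVec[0] or
// SrcVec[1]; the select above compares each index against zero and picks
// between the two splat shuffles accordingly.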
8102 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8103 Opcode = X86ISD::VPERMV;
8104 else if (Subtarget.hasXOP()) {
8105 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8106 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8107 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8108 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8110 ISD::CONCAT_VECTORS, DL, VT,
8111 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8112 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8113 } else if (Subtarget.hasAVX()) {
8114 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8115 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8116 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8117 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8118 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8119 ArrayRef<SDValue> Ops) {
8120 // Permute Lo and Hi and then select based on index range.
8121 // This works as PSHUFB uses bits[3:0] to permute elements and we don't
8122 // care about bit[7] as it's just an index vector.
8123 SDValue Idx = Ops[2];
8124 EVT VT = Idx.getValueType();
8125 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8126 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8127 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8128 ISD::CondCode::SETGT);
8130 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8131 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8136 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8137 Opcode = X86ISD::VPERMV;
8138 else if (Subtarget.hasAVX()) {
8139 // Scale to v32i8 and perform as v32i8.
8140 IndicesVec = ScaleIndices(IndicesVec, 2);
8141 return DAG.getBitcast(
8142 VT, createVariablePermute(
8143 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8144 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8149 if (Subtarget.hasAVX2())
8150 Opcode = X86ISD::VPERMV;
8151 else if (Subtarget.hasAVX()) {
8152 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8153 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8154 {0, 1, 2, 3, 0, 1, 2, 3});
8155 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8156 {4, 5, 6, 7, 4, 5, 6, 7});
8157 if (Subtarget.hasXOP())
8158 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8159 LoLo, HiHi, IndicesVec,
8160 DAG.getConstant(0, DL, MVT::i8)));
8161 // Permute Lo and Hi and then select based on index range.
8162 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
8163 SDValue Res = DAG.getSelectCC(
8164 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8165 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8166 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8167 ISD::CondCode::SETGT);
8168 return DAG.getBitcast(VT, Res);
8173 if (Subtarget.hasAVX512()) {
8174 if (!Subtarget.hasVLX()) {
8175 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8176 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8178 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8179 DAG, SDLoc(IndicesVec));
8180 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8182 return extract256BitVector(Res, 0, DAG, DL);
8184 Opcode = X86ISD::VPERMV;
8185 } else if (Subtarget.hasAVX()) {
8186 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8188 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8190 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8191 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8192 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8193 if (Subtarget.hasXOP())
8194 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8195 LoLo, HiHi, IndicesVec,
8196 DAG.getConstant(0, DL, MVT::i8)));
8197 // Permute Lo and Hi and then select based on index range.
8198 // This works as VPERMILPD only uses index bit[1] to permute elements.
8199 SDValue Res = DAG.getSelectCC(
8200 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8201 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8202 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8203 ISD::CondCode::SETGT);
8204 return DAG.getBitcast(VT, Res);
8208 if (Subtarget.hasVBMI())
8209 Opcode = X86ISD::VPERMV;
8212 if (Subtarget.hasBWI())
8213 Opcode = X86ISD::VPERMV;
8219 if (Subtarget.hasAVX512())
8220 Opcode = X86ISD::VPERMV;
8226 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8227 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8228 "Illegal variable permute shuffle type");
8230 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8232 IndicesVec = ScaleIndices(IndicesVec, Scale);
8234 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8235 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8237 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8238 SDValue Res = Opcode == X86ISD::VPERMV
8239 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8240 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8241 return DAG.getBitcast(VT, Res);
8244 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
8245 // reasoned to be a permutation of a vector by indices in a non-constant vector.
8246 // (build_vector (extract_elt V, (extract_elt I, 0)),
8247 // (extract_elt V, (extract_elt I, 1)),
8252 // TODO: Handle undefs
8253 // TODO: Utilize pshufb and zero mask blending to support more efficient
8254 // construction of vectors with constant-0 elements.
8255 static SDValue
8256 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8257 const X86Subtarget &Subtarget) {
8258 SDValue SrcVec, IndicesVec;
8259 // Check for a match of the permute source vector and permute index elements.
8260 // This is done by checking that the i-th build_vector operand is of the form:
8261 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8262 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8263 SDValue Op = V.getOperand(Idx);
8264 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8267 // If this is the first extract encountered in V, set the source vector,
8268 // otherwise verify the extract is from the previously defined source
8269 // vector.
8270 if (!SrcVec)
8271 SrcVec = Op.getOperand(0);
8272 else if (SrcVec != Op.getOperand(0))
8274 SDValue ExtractedIndex = Op->getOperand(1);
8275 // Peek through extends.
8276 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8277 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8278 ExtractedIndex = ExtractedIndex.getOperand(0);
8279 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8282 // If this is the first extract from the index vector candidate, set the
8283 // indices vector, otherwise verify the extract is from the previously
8284 // defined indices vector.
8285 if (!IndicesVec)
8286 IndicesVec = ExtractedIndex.getOperand(0);
8287 else if (IndicesVec != ExtractedIndex.getOperand(0))
8290 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8291 if (!PermIdx || PermIdx->getZExtValue() != Idx)
8296 MVT VT = V.getSimpleValueType();
8297 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8301 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8304 MVT VT = Op.getSimpleValueType();
8305 MVT EltVT = VT.getVectorElementType();
8306 unsigned NumElems = Op.getNumOperands();
8308 // Generate vectors for predicate vectors.
8309 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8310 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8312 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8313 return VectorConstant;
8315 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8316 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8318 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8319 return HorizontalOp;
8320 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8322 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8325 unsigned EVTBits = EltVT.getSizeInBits();
8327 unsigned NumZero = 0;
8328 unsigned NumNonZero = 0;
8329 uint64_t NonZeros = 0;
8330 bool IsAllConstants = true;
8331 SmallSet<SDValue, 8> Values;
8332 unsigned NumConstants = NumElems;
8333 for (unsigned i = 0; i < NumElems; ++i) {
8334 SDValue Elt = Op.getOperand(i);
8338 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8339 IsAllConstants = false;
8342 if (X86::isZeroNode(Elt))
8345 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8346 NonZeros |= ((uint64_t)1 << i);
8351 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8352 if (NumNonZero == 0)
8353 return DAG.getUNDEF(VT);
8355 // If we are inserting one variable into a vector of non-zero constants, try
8356 // to avoid loading each constant element as a scalar. Load the constants as a
8357 // vector and then insert the variable scalar element. If insertion is not
8358 // supported, we assume that we will fall back to a shuffle to get the scalar
8359 // blended with the constants. Insertion into a zero vector is handled as a
8360 // special-case somewhere below here.
8361 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8362 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8363 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8364 // Create an all-constant vector. The variable element in the old
8365 // build vector is replaced by undef in the constant vector. Save the
8366 // variable scalar element and its index for use in the insertelement.
8367 LLVMContext &Context = *DAG.getContext();
8368 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8369 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8372 for (unsigned i = 0; i != NumElems; ++i) {
8373 SDValue Elt = Op.getOperand(i);
8374 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8375 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8376 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8377 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8378 else if (!Elt.isUndef()) {
8379 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8380 "Expected one variable element in this vector");
8382 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8385 Constant *CV = ConstantVector::get(ConstVecOps);
8386 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
8388 // The constants we just created may not be legal (e.g., floating point). We
8389 // must lower the vector right here because we cannot guarantee that we'll
8390 // legalize it before loading it. This is also why we could not just create
8391 // a new build vector here. If the build vector contains illegal constants,
8392 // it could get split back up into a series of insert elements.
8393 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8394 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8395 MachineFunction &MF = DAG.getMachineFunction();
8396 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8397 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8398 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
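// Illustrative example: <4 x float> <1.0, 2.0, %x, 4.0> is lowered to a
// constant-pool load of <1.0, 2.0, undef, 4.0> followed by an
// insert_vector_elt of %x at index 2.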
8401 // Special case for a single non-zero, non-undef element.
8402 if (NumNonZero == 1) {
8403 unsigned Idx = countTrailingZeros(NonZeros);
8404 SDValue Item = Op.getOperand(Idx);
8406 // If we have a constant or non-constant insertion into the low element of
8407 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8408 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8409 // depending on what the source datatype is.
8412 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8414 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8415 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8416 assert((VT.is128BitVector() || VT.is256BitVector() ||
8417 VT.is512BitVector()) &&
8418 "Expected an SSE value type!");
8419 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8420 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8421 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8424 // We can't directly insert an i8 or i16 into a vector, so zero extend
8425 // it to i32 first.
8426 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8427 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8428 if (VT.getSizeInBits() >= 256) {
8429 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8430 if (Subtarget.hasAVX()) {
8431 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8432 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8434 // Without AVX, we need to extend to a 128-bit vector and then
8435 // insert into the 256-bit vector.
8436 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8437 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8438 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8441 assert(VT.is128BitVector() && "Expected an SSE value type!");
8442 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8443 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8445 return DAG.getBitcast(VT, Item);
8449 // Is it a vector logical left shift?
8450 if (NumElems == 2 && Idx == 1 &&
8451 X86::isZeroNode(Op.getOperand(0)) &&
8452 !X86::isZeroNode(Op.getOperand(1))) {
8453 unsigned NumBits = VT.getSizeInBits();
8454 return getVShift(true, VT,
8455 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8456 VT, Op.getOperand(1)),
8457 NumBits/2, DAG, *this, dl);
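// Illustrative example: a v2i64 build_vector <0, %x> is emitted as a left
// byte-shift of (scalar_to_vector %x) by half the vector width (pslldq by
// 8 bytes), moving %x into the upper half while zeroing the lower half.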
8460 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8461 return SDValue();
8463 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8464 // is a non-constant being inserted into an element other than the low one,
8465 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8466 // movd/movss) to move this into the low element, then shuffle it into
8467 // place.
8468 if (EVTBits == 32) {
8469 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8470 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8474 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8475 if (Values.size() == 1) {
8476 if (EVTBits == 32) {
8477 // Instead of a shuffle like this:
8478 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8479 // Check if it's possible to issue this instead.
8480 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8481 unsigned Idx = countTrailingZeros(NonZeros);
8482 SDValue Item = Op.getOperand(Idx);
8483 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8484 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8489 // A vector full of immediates; various special cases are already
8490 // handled, so this is best done with a single constant-pool load.
8494 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8497 // See if we can use a vector load to get all of the elements.
8499 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8501 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8505 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8506 // build_vector and broadcast it.
8507 // TODO: We could probably generalize this more.
8508 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8509 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8510 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8511 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8512 // Make sure all the even/odd operands match.
8513 for (unsigned i = 2; i != NumElems; ++i)
8514 if (Ops[i % 2] != Op.getOperand(i))
8518 if (CanSplat(Op, NumElems, Ops)) {
8519 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8520 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8521 // Create a new build vector and cast to v2i64/v2f64.
8522 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8523 DAG.getBuildVector(NarrowVT, dl, Ops));
8524 // Broadcast from v2i64/v2f64 and cast to final VT.
8525 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
8526 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8531 // For AVX-length vectors, build the individual 128-bit pieces and use
8532 // shuffles to put them in place.
8533 if (VT.getSizeInBits() > 128) {
8534 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
8536 // Build both the lower and upper subvector.
8538 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8539 SDValue Upper = DAG.getBuildVector(
8540 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8542 // Recreate the wider vector with the lower and upper part.
8543 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
8544 VT.getSizeInBits() / 2);
8547 // Let legalizer expand 2-wide build_vectors.
8548 if (EVTBits == 64) {
8549 if (NumNonZero == 1) {
8550 // One half is zero or undef.
8551 unsigned Idx = countTrailingZeros(NonZeros);
8552 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8553 Op.getOperand(Idx));
8554 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8559 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8560 if (EVTBits == 8 && NumElems == 16)
8561 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8565 if (EVTBits == 16 && NumElems == 8)
8566 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8570 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8571 if (EVTBits == 32 && NumElems == 4)
8572 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8575 // If element VT is == 32 bits, turn it into a number of shuffles.
8576 if (NumElems == 4 && NumZero > 0) {
8577 SmallVector<SDValue, 8> Ops(NumElems);
8578 for (unsigned i = 0; i < 4; ++i) {
8579 bool isZero = !(NonZeros & (1ULL << i));
8581 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8583 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8586 for (unsigned i = 0; i < 2; ++i) {
8587 switch ((NonZeros >> (i*2)) & 0x3) {
8588 default: llvm_unreachable("Unexpected NonZero count");
8590 Ops[i] = Ops[i*2]; // Must be a zero vector.
8593 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8596 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8599 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8604 bool Reverse1 = (NonZeros & 0x3) == 2;
8605 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8609 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8610 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8612 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8615 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8617 // Check for a build vector from mostly shuffle plus few inserting.
8618 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8621 // For SSE 4.1, use insertps to put the high elements into the low element.
8622 if (Subtarget.hasSSE41()) {
8624 if (!Op.getOperand(0).isUndef())
8625 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8627 Result = DAG.getUNDEF(VT);
8629 for (unsigned i = 1; i < NumElems; ++i) {
8630 if (Op.getOperand(i).isUndef()) continue;
8631 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8632 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8637 // Otherwise, expand into a number of unpckl*; start by extending each of
8638 // our (non-undef) elements to the full vector width with the element in the
8639 // bottom slot of the vector (which generates no code for SSE).
8640 SmallVector<SDValue, 8> Ops(NumElems);
8641 for (unsigned i = 0; i < NumElems; ++i) {
8642 if (!Op.getOperand(i).isUndef())
8643 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8645 Ops[i] = DAG.getUNDEF(VT);
8648 // Next, we iteratively mix elements, e.g. for v4f32:
8649 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8650 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8651 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8652 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8653 // Generate scaled UNPCKL shuffle mask.
8654 SmallVector<int, 16> Mask;
8655 for(unsigned i = 0; i != Scale; ++i)
8657 for (unsigned i = 0; i != Scale; ++i)
8658 Mask.push_back(NumElems+i);
8659 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8661 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8662 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8667 // 256-bit AVX can use the vinsertf128 instruction
8668 // to create 256-bit vectors from two other 128-bit ones.
8669 // TODO: Detect subvector broadcast here instead of DAG combine?
8670 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8671 const X86Subtarget &Subtarget) {
8673 MVT ResVT = Op.getSimpleValueType();
8675 assert((ResVT.is256BitVector() ||
8676 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8678 unsigned NumOperands = Op.getNumOperands();
8679 unsigned NumZero = 0;
8680 unsigned NumNonZero = 0;
8681 unsigned NonZeros = 0;
8682 for (unsigned i = 0; i != NumOperands; ++i) {
8683 SDValue SubVec = Op.getOperand(i);
8684 if (SubVec.isUndef())
8686 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8689 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8695 // If we have more than 2 non-zeros, build each half separately.
8696 if (NumNonZero > 2) {
8697 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8698 ResVT.getVectorNumElements()/2);
8699 ArrayRef<SDUse> Ops = Op->ops();
8700 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8701 Ops.slice(0, NumOperands/2));
8702 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8703 Ops.slice(NumOperands/2));
8704 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8707 // Otherwise, build it up through insert_subvectors.
8708 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8709 : DAG.getUNDEF(ResVT);
8711 MVT SubVT = Op.getOperand(0).getSimpleValueType();
8712 unsigned NumSubElems = SubVT.getVectorNumElements();
8713 for (unsigned i = 0; i != NumOperands; ++i) {
8714 if ((NonZeros & (1 << i)) == 0)
8717 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
8719 DAG.getIntPtrConstant(i * NumSubElems, dl));
8725 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8726 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8727 static bool isExpandWithZeros(const SDValue &Op) {
8728 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8729 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8731 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8732 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8738 // Returns true if the given node is a type promotion (by concatenating i1
8739 // zeros) of the result of a node that already zeros all upper bits of
8740 // the k-register.
8741 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8742 unsigned Opc = Op.getOpcode();
8744 assert(Opc == ISD::CONCAT_VECTORS &&
8745 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8746 "Unexpected node to check for type promotion!");
8748 // As long as we are concatenating zeros to the upper part of a previous node
8749 // result, climb up the tree until a node with a different opcode is
8750 // found.
8751 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8752 if (Opc == ISD::INSERT_SUBVECTOR) {
8753 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8754 Op.getConstantOperandVal(2) == 0)
8755 Op = Op.getOperand(1);
8758 } else { // Opc == ISD::CONCAT_VECTORS
8759 if (isExpandWithZeros(Op))
8760 Op = Op.getOperand(0);
8764 Opc = Op.getOpcode();
8767 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8768 // of a node that zeros the upper bits (its masked version).
8769 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8770 (Op.getOpcode() == ISD::AND &&
8771 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8772 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8779 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
8780 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8781 const X86Subtarget &Subtarget,
8782 SelectionDAG & DAG) {
8784 MVT ResVT = Op.getSimpleValueType();
8785 unsigned NumOperands = Op.getNumOperands();
8787 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8788 "Unexpected number of operands in CONCAT_VECTORS");
8790 // If this node promotes - by concatenating zeroes - the type of the result
8791 // of a node with an instruction that zeroes all upper (irrelevant) bits of the
8792 // output register, mark it as legal and catch the pattern in instruction
8793 // selection to avoid emitting extra instructions (for zeroing upper bits).
8794 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
8795 return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
8797 unsigned NumZero = 0;
8798 unsigned NumNonZero = 0;
8799 uint64_t NonZeros = 0;
8800 for (unsigned i = 0; i != NumOperands; ++i) {
8801 SDValue SubVec = Op.getOperand(i);
8802 if (SubVec.isUndef())
8804 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8807 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8808 NonZeros |= (uint64_t)1 << i;
8814 // If there are zero or one non-zeros we can handle this very simply.
8815 if (NumNonZero <= 1) {
8816 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8817 : DAG.getUNDEF(ResVT);
8820 unsigned Idx = countTrailingZeros(NonZeros);
8821 SDValue SubVec = Op.getOperand(Idx);
8822 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8823 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8824 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8827 if (NumOperands > 2) {
8828 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8829 ResVT.getVectorNumElements()/2);
8830 ArrayRef<SDUse> Ops = Op->ops();
8831 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8832 Ops.slice(0, NumOperands/2));
8833 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8834 Ops.slice(NumOperands/2));
8835 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8838 assert(NumNonZero == 2 && "Simple cases not handled?");
8840 if (ResVT.getVectorNumElements() >= 16)
8841 return Op; // The operation is legal with KUNPCK
8843 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8844 DAG.getUNDEF(ResVT), Op.getOperand(0),
8845 DAG.getIntPtrConstant(0, dl));
8846 unsigned NumElems = ResVT.getVectorNumElements();
8847 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8848 DAG.getIntPtrConstant(NumElems/2, dl));
8851 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8852 const X86Subtarget &Subtarget,
8853 SelectionDAG &DAG) {
8854 MVT VT = Op.getSimpleValueType();
8855 if (VT.getVectorElementType() == MVT::i1)
8856 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8858 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8859 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8860 Op.getNumOperands() == 4)));
8862 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8863 // from two other 128-bit ones.
8865 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8866 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
8869 //===----------------------------------------------------------------------===//
8870 // Vector shuffle lowering
8872 // This is an experimental code path for lowering vector shuffles on x86. It is
8873 // designed to handle arbitrary vector shuffles and blends, gracefully
8874 // degrading performance as necessary. It works hard to recognize idiomatic
8875 // shuffles and lower them to optimal instruction patterns without leaving
8876 // a framework that allows reasonably efficient handling of all vector shuffle
8877 // operations.
8878 //===----------------------------------------------------------------------===//
8880 /// Tiny helper function to identify a no-op mask.
8882 /// This is a somewhat boring predicate function. It checks whether the mask
8883 /// array input, which is assumed to be a single-input shuffle mask of the kind
8884 /// used by the X86 shuffle instructions (not a fully general
8885 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8886 /// in-place shuffle are 'no-op's.
8887 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8888 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8889 assert(Mask[i] >= -1 && "Out of bound mask element!");
8890 if (Mask[i] >= 0 && Mask[i] != i)
8896 /// Test whether there are elements crossing 128-bit lanes in this
8897 /// shuffle mask.
8899 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8900 /// and we routinely test for these.
8901 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8902 int LaneSize = 128 / VT.getScalarSizeInBits();
8903 int Size = Mask.size();
8904 for (int i = 0; i < Size; ++i)
8905 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8910 /// Test whether a shuffle mask is equivalent within each sub-lane.
8912 /// This checks a shuffle mask to see if it is performing the same
8913 /// lane-relative shuffle in each sub-lane. This trivially implies
8914 /// that it is also not lane-crossing. It may however involve a blend from the
8915 /// same lane of a second vector.
8917 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8918 /// non-trivial to compute in the face of undef lanes. The representation is
8919 /// suitable for use with existing 128-bit shuffles as entries from the second
8920 /// vector have been remapped to [LaneSize, 2*LaneSize).
8921 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8923 SmallVectorImpl<int> &RepeatedMask) {
8924 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8925 RepeatedMask.assign(LaneSize, -1);
8926 int Size = Mask.size();
8927 for (int i = 0; i < Size; ++i) {
8928 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8931 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8932 // This entry crosses lanes, so there is no way to model this shuffle.
8935 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8936 // Adjust second vector indices to start at LaneSize instead of Size.
8937 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8938 : Mask[i] % LaneSize + LaneSize;
8939 if (RepeatedMask[i % LaneSize] < 0)
8940 // This is the first non-undef entry in this slot of a 128-bit lane.
8941 RepeatedMask[i % LaneSize] = LocalM;
8942 else if (RepeatedMask[i % LaneSize] != LocalM)
8943 // Found a mismatch with the repeated mask.
8949 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8951 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8952 SmallVectorImpl<int> &RepeatedMask) {
8953 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8957 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
8958 SmallVector<int, 32> RepeatedMask;
8959 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8962 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8964 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8965 SmallVectorImpl<int> &RepeatedMask) {
8966 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8969 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8970 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8971 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8973 SmallVectorImpl<int> &RepeatedMask) {
8974 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8975 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8976 int Size = Mask.size();
8977 for (int i = 0; i < Size; ++i) {
8978 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8979 if (Mask[i] == SM_SentinelUndef)
8981 if (Mask[i] == SM_SentinelZero) {
8982 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8984 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8987 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8988 // This entry crosses lanes, so there is no way to model this shuffle.
8991 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8992 // Adjust second vector indices to start at LaneSize instead of Size.
8994 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8995 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8996 // This is the first non-undef entry in this slot of a 128-bit lane.
8997 RepeatedMask[i % LaneSize] = LocalM;
8998 else if (RepeatedMask[i % LaneSize] != LocalM)
8999 // Found a mismatch with the repeated mask.
9005 /// Checks whether a shuffle mask is equivalent to an explicit list of
9006 /// arguments.
9008 /// This is a fast way to test a shuffle mask against a fixed pattern:
9010 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
9012 /// It returns true if the mask is exactly as wide as the argument list, and
9013 /// each element of the mask is either -1 (signifying undef) or the value given
9014 /// in the argument.
9015 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
9016 ArrayRef<int> ExpectedMask) {
9017 if (Mask.size() != ExpectedMask.size())
9020 int Size = Mask.size();
9022 // If the values are build vectors, we can look through them to find
9023 // equivalent inputs that make the shuffles equivalent.
9024 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
9025 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
9027 for (int i = 0; i < Size; ++i) {
9028 assert(Mask[i] >= -1 && "Out of bound mask element!");
9029 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
9030 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
9031 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
9032 if (!MaskBV || !ExpectedBV ||
9033 MaskBV->getOperand(Mask[i] % Size) !=
9034 ExpectedBV->getOperand(ExpectedMask[i] % Size))
9042 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
9044 /// The masks must be exactly the same width.
9046 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
9047 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
9049 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
9050 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
9051 ArrayRef<int> ExpectedMask) {
9052 int Size = Mask.size();
9053 if (Size != (int)ExpectedMask.size())
9056 for (int i = 0; i < Size; ++i)
9057 if (Mask[i] == SM_SentinelUndef)
9059 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
9061 else if (Mask[i] != ExpectedMask[i])
9067 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
9068 // mask.
9069 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
9070 const APInt &Zeroable) {
9071 int NumElts = Mask.size();
9072 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
9074 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
9075 for (int i = 0; i != NumElts; ++i) {
9077 if (M == SM_SentinelUndef)
9079 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
9080 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
9085 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
9086 // instructions.
9087 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
9088 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9091 SmallVector<int, 8> Unpcklwd;
9092 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9093 /* Unary = */ false);
9094 SmallVector<int, 8> Unpckhwd;
9095 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9096 /* Unary = */ false);
9097 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
9098 isTargetShuffleEquivalent(Mask, Unpckhwd));
9099 return IsUnpackwdMask;
9102 /// Get a 4-lane 8-bit shuffle immediate for a mask.
9104 /// This helper function produces an 8-bit shuffle immediate corresponding to
9105 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
9106 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
9107 /// example.
9109 /// NB: We rely heavily on "undef" masks preserving the input lane.
9110 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9111 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9112 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9113 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9114 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9115 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9118 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9119 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9120 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9121 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
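// Worked example: the mask <3, 1, 2, 0> encodes as
//   3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27.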
9125 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9126 SelectionDAG &DAG) {
9127 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9130 /// Compute whether each element of a shuffle is zeroable.
9132 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
9133 /// Either it is an undef element in the shuffle mask, the element of the input
9134 /// referenced is undef, or the element of the input referenced is known to be
9135 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
9136 /// as many lanes with this technique as possible to simplify the remaining
9137 /// shuffle.
9138 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
9139 SDValue V1, SDValue V2) {
9140 APInt Zeroable(Mask.size(), 0);
9141 V1 = peekThroughBitcasts(V1);
9142 V2 = peekThroughBitcasts(V2);
9144 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
9145 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
9147 int VectorSizeInBits = V1.getValueSizeInBits();
9148 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
9149 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
9151 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9153 // Handle the easy cases.
9154 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
9159 // Determine shuffle input and normalize the mask.
9160 SDValue V = M < Size ? V1 : V2;
9163 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
9164 if (V.getOpcode() != ISD::BUILD_VECTOR)
9167 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
9168 // the (larger) source element must be UNDEF/ZERO.
9169 if ((Size % V.getNumOperands()) == 0) {
9170 int Scale = Size / V->getNumOperands();
9171 SDValue Op = V.getOperand(M / Scale);
9172 if (Op.isUndef() || X86::isZeroNode(Op))
9174 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
9175 APInt Val = Cst->getAPIntValue();
9176 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9177 Val = Val.getLoBits(ScalarSizeInBits);
9180 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
9181 APInt Val = Cst->getValueAPF().bitcastToAPInt();
9182 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9183 Val = Val.getLoBits(ScalarSizeInBits);
9190 // If the BUILD_VECTOR has more elements, then all the (smaller) source
9191 // elements must be UNDEF or ZERO.
9192 if ((V.getNumOperands() % Size) == 0) {
9193 int Scale = V->getNumOperands() / Size;
9194 bool AllZeroable = true;
9195 for (int j = 0; j < Scale; ++j) {
9196 SDValue Op = V.getOperand((M * Scale) + j);
9197 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
9208 // The shuffle result is as follows:
9209 // 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements appear in ascending order.
9210 // Each element of Zeroable corresponds to a particular Mask element, as
9211 // described in the computeZeroableShuffleElements function.
9213 // The function looks for a sub-mask whose nonzero elements are in
9214 // increasing order. If such a sub-mask exists, the function returns true.
9215 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9216 ArrayRef<int> Mask, const EVT &VectorType,
9217 bool &IsZeroSideLeft) {
9218 int NextElement = -1;
9219 // Check if the Mask's nonzero elements are in increasing order.
9220 for (int i = 0, e = Mask.size(); i < e; i++) {
9221 // Check that the mask's zero elements are built from only zeros.
9222 assert(Mask[i] >= -1 && "Out of bound mask element!");
9227 // Find the lowest non-zero element.
9228 if (NextElement < 0) {
9229 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9230 IsZeroSideLeft = NextElement != 0;
9232 // Exit if the mask's non-zero elements are not in increasing order.
9233 if (NextElement != Mask[i])
9240 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9241 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9242 ArrayRef<int> Mask, SDValue V1,
9244 const APInt &Zeroable,
9245 const X86Subtarget &Subtarget,
9246 SelectionDAG &DAG) {
9247 int Size = Mask.size();
9248 int LaneSize = 128 / VT.getScalarSizeInBits();
9249 const int NumBytes = VT.getSizeInBits() / 8;
9250 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9252 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9253 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9254 (Subtarget.hasBWI() && VT.is512BitVector()));
9256 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9257 // Sign bit set in i8 mask means zero element.
9258 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9261 for (int i = 0; i < NumBytes; ++i) {
9262 int M = Mask[i / NumEltBytes];
9264 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9267 if (Zeroable[i / NumEltBytes]) {
9268 PSHUFBMask[i] = ZeroMask;
9272 // We can only use a single input of V1 or V2.
9273 SDValue SrcV = (M >= Size ? V2 : V1);
9279 // PSHUFB can't cross lanes, ensure this doesn't happen.
9280 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9284 M = M * NumEltBytes + (i % NumEltBytes);
9285 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9287 assert(V && "Failed to find a source input");
9289 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9290 return DAG.getBitcast(
9291 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9292 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9295 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9296 const X86Subtarget &Subtarget, SelectionDAG &DAG,
9299 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
9300 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
9301 const APInt &Zeroable,
9302 ArrayRef<int> Mask, SDValue &V1,
9303 SDValue &V2, SelectionDAG &DAG,
9304 const X86Subtarget &Subtarget) {
9305 bool IsLeftZeroSide = true;
9306 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9309 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9311 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9312 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9313 unsigned NumElts = VT.getVectorNumElements();
9314 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9315 "Unexpected number of vector elements");
9316 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9317 Subtarget, DAG, DL);
9318 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9319 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9320 return DAG.getSelect(DL, VT, VMask,
9321 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
9325 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9326 unsigned &UnpackOpcode, bool IsUnary,
9327 ArrayRef<int> TargetMask,
9328 const SDLoc &DL, SelectionDAG &DAG,
9329 const X86Subtarget &Subtarget) {
9330 int NumElts = VT.getVectorNumElements();
9332 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9333 for (int i = 0; i != NumElts; i += 2) {
9334 int M1 = TargetMask[i + 0];
9335 int M2 = TargetMask[i + 1];
9336 Undef1 &= (SM_SentinelUndef == M1);
9337 Undef2 &= (SM_SentinelUndef == M2);
9338 Zero1 &= isUndefOrZero(M1);
9339 Zero2 &= isUndefOrZero(M2);
  }

  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9342 "Zeroable shuffle detected");
9344 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9345 SmallVector<int, 64> Unpckl, Unpckh;
9346 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9347 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9348 UnpackOpcode = X86ISD::UNPCKL;
9349 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }
9354 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9355 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9356 UnpackOpcode = X86ISD::UNPCKH;
9357 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }
9362 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
9363 if (IsUnary && (Zero1 || Zero2)) {
9364 // Don't bother if we can blend instead.
9365 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;
9369 bool MatchLo = true, MatchHi = true;
9370 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9371 int M = TargetMask[i];
9373 // Ignore if the input is known to be zero or the index is undef.
9374 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;
9378 MatchLo &= (M == Unpckl[i]);
9379 MatchHi &= (M == Unpckh[i]);
9382 if (MatchLo || MatchHi) {
9383 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9384 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }
9390 // If a binary shuffle, commute and try again.
9392 ShuffleVectorSDNode::commuteMask(Unpckl);
9393 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9394 UnpackOpcode = X86ISD::UNPCKL;
9399 ShuffleVectorSDNode::commuteMask(Unpckh);
9400 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9401 UnpackOpcode = X86ISD::UNPCKH;
9410 // X86 has dedicated unpack instructions that can handle specific blend
9411 // operations: UNPCKH and UNPCKL.
9412 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9413 ArrayRef<int> Mask, SDValue V1,
9414 SDValue V2, SelectionDAG &DAG) {
9415 SmallVector<int, 8> Unpckl;
9416 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9417 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9418 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9420 SmallVector<int, 8> Unpckh;
9421 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9422 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9423 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9425 // Commute and try again.
9426 ShuffleVectorSDNode::commuteMask(Unpckl);
9427 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9428 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9430 ShuffleVectorSDNode::commuteMask(Unpckh);
9431 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
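// Illustrative example (not from the original source): for v4i32,
// createUnpackShuffleMask produces <0, 4, 1, 5> for UNPCKL and <2, 6, 3, 7>
// for UNPCKH, i.e. the interleaving of the low (resp. high) halves of the two
// inputs. A mask such as <4, 0, 5, 1> only matches after commuting V1 and V2.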
static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
                                      int Delta) {
9439 int Size = (int)Mask.size();
9440 int Split = Size / Delta;
9441 int TruncatedVectorStart = SwappedOps ? Size : 0;
9443 // Match for mask starting with e.g.: <8, 10, 12, 14,... or <0, 2, 4, 6,...
9444 if (!isSequentialOrUndefInRange(Mask, 0, Split, TruncatedVectorStart, Delta))
9447 // The rest of the mask should not refer to the truncated vector's elements.
9448 if (isAnyInRange(Mask.slice(Split, Size - Split), TruncatedVectorStart,
                   TruncatedVectorStart + Size))
    return false;

  return true;
}
9455 // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction.
9457 // An example is the following:
9459 // t0: ch = EntryToken
9460 // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0
9461 // t25: v4i32 = truncate t2
9462 // t41: v8i16 = bitcast t25
9463 // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16,
9464 // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0>
9465 // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21
9466 // t18: v2i64 = bitcast t51
9468 // Without avx512vl, this is lowered to:
9470 // vpmovqd %zmm0, %ymm0
9471 // vpshufb {{.*#+}} xmm0 =
9472 // xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
9474 // But when avx512vl is available, one can just use a single vpmovdw
9476 static SDValue lowerVectorShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
9477 MVT VT, SDValue V1, SDValue V2,
9479 const X86Subtarget &Subtarget) {
9480 if (VT != MVT::v16i8 && VT != MVT::v8i16)
9483 if (Mask.size() != VT.getVectorNumElements())
9486 bool SwappedOps = false;
9488 if (!ISD::isBuildVectorAllZeros(V2.getNode())) {
9489 if (!ISD::isBuildVectorAllZeros(V1.getNode()))
9498 // bitcast (truncate <8 x i32> %vec to <8 x i16>) to <16 x i8>
9499 // bitcast (truncate <4 x i64> %vec to <4 x i32>) to <8 x i16>
9501 // and similar ones.
9502 if (V1.getOpcode() != ISD::BITCAST)
9504 if (V1.getOperand(0).getOpcode() != ISD::TRUNCATE)
9507 SDValue Src = V1.getOperand(0).getOperand(0);
9508 MVT SrcVT = Src.getSimpleValueType();
9510 // The vptrunc** instructions truncating 128 bit and 256 bit vectors
9511 // are only available with avx512vl.
9512 if (!SrcVT.is512BitVector() && !Subtarget.hasVLX())
9515 // Down Convert Word to Byte is only available with avx512bw. The case with
9516 // 256-bit output doesn't contain a shuffle and is therefore not handled here.
9517 if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
9518 !Subtarget.hasBWI())
9521 // The first half/quarter of the mask should refer to every second/fourth
9522 // element of the vector truncated and bitcasted.
9523 if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
      !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
    return SDValue();

  return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
9530 // X86 has dedicated pack instructions that can handle specific truncation
9531 // operations: PACKSS and PACKUS.
9532 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
9533 SDValue &V2, unsigned &PackOpcode,
9534 ArrayRef<int> TargetMask,
9536 const X86Subtarget &Subtarget) {
9537 unsigned NumElts = VT.getVectorNumElements();
9538 unsigned BitSize = VT.getScalarSizeInBits();
9539 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
9540 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
9542 auto MatchPACK = [&](SDValue N1, SDValue N2) {
9543 SDValue VV1 = DAG.getBitcast(PackVT, N1);
9544 SDValue VV2 = DAG.getBitcast(PackVT, N2);
9545 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
9546 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
9547 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
9548 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
        V1 = VV1;
        V2 = VV2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }
9556 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
9557 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
      V1 = VV1;
      V2 = VV2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }
    return false;
  };
9567 // Try binary shuffle.
9568 SmallVector<int, 32> BinaryMask;
9569 createPackShuffleMask(VT, BinaryMask, false);
9570 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
9571 if (MatchPACK(V1, V2))
9574 // Try unary shuffle.
9575 SmallVector<int, 32> UnaryMask;
9576 createPackShuffleMask(VT, UnaryMask, true);
9577 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
    if (MatchPACK(V1, V1))
      return true;

  return false;
}
9584 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9585 ArrayRef<int> Mask, SDValue V1,
9586 SDValue V2, SelectionDAG &DAG,
9587 const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                                 Subtarget))
    return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                       DAG.getBitcast(PackVT, V2));

  return SDValue();
}
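// Note (illustrative): PACKSS/PACKUS narrow each 2N-bit source element to N
// bits with signed/unsigned saturation. The matcher above therefore only
// fires when the truncation is known to be lossless: PACKUS needs the upper N
// bits of every element to be zero, PACKSS needs more than N sign bits (the
// MaskedValueIsZero / ComputeNumSignBits checks).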
9598 /// Try to emit a bitmask instruction for a shuffle.
9600 /// This handles cases where we can model a blend exactly as a bitmask due to
9601 /// one of the inputs being zeroable.
9602 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9603 SDValue V2, ArrayRef<int> Mask,
9604 const APInt &Zeroable,
9605 SelectionDAG &DAG) {
9606 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9607 MVT EltVT = VT.getVectorElementType();
9608 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9609 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9610 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    // Zeroable (and undef) lanes keep the zero already in VMaskOps.
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
9616 return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
9619 else if (V != (Mask[i] < Size ? V1 : V2))
9620 return SDValue(); // Can only let one input through the mask.
9622 VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!
9627 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
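// Illustrative example (not from the original source): a v4i32 shuffle with
// mask <u, 1, 2, u> where lanes 0 and 3 are zeroable builds the constant mask
// {0, -1, -1, 0}, so the whole shuffle becomes a single AND of V1.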
9631 /// Try to emit a blend instruction for a shuffle using bit math.
9633 /// This is used as a fallback approach when first class blend instructions are
9634 /// unavailable. Currently it is only suitable for integer vectors, but could
9635 /// be generalized for floating point vectors if desirable.
9636 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9637 SDValue V2, ArrayRef<int> Mask,
9638 SelectionDAG &DAG) {
9639 assert(VT.isInteger() && "Only supports integer vector types!");
9640 MVT EltVT = VT.getVectorElementType();
9641 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9642 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9643 SmallVector<SDValue, 16> MaskOps;
9644 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9645 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9646 return SDValue(); // Shuffled input!
9647 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9650 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9651 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9652 // We have to cast V2 around.
9653 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9654 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9655 DAG.getBitcast(MaskVT, V1Mask),
9656 DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
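// Note (illustrative): the sequence above computes (V1 & M) | (V2 & ~M),
// using ANDNP for the second term. E.g. a v8i16 mask <0, 9, 2, 11, 4, 13, 6,
// 15> takes the even lanes from V1 and the odd lanes from V2.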
9660 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9661 SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);
9665 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9666 MutableArrayRef<int> TargetMask,
9667 bool &ForceV1Zero, bool &ForceV2Zero,
9668 uint64_t &BlendMask) {
9669 bool V1IsZeroOrUndef =
9670 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9671 bool V2IsZeroOrUndef =
9672 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9675 ForceV1Zero = false, ForceV2Zero = false;
9676 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9678 // Attempt to generate the binary blend mask. If an input is zero then
9679 // we can use any lane.
9680 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9681 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9682 int M = TargetMask[i];
9683 if (M == SM_SentinelUndef)
9687 if (M == i + Size) {
9688 BlendMask |= 1ull << i;
9691 if (M == SM_SentinelZero) {
9692 if (V1IsZeroOrUndef) {
9697 if (V2IsZeroOrUndef) {
9699 BlendMask |= 1ull << i;
9700 TargetMask[i] = i + Size;
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
9712 for (int i = 0; i != Size; ++i)
9713 if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
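// Illustrative example (not from the original source): scaling the blend mask
// 0b0110 (elements 1 and 2 taken from V2) by Scale == 2, e.g. when a v4i64
// blend is re-expressed as a v8i32 VPBLENDD, yields 0b00111100, selecting the
// corresponding pairs of narrower elements.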
9718 /// Try to emit a blend instruction for a shuffle.
9720 /// This doesn't do any checks for the availability of instructions for blending
9721 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9722 /// be matched in the backend with the type given. What it does check for is
9723 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9724 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9725 SDValue V2, ArrayRef<int> Original,
9726 const APInt &Zeroable,
9727 const X86Subtarget &Subtarget,
9728 SelectionDAG &DAG) {
9729 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9731 uint64_t BlendMask = 0;
9732 bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();
9737 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);
9743 switch (VT.SimpleTy) {
9748 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9749 DAG.getConstant(BlendMask, DL, MVT::i8));
9753 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9757 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9758 // that instruction.
9759 if (Subtarget.hasAVX2()) {
9760 // Scale the blend by the number of 32-bit dwords per element.
9761 int Scale = VT.getScalarSizeInBits() / 32;
9762 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9763 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9764 V1 = DAG.getBitcast(BlendVT, V1);
9765 V2 = DAG.getBitcast(BlendVT, V2);
9766 return DAG.getBitcast(
9767 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9768 DAG.getConstant(BlendMask, DL, MVT::i8)));
9772 // For integer shuffles we need to expand the mask and cast the inputs to
9773 // v8i16s prior to blending.
9774 int Scale = 8 / VT.getVectorNumElements();
9775 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9776 V1 = DAG.getBitcast(MVT::v8i16, V1);
9777 V2 = DAG.getBitcast(MVT::v8i16, V2);
9778 return DAG.getBitcast(VT,
9779 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9780 DAG.getConstant(BlendMask, DL, MVT::i8)));
9784 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9785 SmallVector<int, 8> RepeatedMask;
9786 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9787 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9788 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9790 for (int i = 0; i < 8; ++i)
9791 if (RepeatedMask[i] >= 8)
9792 BlendMask |= 1ull << i;
9793 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9794 DAG.getConstant(BlendMask, DL, MVT::i8));
9800 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9801 "256-bit byte-blends require AVX2 support!");
9803 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9805 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9806 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9807 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9810 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9811 if (SDValue Masked =
9812 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9815 // Scale the blend by the number of bytes per element.
9816 int Scale = VT.getScalarSizeInBits() / 8;
    // This form of blend is always done on bytes. Compute the byte vector
    // type.
9820 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9822 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9823 // mix of LLVM's code generator and the x86 backend. We tell the code
9824 // generator that boolean values in the elements of an x86 vector register
9825 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9826 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9827 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9828 // of the element (the remaining are ignored) and 0 in that high bit would
9829 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9830 // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over-constrained relative to x86's
    // actual model.
9833 SmallVector<SDValue, 32> VSELECTMask;
9834 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9835 for (int j = 0; j < Scale; ++j)
9836 VSELECTMask.push_back(
9837 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9838 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9841 V1 = DAG.getBitcast(BlendVT, V1);
9842 V2 = DAG.getBitcast(BlendVT, V2);
9843 return DAG.getBitcast(
9845 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9855 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9856 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
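// Illustrative example (not from the original source): a v4f32 shuffle with
// mask <0, 5, 2, 7> is a pure blend; matchVectorShuffleAsBlend produces
// BlendMask == 0b1010, which becomes the X86ISD::BLENDI immediate (a blendps
// on SSE4.1+ targets).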
9864 /// Try to lower as a blend of elements from two inputs followed by
9865 /// a single-input permutation.
9867 /// This matches the pattern where we can blend elements from two inputs and
9868 /// then reduce the shuffle to a single-input permutation.
9869 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9870 SDValue V1, SDValue V2,
9872 SelectionDAG &DAG) {
9873 // We build up the blend mask while checking whether a blend is a viable way
9874 // to reduce the shuffle.
9875 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9876 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9878 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9882 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9884 if (BlendMask[Mask[i] % Size] < 0)
9885 BlendMask[Mask[i] % Size] = Mask[i];
9886 else if (BlendMask[Mask[i] % Size] != Mask[i])
9887 return SDValue(); // Can't blend in the needed input!
9889 PermuteMask[i] = Mask[i] % Size;
9892 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9893 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9896 /// Generic routine to decompose a shuffle and blend into independent
9897 /// blends and permutes.
9899 /// This matches the extremely common pattern for handling combined
9900 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9901 /// operations. It will try to pick the best arrangement of shuffles and
9903 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9907 SelectionDAG &DAG) {
9908 // Shuffle the input elements into the desired positions in V1 and V2 and
9909 // blend them together.
9910 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9911 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9912 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9913 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9914 if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
9917 } else if (Mask[i] >= Size) {
9918 V2Mask[i] = Mask[i] - Size;
9919 BlendMask[i] = i + Size;
9922 // Try to lower with the simpler initial blend strategy unless one of the
9923 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9924 // shuffle may be able to fold with a load or other benefit. However, when
9925 // we'll have to do 2x as many shuffles in order to achieve this, blending
9926 // first is a better strategy.
9927 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9928 if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;
9932 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9933 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
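// Illustrative example (not from the original source): a v4i32 mask
// <2, 5, 0, 7> decomposes into V1Mask <2, u, 0, u>, V2Mask <u, 1, u, 3> and a
// final blend mask <0, 5, 2, 7>, i.e. two single-input shuffles followed by a
// blend.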
9937 /// Try to lower a vector shuffle as a rotation.
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9940 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9941 ArrayRef<int> Mask) {
9942 int NumElts = Mask.size();
9944 // We need to detect various ways of spelling a rotation:
9945 // [11, 12, 13, 14, 15, 0, 1, 2]
9946 // [-1, 12, 13, 14, -1, -1, 1, -1]
9947 // [-1, -1, -1, -1, -1, -1, 1, 2]
9948 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9949 // [-1, 4, 5, 6, -1, -1, 9, -1]
9950 // [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
9955 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9956 "Unexpected mask index.");
9960 // Determine where a rotated vector would have started.
9961 int StartIdx = i - (M % NumElts);
9963 // The identity rotation isn't interesting, stop.
9966 // If we found the tail of a vector the rotation must be the missing
9967 // front. If we found the head of a vector, it must be how much of the
9969 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9972 Rotation = CandidateRotation;
9973 else if (Rotation != CandidateRotation)
9974 // The rotations don't match, so we can't match this mask.
9977 // Compute which value this mask is pointing at.
9978 SDValue MaskV = M < NumElts ? V1 : V2;
9980 // Compute which of the two target values this index should be assigned
9981 // to. This reflects whether the high elements are remaining or the low
9982 // elements are remaining.
9983 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9985 // Either set up this value if we've not encountered it before, or check
9986 // that it remains consistent.
9989 else if (TargetV != MaskV)
9990 // This may be a rotation, but it pulls from the inputs in some
9991 // unsupported interleaving.
9995 // Check that we successfully analyzed the mask, and normalize the results.
9996 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9997 assert((Lo || Hi) && "Failed to find a rotated input vector!");
10009 /// Try to lower a vector shuffle as a byte rotation.
10011 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
10012 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
10013 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
10015 /// does not check for the profitability of lowering either as PALIGNR or
10016 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
10017 /// This matches shuffle vectors that look like:
10019 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
10021 /// Essentially it concatenates V1 and V2, shifts right by some number of
10022 /// elements, and takes the low elements as the result. Note that while this is
10023 /// specified as a *right shift* because x86 is little-endian, it is a *left
10024 /// rotate* of the vector lanes.
10025 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
10026 ArrayRef<int> Mask) {
10027 // Don't accept any shuffles with zero elements.
10028 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
10031 // PALIGNR works on 128-bit lanes.
10032 SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;
10040 // PALIGNR rotates bytes, so we need to scale the
10041 // rotation based on how many bytes are in the vector lane.
10042 int NumElts = RepeatedMask.size();
10043 int Scale = 16 / NumElts;
  return Rotation * Scale;
}
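// Illustrative example (not from the original source): the v8i16 mask
// <3, 4, 5, 6, 7, 8, 9, 10> matches an element rotation of 3; with 2 bytes
// per element this returns a byte rotation of 6, i.e. a PALIGNR with
// immediate 6 on SSSE3 targets.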
10047 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
10048 SDValue V1, SDValue V2,
10049 ArrayRef<int> Mask,
10050 const X86Subtarget &Subtarget,
10051 SelectionDAG &DAG) {
10052 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
10054 SDValue Lo = V1, Hi = V2;
10055 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
10056 if (ByteRotation <= 0)
10059 // Cast the inputs to i8 vector of correct length to match PALIGNR or
10061 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
10062 Lo = DAG.getBitcast(ByteVT, Lo);
10063 Hi = DAG.getBitcast(ByteVT, Hi);
10065 // SSSE3 targets can use the palignr instruction.
10066 if (Subtarget.hasSSSE3()) {
10067 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
10068 "512-bit PALIGNR requires BWI instructions");
10069 return DAG.getBitcast(
10070 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
10071 DAG.getConstant(ByteRotation, DL, MVT::i8)));
10074 assert(VT.is128BitVector() &&
10075 "Rotate-based lowering only supports 128-bit lowering!");
10076 assert(Mask.size() <= 16 &&
10077 "Can shuffle at most 16 bytes in a 128-bit vector!");
10078 assert(ByteVT == MVT::v16i8 &&
10079 "SSE2 rotate lowering only needed for v16i8!");
10081 // Default SSE2 implementation
10082 int LoByteShift = 16 - ByteRotation;
10083 int HiByteShift = ByteRotation;
10085 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
10086 DAG.getConstant(LoByteShift, DL, MVT::i8));
10087 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
10088 DAG.getConstant(HiByteShift, DL, MVT::i8));
10089 return DAG.getBitcast(VT,
10090 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
10093 /// Try to lower a vector shuffle as a dword/qword rotation.
10095 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
10096 /// rotation of the concatenation of two vectors; This routine will
/// try to generically lower a vector shuffle through such a pattern.
10099 /// Essentially it concatenates V1 and V2, shifts right by some number of
10100 /// elements, and takes the low elements as the result. Note that while this is
10101 /// specified as a *right shift* because x86 is little-endian, it is a *left
10102 /// rotate* of the vector lanes.
10103 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
10104 SDValue V1, SDValue V2,
10105 ArrayRef<int> Mask,
10106 const X86Subtarget &Subtarget,
10107 SelectionDAG &DAG) {
10108 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
10109 "Only 32-bit and 64-bit elements are supported!");
10111 // 128/256-bit vectors are only supported with VLX.
10112 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
10113 && "VLX required for 128/256-bit vectors");
10115 SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();
10120 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
10121 DAG.getConstant(Rotation, DL, MVT::i8));
10124 /// Try to lower a vector shuffle as a bit shift (shifts in zeros).
10126 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
10127 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
10128 /// matches elements from one of the input vectors shuffled to the left or
10129 /// right with zeroable elements 'shifted in'. It handles both the strictly
10130 /// bit-wise element shifts and the byte shift across an entire 128-bit double
10131 /// quad word lane.
10133 /// PSHL : (little-endian) left bit shift.
10134 /// [ zz, 0, zz, 2 ]
10135 /// [ -1, 4, zz, -1 ]
10136 /// PSRL : (little-endian) right bit shift.
10137 /// [ 1, zz, 3, zz]
10138 /// [ -1, -1, 7, zz]
10139 /// PSLLDQ : (little-endian) left byte shift
10140 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
10141 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
10142 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
10143 /// PSRLDQ : (little-endian) right byte shift
10144 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
10145 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
10146 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
10147 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
10148 unsigned ScalarSizeInBits,
10149 ArrayRef<int> Mask, int MaskOffset,
10150 const APInt &Zeroable,
10151 const X86Subtarget &Subtarget) {
10152 int Size = Mask.size();
10153 unsigned SizeInBits = Size * ScalarSizeInBits;
10155 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
10156 for (int i = 0; i < Size; i += Scale)
10157 for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;
    return true;
  };
10164 auto MatchShift = [&](int Shift, int Scale, bool Left) {
10165 for (int i = 0; i != Size; i += Scale) {
10166 unsigned Pos = Left ? i + Shift : i;
10167 unsigned Low = Left ? i : i + Shift;
10168 unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }
10173 int ShiftEltBits = ScalarSizeInBits * Scale;
10174 bool ByteShift = ShiftEltBits > 64;
10175 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
10176 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
10177 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
10179 // Normalize the scale for byte shifts to still produce an i64 element
10181 Scale = ByteShift ? Scale / 2 : Scale;
10183 // We need to round trip through the appropriate type for the shift.
10184 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10185 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10186 : MVT::getVectorVT(ShiftSVT, Size / Scale);
10187 return (int)ShiftAmt;
10190 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10191 // keep doubling the size of the integer elements up to that. We can
10192 // then shift the elements of the integer vector by whole multiples of
10193 // their width within the elements of the larger integer vector. Test each
10194 // multiple to see if we can find a match with the moved element indices
10195 // and that the shifted in elements are all zeroable.
10196 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
10197 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
10198 for (int Shift = 1; Shift != Scale; ++Shift)
10199 for (bool Left : {true, false})
10200 if (CheckZeros(Shift, Scale, Left)) {
10201 int ShiftAmt = MatchShift(Shift, Scale, Left);
10210 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10211 SDValue V2, ArrayRef<int> Mask,
10212 const APInt &Zeroable,
10213 const X86Subtarget &Subtarget,
10214 SelectionDAG &DAG) {
10215 int Size = Mask.size();
10216 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  MVT ShiftVT;
  unsigned Opcode;
  SDValue V = V1;

  // Try to match shuffle against V1 shift.
10223 int ShiftAmt = matchVectorShuffleAsShift(
10224 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
10226 // If V1 failed, try to match shuffle against V2 shift.
10227 if (ShiftAmt < 0) {
10229 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
10230 Mask, Size, Zeroable, Subtarget);
10237 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
10238 "Illegal integer vector type");
10239 V = DAG.getBitcast(ShiftVT, V);
10240 V = DAG.getNode(Opcode, DL, ShiftVT, V,
10241 DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
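// Illustrative example (not from the original source): the v4i32 mask
// <z, 0, z, 2> (where 'z' marks zeroable lanes) matches with Scale == 2 and
// Shift == 1, so it is lowered as a VSHLI of the vector reinterpreted as
// v2i64, shifting each 64-bit element left by 32 bits.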
10245 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
10246 // Remainder of lower half result is zero and upper half is all undef.
10247 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
10248 ArrayRef<int> Mask, uint64_t &BitLen,
10249 uint64_t &BitIdx, const APInt &Zeroable) {
10250 int Size = Mask.size();
10251 int HalfSize = Size / 2;
10252 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10253 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
10255 // Upper half must be undefined.
10256 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10259 // Determine the extraction length from the part of the
10260 // lower half that isn't zeroable.
10261 int Len = HalfSize;
10262 for (; Len > 0; --Len)
10263 if (!Zeroable[Len - 1])
10265 assert(Len > 0 && "Zeroable shuffle mask");
10267 // Attempt to match first Len sequential elements from the lower half.
10270 for (int i = 0; i != Len; ++i) {
10272 if (M == SM_SentinelUndef)
10274 SDValue &V = (M < Size ? V1 : V2);
10277 // The extracted elements must start at a valid index and all mask
10278 // elements must be in the lower half.
10279 if (i > M || M >= HalfSize)
10282 if (Idx < 0 || (Src == V && Idx == (M - i))) {
10290 if (!Src || Idx < 0)
10293 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
10294 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10295 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10300 // INSERTQ: Extract lowest Len elements from lower half of second source and
10301 // insert over first source, starting at Idx.
10302 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
10303 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
10304 ArrayRef<int> Mask, uint64_t &BitLen,
10305 uint64_t &BitIdx) {
10306 int Size = Mask.size();
10307 int HalfSize = Size / 2;
10308 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10310 // Upper half must be undefined.
10311 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10314 for (int Idx = 0; Idx != HalfSize; ++Idx) {
10317 // Attempt to match first source from mask before insertion point.
10318 if (isUndefInRange(Mask, 0, Idx)) {
10320 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
10322 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
10328 // Extend the extraction length looking to match both the insertion of
10329 // the second source and the remaining elements of the first.
10330 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
10332 int Len = Hi - Idx;
10334 // Match insertion.
10335 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
10337 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
10343 // Match the remaining elements of the lower half.
10344 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
10346 } else if ((!Base || (Base == V1)) &&
10347 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
10349 } else if ((!Base || (Base == V2)) &&
10350 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
10357 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10358 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10368 /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
10369 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
10370 SDValue V2, ArrayRef<int> Mask,
10371 const APInt &Zeroable,
10372 SelectionDAG &DAG) {
10373 uint64_t BitLen, BitIdx;
10374 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
10375 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
10376 DAG.getConstant(BitLen, DL, MVT::i8),
10377 DAG.getConstant(BitIdx, DL, MVT::i8));
10379 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
10380 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
10381 V2 ? V2 : DAG.getUNDEF(VT),
10382 DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
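// Note (illustrative): the EXTRQI/INSERTQI operands are expressed in bits, so
// e.g. extracting the three low i16 elements of a v8i16 gives BitLen == 48
// and BitIdx == 0 (both values are masked to 6 bits by the matchers above).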
10388 /// Lower a vector shuffle as a zero or any extension.
10390 /// Given a specific number of elements, element bit width, and extension
10391 /// stride, produce either a zero or any extension based on the available
10392 /// features of the subtarget. The extended elements are consecutive and
/// begin at an element index that may be offset into the input; to avoid
/// excess shuffling the offset must either be in the bottom lane or at the
/// start of a higher lane. All extended elements must be from the same lane.
10397 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10398 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
10399 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10400 assert(Scale > 1 && "Need a scale to extend.");
10401 int EltBits = VT.getScalarSizeInBits();
10402 int NumElements = VT.getVectorNumElements();
10403 int NumEltsPerLane = 128 / EltBits;
10404 int OffsetLane = Offset / NumEltsPerLane;
10405 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
10406 "Only 8, 16, and 32 bit elements can be extended.");
10407 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
10408 assert(0 <= Offset && "Extension offset must be positive.");
10409 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
10410 "Extension offset must be in the first lane or start an upper lane.");
10412 // Check that an index is in same lane as the base offset.
10413 auto SafeOffset = [&](int Idx) {
10414 return OffsetLane == (Idx / NumEltsPerLane);
10417 // Shift along an input so that the offset base moves to the first element.
10418 auto ShuffleOffset = [&](SDValue V) {
10422 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10423 for (int i = 0; i * Scale < NumElements; ++i) {
10424 int SrcIdx = i + Offset;
10425 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10427 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10430 // Found a valid zext mask! Try various lowering strategies based on the
10431 // input type and available ISA extensions.
10432 if (Subtarget.hasSSE41()) {
10433 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10434 // PUNPCK will catch this in a later shuffle match.
10435 if (Offset && Scale == 2 && VT.is128BitVector())
10437 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10438 NumElements / Scale);
10439 InputV = ShuffleOffset(InputV);
10440 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10441 return DAG.getBitcast(VT, InputV);
10444 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10446 // For any extends we can cheat for larger element sizes and use shuffle
10447 // instructions that can fold with a load and/or copy.
10448 if (AnyExt && EltBits == 32) {
10449 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10451 return DAG.getBitcast(
10452 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10453 DAG.getBitcast(MVT::v4i32, InputV),
10454 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10456 if (AnyExt && EltBits == 16 && Scale > 2) {
10457 int PSHUFDMask[4] = {Offset / 2, -1,
10458 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10459 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10460 DAG.getBitcast(MVT::v4i32, InputV),
10461 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10462 int PSHUFWMask[4] = {1, -1, -1, -1};
10463 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10464 return DAG.getBitcast(
10465 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10466 DAG.getBitcast(MVT::v8i16, InputV),
10467 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10470 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
10472 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10473 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10474 assert(VT.is128BitVector() && "Unexpected vector width!");
10476 int LoIdx = Offset * EltBits;
10477 SDValue Lo = DAG.getBitcast(
10478 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10479 DAG.getConstant(EltBits, DL, MVT::i8),
10480 DAG.getConstant(LoIdx, DL, MVT::i8)));
10482 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10483 !SafeOffset(Offset + 1))
10484 return DAG.getBitcast(VT, Lo);
10486 int HiIdx = (Offset + 1) * EltBits;
10487 SDValue Hi = DAG.getBitcast(
10488 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10489 DAG.getConstant(EltBits, DL, MVT::i8),
10490 DAG.getConstant(HiIdx, DL, MVT::i8)));
10491 return DAG.getBitcast(VT,
10492 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10495 // If this would require more than 2 unpack instructions to expand, use
10496 // pshufb when available. We can only use more than 2 unpack instructions
10497 // when zero extending i8 elements which also makes it easier to use pshufb.
10498 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10499 assert(NumElements == 16 && "Unexpected byte vector width!");
10500 SDValue PSHUFBMask[16];
10501 for (int i = 0; i < 16; ++i) {
10502 int Idx = Offset + (i / Scale);
10503 PSHUFBMask[i] = DAG.getConstant(
10504 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10506 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10507 return DAG.getBitcast(
10508 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10509 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10512 // If we are extending from an offset, ensure we start on a boundary that
10513 // we can unpack from.
10514 int AlignToUnpack = Offset % (NumElements / Scale);
10515 if (AlignToUnpack) {
10516 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10517 for (int i = AlignToUnpack; i < NumElements; ++i)
10518 ShMask[i - AlignToUnpack] = i;
10519 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10520 Offset -= AlignToUnpack;
  // Otherwise emit a sequence of unpacks.
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
10526 if (Offset >= (NumElements / 2)) {
10527 UnpackLoHi = X86ISD::UNPCKH;
10528 Offset -= (NumElements / 2);
10531 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10532 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10533 : getZeroVector(InputVT, Subtarget, DAG, DL);
10534 InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
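// Illustrative example (not from the original source): without SSE4.1, zero
// extending the low four bytes of a v16i8 to i32 (Scale == 4, Offset == 0)
// takes two rounds of the unpack loop above: conceptually a punpcklbw with
// zero followed by a punpcklwd with zero.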
10543 /// Try to lower a vector shuffle as a zero extension on any microarch.
10545 /// This routine will try to do everything in its power to cleverly lower
10546 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10547 /// check for the profitability of this lowering, it tries to aggressively
10548 /// match this pattern. It will use all of the micro-architectural details it
10549 /// can to emit an efficient lowering. It handles both blends with all-zero
10550 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10551 /// masking out later).
10553 /// The reason we have dedicated lowering for zext-style shuffles is that they
10554 /// are both incredibly common and often quite performance sensitive.
10555 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10556 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10557 const APInt &Zeroable, const X86Subtarget &Subtarget,
10558 SelectionDAG &DAG) {
10559 int Bits = VT.getSizeInBits();
10560 int NumLanes = Bits / 128;
10561 int NumElements = VT.getVectorNumElements();
10562 int NumEltsPerLane = NumElements / NumLanes;
10563 assert(VT.getScalarSizeInBits() <= 32 &&
10564 "Exceeds 32-bit integer zero extension limit");
10565 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10567 // Define a helper function to check a particular ext-scale and lower to it if
10569 auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
10574 for (int i = 0; i < NumElements; ++i) {
10577 continue; // Valid anywhere but doesn't tell us anything.
10578 if (i % Scale != 0) {
10579 // Each of the extended elements need to be zeroable.
10583 // We no longer are in the anyext case.
10588 // Each of the base elements needs to be consecutive indices into the
10589 // same input vector.
10590 SDValue V = M < NumElements ? V1 : V2;
10591 M = M % NumElements;
10594 Offset = M - (i / Scale);
10595 } else if (InputV != V)
10596 return SDValue(); // Flip-flopping inputs.
10598 // Offset must start in the lowest 128-bit lane or at the start of an
10600 // FIXME: Is it ever worth allowing a negative base offset?
10601 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10602 (Offset % NumEltsPerLane) == 0))
10605 // If we are offsetting, all referenced entries must come from the same
10607 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10610 if ((M % NumElements) != (Offset + (i / Scale)))
10611 return SDValue(); // Non-consecutive strided elements.
10615 // If we fail to find an input, we have a zero-shuffle which should always
10616 // have already been handled.
10617 // FIXME: Maybe handle this here in case during blending we end up with one?
10621 // If we are offsetting, don't extend if we only match a single input, we
10622 // can always do better by using a basic PSHUF or PUNPCK.
10623 if (Offset != 0 && Matches < 2)
10626 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10627 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10630 // The widest scale possible for extending is to a 64-bit integer.
10631 assert(Bits % 64 == 0 &&
10632 "The number of bits in a vector must be divisible by 64 on x86!");
10633 int NumExtElements = Bits / 64;
10635 // Each iteration, try extending the elements half as much, but into twice as
10637 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10638 assert(NumElements % NumExtElements == 0 &&
10639 "The input vector size must be divisible by the extended size.");
10640 if (SDValue V = Lower(NumElements / NumExtElements))
10644 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10648 // Returns one of the source operands if the shuffle can be reduced to a
10649 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10650 auto CanZExtLowHalf = [&]() {
10651 for (int i = NumElements / 2; i != NumElements; ++i)
10654 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
10656 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
10661 if (SDValue V = CanZExtLowHalf()) {
10662 V = DAG.getBitcast(MVT::v2i64, V);
10663 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
10664 return DAG.getBitcast(VT, V);
10667 // No viable ext lowering found.
10671 /// Try to get a scalar value for a specific element of a vector.
10673 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10674 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10675 SelectionDAG &DAG) {
10676 MVT VT = V.getSimpleValueType();
10677 MVT EltVT = VT.getVectorElementType();
10678 V = peekThroughBitcasts(V);
10680 // If the bitcasts shift the element size, we can't extract an equivalent
10681 // element from it.
10682 MVT NewVT = V.getSimpleValueType();
10683 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10686 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10687 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10688 // Ensure the scalar operand is the same size as the destination.
10689 // FIXME: Add support for scalar truncation where possible.
10690 SDValue S = V.getOperand(Idx);
10691 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
10692 return DAG.getBitcast(EltVT, S);
10698 /// Helper to test for a load that can be folded with x86 shuffles.
10700 /// This is particularly important because the set of instructions varies
10701 /// significantly based on whether the operand is a load or not.
10702 static bool isShuffleFoldableLoad(SDValue V) {
10703 V = peekThroughBitcasts(V);
10704 return ISD::isNON_EXTLoad(V.getNode());
10707 /// Try to lower insertion of a single element into a zero vector.
/// This is a common pattern for which we have especially efficient patterns
/// to lower across all subtarget feature sets.
10711 static SDValue lowerVectorShuffleAsElementInsertion(
10712 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10713 const APInt &Zeroable, const X86Subtarget &Subtarget,
10714 SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
10721 bool IsV1Zeroable = true;
10722 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10723 if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }
10728 // Check for a single input from a SCALAR_TO_VECTOR node.
10729 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10730 // all the smarts here sunk into that routine. However, the current
10731 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10732 // vector shuffle lowering is dead.
10733 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10735 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10736 // We need to zext the scalar if it is smaller than an i32.
10737 V2S = DAG.getBitcast(EltVT, V2S);
10738 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10739 // Using zext to expand a narrow element won't work for non-zero
10744 // Zero-extend directly to i32.
10745 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10746 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10748 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10749 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10750 EltVT == MVT::i16) {
10751 // Either not inserting from the low element of the input or the input
10752 // element size is too small to use VZEXT_MOVL to clear the high bits.
10756 if (!IsV1Zeroable) {
10757 // If V1 can't be treated as a zero vector we have fewer options to lower
10758 // this. We can't support integer vectors or non-zero targets cheaply, and
10759 // the V1 elements can't be permuted in any way.
10760 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10761 if (!VT.isFloatingPoint() || V2Index != 0)
10763 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10764 V1Mask[V2Index] = -1;
10765 if (!isNoopShuffleMask(V1Mask))
10767 if (!VT.is128BitVector())
10770 // Otherwise, use MOVSD or MOVSS.
10771 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10772 "Only two types of floating point element types to handle!");
10773 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10777 // This lowering only works for the low element with floating point vectors.
10778 if (VT.isFloatingPoint() && V2Index != 0)
10781 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10783 V2 = DAG.getBitcast(VT, V2);
10785 if (V2Index != 0) {
10786 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10787 // the desired position. Otherwise it is more efficient to do a vector
10788 // shift left. We know that we can do a vector shift left because all
10789 // the inputs are zero.
10790 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10791 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10792 V2Shuffle[V2Index] = 0;
10793 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10795 V2 = DAG.getBitcast(MVT::v16i8, V2);
10797 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10798 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL, MVT::i8));
10799 V2 = DAG.getBitcast(VT, V2);
10805 /// Try to lower broadcast of a single - truncated - integer element,
10806 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10808 /// This assumes we have AVX2.
10809 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10810 SDValue V0, int BroadcastIdx,
10811 const X86Subtarget &Subtarget,
10812 SelectionDAG &DAG) {
10813 assert(Subtarget.hasAVX2() &&
10814 "We can only lower integer broadcasts with AVX2!");
10816 EVT EltVT = VT.getVectorElementType();
10817 EVT V0VT = V0.getValueType();
10819 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10820 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10822 EVT V0EltVT = V0VT.getVectorElementType();
10823 if (!V0EltVT.isInteger())
10826 const unsigned EltSize = EltVT.getSizeInBits();
10827 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10829 // This is only a truncation if the original element type is larger.
10830 if (V0EltSize <= EltSize)
10833 assert(((V0EltSize % EltSize) == 0) &&
10834 "Scalar type sizes must all be powers of 2 on x86!");
10836 const unsigned V0Opc = V0.getOpcode();
10837 const unsigned Scale = V0EltSize / EltSize;
10838 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10840 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10841 V0Opc != ISD::BUILD_VECTOR)
10844 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10846 // If we're extracting non-least-significant bits, shift so we can truncate.
10847 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10848 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10849 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10850 if (const int OffsetIdx = BroadcastIdx % Scale)
10851 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10852 DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));
10854 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
10855 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
10858 /// Try to lower broadcast of a single element.
10860 /// For convenience, this code also bundles all of the subtarget feature set
10861 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10862 /// a convenient way to factor it out.
10863 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10864 SDValue V1, SDValue V2,
10865 ArrayRef<int> Mask,
10866 const X86Subtarget &Subtarget,
10867 SelectionDAG &DAG) {
10868 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10869 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10870 (Subtarget.hasAVX2() && VT.isInteger())))
10873 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10874 // we can only broadcast from a register with AVX2.
10875 unsigned NumElts = Mask.size();
10876 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10878 : X86ISD::VBROADCAST;
10879 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10881 // Check that the mask is a broadcast.
10882 int BroadcastIdx = -1;
10883 for (int i = 0; i != (int)NumElts; ++i) {
10884 SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }
  if (BroadcastIdx < 0)
    return SDValue();
10893 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10894 "a sorted mask where the broadcast "
10897 // Go up the chain of (vector) values to find a scalar load that we can
10898 // combine with the broadcast.
10901 switch (V.getOpcode()) {
10902 case ISD::BITCAST: {
10903 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10904 SDValue VSrc = V.getOperand(0);
10905 unsigned NumEltBits = V.getScalarValueSizeInBits();
10906 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10907 if ((NumEltBits % NumSrcBits) == 0)
10908 BroadcastIdx *= (NumEltBits / NumSrcBits);
10909 else if ((NumSrcBits % NumEltBits) == 0 &&
10910 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10911 BroadcastIdx /= (NumSrcBits / NumEltBits);
10917 case ISD::CONCAT_VECTORS: {
10918 int OperandSize = Mask.size() / V.getNumOperands();
10919 V = V.getOperand(BroadcastIdx / OperandSize);
10920 BroadcastIdx %= OperandSize;
10923 case ISD::INSERT_SUBVECTOR: {
10924 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10925 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10929 int BeginIdx = (int)ConstantIdx->getZExtValue();
10931 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10932 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10933 BroadcastIdx -= BeginIdx;
10944 // Ensure the source vector and BroadcastIdx are for a suitable type.
10945 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10946 unsigned NumEltBits = VT.getScalarSizeInBits();
10947 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10948 if ((NumSrcBits % NumEltBits) == 0)
10949 BroadcastIdx *= (NumSrcBits / NumEltBits);
10950 else if ((NumEltBits % NumSrcBits) == 0 &&
10951 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10952 BroadcastIdx /= (NumEltBits / NumSrcBits);
10956 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10957 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10958 V = DAG.getBitcast(SrcVT, V);
10961 // Check if this is a broadcast of a scalar. We special case lowering
10962 // for scalars so that we can more effectively fold with loads.
10963 // First, look through bitcast: if the original value has a larger element
10964 // type than the shuffle, the broadcast element is in essence truncated.
10965 // Make that explicit to ease folding.
10966 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10967 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10968 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10969 return TruncBroadcast;
10971 MVT BroadcastVT = VT;
10973 // Peek through any bitcast (only useful for loads).
10974 SDValue BC = peekThroughBitcasts(V);
10976 // Also check the simpler case, where we can directly reuse the scalar.
10977 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10978 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10979 V = V.getOperand(BroadcastIdx);
10981 // If we can't broadcast from a register, check that the input is a load.
10982 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10984 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10985 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10986 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10987 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10988 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10993 // If we are broadcasting a load that is only used by the shuffle
10994 // then we can reduce the vector load to the broadcasted scalar load.
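// (For example, broadcasting element 2 of a loaded v4f32 becomes a single f32
// load from BaseAddr plus an offset of 8 bytes.)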
10995 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10996 SDValue BaseAddr = Ld->getOperand(1);
10997 EVT SVT = BroadcastVT.getScalarType();
10998 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10999 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
11000 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
11001 DAG.getMachineFunction().getMachineMemOperand(
11002 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
11003 DAG.makeEquivalentMemoryOrdering(Ld, V);
11004 } else if (!BroadcastFromReg) {
11005 // We can't broadcast from a vector register.
11007 } else if (BroadcastIdx != 0) {
11008 // We can only broadcast from the zero-element of a vector register,
11009 // but it can be advantageous to broadcast from the zero-element of a subvector.
11011 if (!VT.is256BitVector() && !VT.is512BitVector())
11014 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
11015 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11018 // Only broadcast the zero-element of a 128-bit subvector.
11019 unsigned EltSize = VT.getScalarSizeInBits();
11020 if (((BroadcastIdx * EltSize) % 128) != 0)
11023 // The shuffle input might have been a bitcast we looked through; look at
11024 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
11025 // later bitcast it to BroadcastVT.
11026 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11027 "Unexpected vector element size");
11028 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
11029 "Unexpected vector size");
11030 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
11033 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
11034 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
11035 DAG.getBitcast(MVT::f64, V));
11037 // Bitcast back to the same scalar type as BroadcastVT.
11038 MVT SrcVT = V.getSimpleValueType();
11039 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
11040 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
11041 "Unexpected vector element size");
11042 if (SrcVT.isVector()) {
11043 unsigned NumSrcElts = SrcVT.getVectorNumElements();
11044 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
11046 SrcVT = BroadcastVT.getScalarType();
11048 V = DAG.getBitcast(SrcVT, V);
11051 // 32-bit targets need to load i64 as an f64 and then bitcast the result.
11052 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
11053 V = DAG.getBitcast(MVT::f64, V);
11054 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
11055 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
11058 // We only support broadcasting from 128-bit vectors to minimize the
11059 // number of patterns we need to deal with in isel. So extract down to
11060 // 128-bits, removing as many bitcasts as possible.
11061 if (SrcVT.getSizeInBits() > 128) {
11062 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
11063 128 / SrcVT.getScalarSizeInBits());
11064 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
11065 V = DAG.getBitcast(ExtVT, V);
11068 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
11071 // Check for whether we can use INSERTPS to perform the shuffle. We only use
11072 // INSERTPS when the V1 elements are already in the correct locations
11073 // because otherwise we can always just use two SHUFPS instructions, which
11074 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
11075 // perform INSERTPS if a single V1 element is out of place and all V2
11076 // elements are zeroable.
11077 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
11078 unsigned &InsertPSMask,
11079 const APInt &Zeroable,
11080 ArrayRef<int> Mask,
11081 SelectionDAG &DAG) {
11082 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
11083 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
11084 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11086 // Attempt to match INSERTPS with one element from VA or VB being
11087 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask will be updated.
11089 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
11090 ArrayRef<int> CandidateMask) {
11091 unsigned ZMask = 0;
11092 int VADstIndex = -1;
11093 int VBDstIndex = -1;
11094 bool VAUsedInPlace = false;
11096 for (int i = 0; i < 4; ++i) {
11097 // Synthesize a zero mask from the zeroable elements (includes undefs).
11103 // Flag if we use any VA inputs in place.
11104 if (i == CandidateMask[i]) {
11105 VAUsedInPlace = true;
11109 // We can only insert a single non-zeroable element.
11110 if (VADstIndex >= 0 || VBDstIndex >= 0)
11113 if (CandidateMask[i] < 4) {
11114 // VA input out of place for insertion.
11117 // VB input for insertion.
11122 // Don't bother if we have no (non-zeroable) element for insertion.
11123 if (VADstIndex < 0 && VBDstIndex < 0)
11126 // Determine element insertion src/dst indices. The src index is from the
11127 // start of the inserted vector, not the start of the concatenated vector.
11128 unsigned VBSrcIndex = 0;
11129 if (VADstIndex >= 0) {
11130 // If we have a VA input out of place, we use VA as the V2 element
11131 // insertion and don't use the original V2 at all.
11132 VBSrcIndex = CandidateMask[VADstIndex];
11133 VBDstIndex = VADstIndex;
11136 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
11139 // If no V1 inputs are used in place, then the result is created only from
11140 // the zero mask and the V2 insertion - so remove V1 dependency.
11141 if (!VAUsedInPlace)
11142 VA = DAG.getUNDEF(MVT::v4f32);
11144 // Update V1, V2 and InsertPSMask accordingly.
11148 // Insert the V2 element into the desired position.
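// The INSERTPS immediate encodes the source element of V2 in bits [7:6], the
// destination slot in bits [5:4], and the zero mask in bits [3:0].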
11149 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
11150 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
11154 if (matchAsInsertPS(V1, V2, Mask))
11157 // Commute and try again.
11158 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
11159 ShuffleVectorSDNode::commuteMask(CommutedMask);
11160 if (matchAsInsertPS(V2, V1, CommutedMask))
11166 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
11167 SDValue V2, ArrayRef<int> Mask,
11168 const APInt &Zeroable,
11169 SelectionDAG &DAG) {
11170 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11171 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11173 // Attempt to match the insertps pattern.
11174 unsigned InsertPSMask;
11175 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11178 // Insert the V2 element into the desired position.
11179 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11180 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11183 /// Try to lower a shuffle as a permute of the inputs followed by an
11184 /// UNPCK instruction.
11186 /// This specifically targets cases where we end up alternating between
11187 /// the two inputs, and so can permute them into something that feeds a single
11188 /// UNPCK instruction. Note that this routine only targets integer vectors
11189 /// because for floating point vectors we have a generalized SHUFPS lowering
11190 /// strategy that handles everything that doesn't *exactly* match an unpack,
11191 /// making this clever lowering unnecessary.
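/// For example, the v4i32 mask <0, 4, 2, 6> can be handled by permuting each
/// input so its elements 0 and 2 sit in its low dwords and then using a single
/// UNPCKL, which interleaves the two permuted inputs back into <0, 4, 2, 6>.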
11192 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11193 SDValue V1, SDValue V2,
11194 ArrayRef<int> Mask,
11195 SelectionDAG &DAG) {
11196 assert(!VT.isFloatingPoint() &&
11197 "This routine only supports integer vectors.");
11198 assert(VT.is128BitVector() &&
11199 "This routine only works on 128-bit vectors.");
11200 assert(!V2.isUndef() &&
11201 "This routine should only be used when blending two inputs.");
11202 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11204 int Size = Mask.size();
11207 int NumLoInputs = count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11209 int NumHiInputs = count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11211 bool UnpackLo = NumLoInputs >= NumHiInputs;
11213 auto TryUnpack = [&](int ScalarSize, int Scale) {
11214 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11215 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11217 for (int i = 0; i < Size; ++i) {
11221 // Each element of the unpack contains Scale elements from this mask.
11222 int UnpackIdx = i / Scale;
11224 // We only handle the case where V1 feeds the first slots of the unpack.
11225 // We rely on canonicalization to ensure this is the case.
11226 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11229 // Setup the mask for this input. The indexing is tricky as we have to
11230 // handle the unpack stride.
11231 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11232 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = Mask[i] % Size;
11236 // If we will have to shuffle both inputs to use the unpack, check whether
11237 // we can just unpack first and shuffle the result. If so, skip this unpack.
11238 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11239 !isNoopShuffleMask(V2Mask))
11242 // Shuffle the inputs into place.
11243 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11244 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11246 // Cast the inputs to the type we will use to unpack them.
11247 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11248 V1 = DAG.getBitcast(UnpackVT, V1);
11249 V2 = DAG.getBitcast(UnpackVT, V2);
11251 // Unpack the inputs and cast the result back to the desired type.
11252 return DAG.getBitcast(
11253 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11254 UnpackVT, V1, V2));
11257 // We try each unpack from the largest to the smallest to try and find one
11258 // that fits this mask.
11259 int OrigScalarSize = VT.getScalarSizeInBits();
11260 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11261 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11264 // If none of the unpack-rooted lowerings worked (or were profitable), try an initial unpack.
11266 if (NumLoInputs == 0 || NumHiInputs == 0) {
11267 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11268 "We have to have *some* inputs!");
11269 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11271 // FIXME: We could consider the total complexity of the permute of each
11272 // possible unpacking. Or at the least we should consider how many
11273 // half-crossings are created.
11274 // FIXME: We could consider commuting the unpacks.
11276 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11277 for (int i = 0; i < Size; ++i) {
11281 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11284 PermMask[i] = 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11286 return DAG.getVectorShuffle(
11287 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
11289 DAG.getUNDEF(VT), PermMask);
11295 /// Handle lowering of 2-lane 64-bit floating point shuffles.
11297 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
11298 /// support for floating point shuffles but not integer shuffles. These
11299 /// instructions will incur a domain crossing penalty on some chips though so
11300 /// it is better to avoid lowering through this for integer vectors where possible.
11302 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11303 const APInt &Zeroable,
11304 SDValue V1, SDValue V2,
11305 const X86Subtarget &Subtarget,
11306 SelectionDAG &DAG) {
11307 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11308 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11309 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11311 if (V2.isUndef()) {
11312 // Check for being able to broadcast a single element.
11313 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11314 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
11317 // Straight shuffle of a single input vector. Simulate this by using the
11318 // single input as both of the "inputs" to this instruction.
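// The SHUFPD immediate uses bit 0 to pick the lane-0 element from the first
// operand and bit 1 to pick the lane-1 element from the second operand; e.g.
// the single-input mask <1, 1> encodes as 0b11.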
11319 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
11321 if (Subtarget.hasAVX()) {
11322 // If we have AVX, we can use VPERMILPD which will allow folding a load
11323 // into the shuffle.
11324 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
11325 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11328 return DAG.getNode(
11329 X86ISD::SHUFP, DL, MVT::v2f64,
11330 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11331 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11332 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11334 assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11335 assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!");
11336 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11337 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11339 // When loading a scalar and then shuffling it into a vector we can often do
11340 // the insertion cheaply.
11341 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11342 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11344 // Try inverting the insertion since for v2 masks it is easy to do and we
11345 // can't reliably sort the mask one way or the other.
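// (XORing a v2 mask element with 2 retargets it at the same lane of the other input.)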
11346 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
11347 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
11348 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11349 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11352 // Try to use one of the special instruction patterns to handle two common
11353 // blend patterns if a zero-blend above didn't work.
11354 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
11355 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
11356 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
11357 // We can either use a special instruction to load over the low double or
11358 // to move just the low double.
11359 return DAG.getNode(
11360 X86ISD::MOVSD, DL, MVT::v2f64, V2,
11361 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
11363 if (Subtarget.hasSSE41())
11364 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
11365 Zeroable, Subtarget, DAG))
11368 // Use dedicated unpack instructions for masks that match their pattern.
11370 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
11373 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
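// E.g. the mask <1, 3> takes the high element of each input and encodes as 0b11.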
11374 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
11375 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11378 /// Handle lowering of 2-lane 64-bit integer shuffles.
11380 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
11381 /// the integer unit to minimize domain crossing penalties. However, for blends
11382 /// it falls back to the floating point shuffle operation with appropriate bit casting.
11384 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11385 const APInt &Zeroable,
11386 SDValue V1, SDValue V2,
11387 const X86Subtarget &Subtarget,
11388 SelectionDAG &DAG) {
11389 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11390 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11391 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11393 if (V2.isUndef()) {
11394 // Check for being able to broadcast a single element.
11395 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11396 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11399 // Straight shuffle of a single input vector. For everything from SSE2
11400 // onward this has a single fast instruction with no scary immediates.
11401 // We have to map the mask as it is actually a v4i32 shuffle instruction.
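// E.g. the v2i64 mask <1, 0> becomes the v4i32 PSHUFD mask <2, 3, 0, 1>.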
11402 V1 = DAG.getBitcast(MVT::v4i32, V1);
11403 int WidenedMask[4] = {
11404 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
11405 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
11406 return DAG.getBitcast(
11408 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11409 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
11411 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
11412 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
11413 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11414 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11416 // Try to use shift instructions.
11417 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
11418 Zeroable, Subtarget, DAG))
11421 // When loading a scalar and then shuffling it into a vector we can often do
11422 // the insertion cheaply.
11423 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11424 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11426 // Try inverting the insertion since for v2 masks it is easy to do and we
11427 // can't reliably sort the mask one way or the other.
11428 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
11429 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11430 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11433 // We have different paths for blend lowering, but they all must use the
11434 // *exact* same predicate.
11435 bool IsBlendSupported = Subtarget.hasSSE41();
11436 if (IsBlendSupported)
11437 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11438 Zeroable, Subtarget, DAG))
11441 // Use dedicated unpack instructions for masks that match their pattern.
11443 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11446 // Try to use byte rotation instructions.
11447 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11448 if (Subtarget.hasSSSE3()) {
11449 if (Subtarget.hasVLX())
11450 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11451 Mask, Subtarget, DAG))
11454 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11455 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11459 // If we have direct support for blends, we should lower by decomposing into
11460 // permutes and a blend. That will be faster than the domain cross.
11461 if (IsBlendSupported)
11462 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11465 // We implement this with SHUFPD which is pretty lame because it will likely
11466 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11467 // However, all the alternatives are still more cycles and newer chips don't
11468 // have this problem. It would be really nice if x86 had better shuffles here.
11469 V1 = DAG.getBitcast(MVT::v2f64, V1);
11470 V2 = DAG.getBitcast(MVT::v2f64, V2);
11471 return DAG.getBitcast(MVT::v2i64,
11472 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11475 /// Test whether this can be lowered with a single SHUFPS instruction.
11477 /// This is used to disable more specialized lowerings when the shufps lowering
11478 /// will happen to be efficient.
11479 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11480 // This routine only handles 128-bit shufps.
11481 assert(Mask.size() == 4 && "Unsupported mask size!");
11482 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11483 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11484 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11485 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11487 // To lower with a single SHUFPS we need to have the low half and high half
11488 // each requiring a single input.
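// For example, <0, 1, 6, 7> satisfies this, while <0, 4, 1, 5> does not
// because its low half reads from both inputs.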
11489 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11491 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11497 /// Lower a vector shuffle using the SHUFPS instruction.
11499 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11500 /// It makes no assumptions about whether this is the *best* lowering; it simply uses it.
11502 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11503 ArrayRef<int> Mask, SDValue V1,
11504 SDValue V2, SelectionDAG &DAG) {
11505 SDValue LowV = V1, HighV = V2;
11506 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11508 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11510 if (NumV2Elements == 1) {
11511 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11513 // Compute the index adjacent to V2Index and in the same half by toggling the low bit.
11515 int V2AdjIndex = V2Index ^ 1;
11517 if (Mask[V2AdjIndex] < 0) {
11518 // Handles all the cases where we have a single V2 element and an undef.
11519 // This will only ever happen in the high lanes because we commute the
11520 // vector otherwise.
11522 std::swap(LowV, HighV);
11523 NewMask[V2Index] -= 4;
11525 // Handle the case where the V2 element ends up adjacent to a V1 element.
11526 // To make this work, blend them together as the first step.
11527 int V1Index = V2AdjIndex;
11528 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11529 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11530 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11532 // Now proceed to reconstruct the final blend as we have the necessary
11533 // high or low half formed.
11540 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11541 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11543 } else if (NumV2Elements == 2) {
11544 if (Mask[0] < 4 && Mask[1] < 4) {
11545 // Handle the easy case where we have V1 in the low lanes and V2 in the high lanes.
11549 } else if (Mask[2] < 4 && Mask[3] < 4) {
11550 // We also handle the reversed case because this utility may get called
11551 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11552 // arrange things in the right direction.
11558 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11559 // trying to place elements directly, just blend them and set up the final
11560 // shuffle to place them.
11562 // The first two blend mask elements are for V1, the second two are for V2.
11564 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11565 Mask[2] < 4 ? Mask[2] : Mask[3],
11566 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11567 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11568 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11569 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11571 // Now we do a normal shuffle of V1 by giving V1 as both operands to the final SHUFP.
11574 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11575 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11576 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11577 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11580 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11581 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11584 /// Lower 4-lane 32-bit floating point shuffles.
11586 /// Uses instructions exclusively from the floating point unit to minimize
11587 /// domain crossing penalties, as these are sufficient to implement all v4f32 shuffles.
11589 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11590 const APInt &Zeroable,
11591 SDValue V1, SDValue V2,
11592 const X86Subtarget &Subtarget,
11593 SelectionDAG &DAG) {
11594 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11595 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11596 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11598 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11600 if (NumV2Elements == 0) {
11601 // Check for being able to broadcast a single element.
11602 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11603 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11606 // Use even/odd duplicate instructions for masks that match their pattern.
11607 if (Subtarget.hasSSE3()) {
11608 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11609 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11610 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11611 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11614 if (Subtarget.hasAVX()) {
11615 // If we have AVX, we can use VPERMILPS which will allow folding a load
11616 // into the shuffle.
11617 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11618 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11621 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11622 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11623 if (!Subtarget.hasSSE2()) {
11624 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11625 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11626 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11627 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11630 // Otherwise, use a straight shuffle of a single input vector. We pass the
11631 // input vector to both operands to simulate this with a SHUFPS.
11632 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11633 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11636 // There are special ways we can lower some single-element blends. However, we
11637 // have custom ways we can lower more complex single-element blends below that
11638 // we defer to if both this and BLENDPS fail to match, so restrict this to
11639 // when the V2 input is targeting element 0 of the mask -- that is the fast case here.
11641 if (NumV2Elements == 1 && Mask[0] >= 4)
11642 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11643 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11646 if (Subtarget.hasSSE41()) {
11647 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11648 Zeroable, Subtarget, DAG))
11651 // Use INSERTPS if we can complete the shuffle efficiently.
11653 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11656 if (!isSingleSHUFPSMask(Mask))
11657 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11658 DL, MVT::v4f32, V1, V2, Mask, DAG))
11662 // Use low/high mov instructions. These are only valid in SSE1 because
11663 // otherwise they are widened to v2f64 and never get here.
11664 if (!Subtarget.hasSSE2()) {
11665 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11666 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11667 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11668 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11671 // Use dedicated unpack instructions for masks that match their pattern.
11673 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11676 // Otherwise fall back to a SHUFPS lowering strategy.
11677 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11680 /// Lower 4-lane i32 vector shuffles.
11682 /// We try to handle these with integer-domain shuffles where we can, but for
11683 /// blends we use the floating point domain blend instructions.
11684 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11685 const APInt &Zeroable,
11686 SDValue V1, SDValue V2,
11687 const X86Subtarget &Subtarget,
11688 SelectionDAG &DAG) {
11689 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11690 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11691 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11693 // Whenever we can lower this as a zext, that instruction is strictly faster
11694 // than any alternative. It also allows us to fold memory operands into the
11695 // shuffle in many cases.
11696 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11697 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11700 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11702 if (NumV2Elements == 0) {
11703 // Check for being able to broadcast a single element.
11704 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11705 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11708 // Straight shuffle of a single input vector. For everything from SSE2
11709 // onward this has a single fast instruction with no scary immediates.
11710 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11711 // but we aren't actually going to use the UNPCK instruction because doing
11712 // so prevents folding a load into this instruction or making a copy.
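// (A PSHUFD with the {0, 0, 1, 1} mask gives the same result as UNPCKL of the
// input with itself, and {2, 2, 3, 3} likewise matches UNPCKH.)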
11713 const int UnpackLoMask[] = {0, 0, 1, 1};
11714 const int UnpackHiMask[] = {2, 2, 3, 3};
11715 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11716 Mask = UnpackLoMask;
11717 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11718 Mask = UnpackHiMask;
11720 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11721 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11724 // Try to use shift instructions.
11725 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11726 Zeroable, Subtarget, DAG))
11729 // There are special ways we can lower some single-element blends.
11730 if (NumV2Elements == 1)
11731 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11732 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11735 // We have different paths for blend lowering, but they all must use the
11736 // *exact* same predicate.
11737 bool IsBlendSupported = Subtarget.hasSSE41();
11738 if (IsBlendSupported)
11739 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11740 Zeroable, Subtarget, DAG))
11743 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11747 // Use dedicated unpack instructions for masks that match their pattern.
11749 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11752 // Try to use byte rotation instructions.
11753 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11754 if (Subtarget.hasSSSE3()) {
11755 if (Subtarget.hasVLX())
11756 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11757 Mask, Subtarget, DAG))
11760 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11761 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11765 // Assume that a single SHUFPS is faster than an alternative sequence of
11766 // multiple instructions (even if the CPU has a domain penalty).
11767 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11768 if (!isSingleSHUFPSMask(Mask)) {
11769 // If we have direct support for blends, we should lower by decomposing into
11770 // permutes and a blend. That will be faster than the domain cross.
11771 if (IsBlendSupported)
11772 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11775 // Try to lower by permuting the inputs into an unpack instruction.
11776 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11777 DL, MVT::v4i32, V1, V2, Mask, DAG))
11781 // We implement this with SHUFPS because it can blend from two vectors.
11782 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11783 // up the inputs, bypassing domain shift penalties that we would incur if we
11784 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't an issue.
11786 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11787 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11788 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11789 return DAG.getBitcast(MVT::v4i32, ShufPS);
11792 /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11793 /// shuffle lowering, and the most complex part.
11795 /// The lowering strategy is to try to form pairs of input lanes which are
11796 /// targeted at the same half of the final vector, and then use a dword shuffle
11797 /// to place them onto the right half, and finally unpack the paired lanes into
11798 /// their final position.
11800 /// The exact breakdown of how to form these dword pairs and align them on the
11801 /// correct sides is really tricky. See the comments within the function for
11802 /// more of the details.
11804 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11805 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11806 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11807 /// vector, form the analogous 128-bit 8-element Mask.
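/// For example, the v8i16 mask <4, 5, 6, 7, 0, 1, 2, 3> needs only the dword
/// shuffle (a PSHUFD with mask <2, 3, 0, 1>), whereas masks that split word
/// pairs across the halves first need PSHUFLW/PSHUFHW to group the words into
/// movable dwords.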
11808 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11809 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11810 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11811 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11812 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11814 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11815 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11816 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11818 // Attempt to directly match PSHUFLW or PSHUFHW.
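// (PSHUFLW permutes only the low four words and PSHUFHW only the high four,
// leaving the other half untouched, hence the range checks below.)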
11819 if (isUndefOrInRange(LoMask, 0, 4) &&
11820 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11821 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11822 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11824 if (isUndefOrInRange(HiMask, 4, 8) &&
11825 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11826 for (int i = 0; i != 4; ++i)
11827 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11828 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11829 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11832 SmallVector<int, 4> LoInputs;
11833 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11834 array_pod_sort(LoInputs.begin(), LoInputs.end());
11835 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11836 SmallVector<int, 4> HiInputs;
11837 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11838 array_pod_sort(HiInputs.begin(), HiInputs.end());
11839 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11841 int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11842 int NumHToL = LoInputs.size() - NumLToL;
11844 int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11845 int NumHToH = HiInputs.size() - NumLToH;
11846 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11847 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11848 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11849 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11851 // If we are shuffling values from one half - check how many different DWORD
11852 // pairs we need to create. If only 1 or 2 then we can perform this as a
11853 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11854 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11855 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11856 V = DAG.getNode(ShufWOp, DL, VT, V,
11857 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11858 V = DAG.getBitcast(PSHUFDVT, V);
11859 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11860 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11861 return DAG.getBitcast(VT, V);
11864 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11865 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11866 SmallVector<std::pair<int, int>, 4> DWordPairs;
11867 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
11869 // Collect the different DWORD pairs.
11870 for (int DWord = 0; DWord != 4; ++DWord) {
11871 int M0 = Mask[2 * DWord + 0];
11872 int M1 = Mask[2 * DWord + 1];
11873 M0 = (M0 >= 0 ? M0 % 4 : M0);
11874 M1 = (M1 >= 0 ? M1 % 4 : M1);
11875 if (M0 < 0 && M1 < 0)
11878 bool Match = false;
11879 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11880 auto &DWordPair = DWordPairs[j];
11881 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11882 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11883 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11884 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11885 PSHUFDMask[DWord] = DOffset + j;
11891 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11892 DWordPairs.push_back(std::make_pair(M0, M1));
11896 if (DWordPairs.size() <= 2) {
11897 DWordPairs.resize(2, std::make_pair(-1, -1));
11898 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11899 DWordPairs[1].first, DWordPairs[1].second};
11900 if ((NumHToL + NumHToH) == 0)
11901 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11902 if ((NumLToL + NumLToH) == 0)
11903 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11907 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11908 // such inputs we can swap two of the dwords across the half mark and end up
11909 // with <=2 inputs to each half in each half. Once there, we can fall through
11910 // to the generic code below. For example:
11912 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11913 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11915 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11916 // and an existing 2-into-2 on the other half. In this case we may have to
11917 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11918 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11919 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11920 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11921 // half than the one we target for fixing) will be fixed when we re-enter this
11922 // path. We will also combine away any sequence of PSHUFD instructions that
11923 // result into a single instruction. Here is an example of the tricky case:
11925 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11926 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11928 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11930 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11931 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11933 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11934 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11936 // The result is fine to be handled by the generic logic.
11937 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11938 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11939 int AOffset, int BOffset) {
11940 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11941 "Must call this with A having 3 or 1 inputs from the A half.");
11942 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11943 "Must call this with B having 1 or 3 inputs from the B half.");
11944 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11945 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11947 bool ThreeAInputs = AToAInputs.size() == 3;
11949 // Compute the index of the dword with only one word among the three inputs in
11950 // a half by taking the sum of the half with three inputs and subtracting
11951 // the sum of the actual three inputs. The difference is the remaining slot.
11953 int ADWord, BDWord;
11954 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11955 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11956 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11957 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11958 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11959 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11960 int TripleNonInputIdx =
11961 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11962 TripleDWord = TripleNonInputIdx / 2;
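// E.g. if the high half's three inputs are words {4, 5, 7}, the sum trick
// gives (6 + 16) - 16 = 6, so the lone remaining word is 6 and its dword is 3.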
11964 // We use xor with one to compute the adjacent DWord to whichever one the OneInput is in.
11966 OneInputDWord = (OneInput / 2) ^ 1;
11968 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11969 // and BToA inputs. If there is also such a problem with the BToB and AToB
11970 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11971 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11972 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11973 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11974 // Compute how many inputs will be flipped by swapping these DWords. We need
11976 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
11978 int NumFlippedAToBInputs =
11979 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11980 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11981 int NumFlippedBToBInputs =
11982 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11983 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11984 if ((NumFlippedAToBInputs == 1 &&
11985 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11986 (NumFlippedBToBInputs == 1 &&
11987 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11988 // We choose whether to fix the A half or B half based on whether that
11989 // half has zero flipped inputs. At zero, we may not be able to fix it
11990 // with that half. We also bias towards fixing the B half because that
11991 // will more commonly be the high half, and we have to bias one way.
11992 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11993 ArrayRef<int> Inputs) {
11994 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11995 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11996 // Determine whether the free index is in the flipped dword or the
11997 // unflipped dword based on where the pinned index is. We use this bit
11998 // in an xor to conditionally select the adjacent dword.
11999 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
12000 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12001 if (IsFixIdxInput == IsFixFreeIdxInput)
12003 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
12004 assert(IsFixIdxInput != IsFixFreeIdxInput &&
12005 "We need to be changing the number of flipped inputs!");
12006 int PSHUFHalfMask[] = {0, 1, 2, 3};
12007 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
12009 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
12010 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
12011 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
12013 for (int &M : Mask)
12014 if (M >= 0 && M == FixIdx)
12016 else if (M >= 0 && M == FixFreeIdx)
12019 if (NumFlippedBToBInputs != 0) {
12021 int BPinnedIdx = BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
12022 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
12024 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
12025 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
12026 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
12031 int PSHUFDMask[] = {0, 1, 2, 3};
12032 PSHUFDMask[ADWord] = BDWord;
12033 PSHUFDMask[BDWord] = ADWord;
12034 V = DAG.getBitcast(
12036 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12037 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12039 // Adjust the mask to match the new locations of A and B.
12040 for (int &M : Mask)
12041 if (M >= 0 && M/2 == ADWord)
12042 M = 2 * BDWord + M % 2;
12043 else if (M >= 0 && M/2 == BDWord)
12044 M = 2 * ADWord + M % 2;
12046 // Recurse back into this routine to re-compute state now that this isn't
12047 // a 3 and 1 problem.
12048 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
12051 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
12052 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
12053 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
12054 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
12056 // At this point there are at most two inputs to the low and high halves from
12057 // each half. That means the inputs can always be grouped into dwords and
12058 // those dwords can then be moved to the correct half with a dword shuffle.
12059 // We use at most one low and one high word shuffle to collect these paired
12060 // inputs into dwords, and finally a dword shuffle to place them.
12061 int PSHUFLMask[4] = {-1, -1, -1, -1};
12062 int PSHUFHMask[4] = {-1, -1, -1, -1};
12063 int PSHUFDMask[4] = {-1, -1, -1, -1};
12065 // First fix the masks for all the inputs that are staying in their
12066 // original halves. This will then dictate the targets of the cross-half shuffles.
12068 auto fixInPlaceInputs =
12069 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
12070 MutableArrayRef<int> SourceHalfMask,
12071 MutableArrayRef<int> HalfMask, int HalfOffset) {
12072 if (InPlaceInputs.empty())
12074 if (InPlaceInputs.size() == 1) {
12075 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12076 InPlaceInputs[0] - HalfOffset;
12077 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
12080 if (IncomingInputs.empty()) {
12081 // Just fix all of the in place inputs.
12082 for (int Input : InPlaceInputs) {
12083 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
12084 PSHUFDMask[Input / 2] = Input / 2;
12089 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
12090 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
12091 InPlaceInputs[0] - HalfOffset;
12092 // Put the second input next to the first so that they are packed into
12093 // a dword. We find the adjacent index by toggling the low bit.
12094 int AdjIndex = InPlaceInputs[0] ^ 1;
12095 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
12096 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
12097 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
12099 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
12100 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
12102 // Now gather the cross-half inputs and place them into a free dword of
12103 // their target half.
12104 // FIXME: This operation could almost certainly be simplified dramatically to
12105 // look more like the 3-1 fixing operation.
12106 auto moveInputsToRightHalf = [&PSHUFDMask](
12107 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
12108 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
12109 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
12111 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
12112 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
12114 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, int Word) {
12116 int LowWord = Word & ~1;
12117 int HighWord = Word | 1;
12118 return isWordClobbered(SourceHalfMask, LowWord) ||
12119 isWordClobbered(SourceHalfMask, HighWord);
12122 if (IncomingInputs.empty())
12125 if (ExistingInputs.empty()) {
12126 // Map any dwords with inputs from them into the right half.
12127 for (int Input : IncomingInputs) {
12128 // If the source half mask maps over the inputs, turn those into
12129 // swaps and use the swapped lane.
12130 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
12131 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
12132 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
12133 Input - SourceOffset;
12134 // We have to swap the uses in our half mask in one sweep.
12135 for (int &M : HalfMask)
12136 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
12138 else if (M == Input)
12139 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12141 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
12142 Input - SourceOffset &&
12143 "Previous placement doesn't match!");
12145 // Note that this correctly re-maps both when we do a swap and when
12146 // we observe the other side of the swap above. We rely on that to
12147 // avoid swapping the members of the input list directly.
12148 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
12151 // Map the input's dword into the correct half.
12152 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
12153 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
12155 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
12157 "Previous placement doesn't match!");
12160 // And just directly shift any other-half mask elements to be same-half
12161 // as we will have mirrored the dword containing the element into the
12162 // same position within that half.
12163 for (int &M : HalfMask)
12164 if (M >= SourceOffset && M < SourceOffset + 4) {
12165 M = M - SourceOffset + DestOffset;
12166 assert(M >= 0 && "This should never wrap below zero!");
12171 // Ensure we have the input in a viable dword of its current half. This
12172 // is particularly tricky because the original position may be clobbered
12173 // by inputs being moved and *staying* in that half.
12174 if (IncomingInputs.size() == 1) {
12175 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12176 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + SourceOffset;
12178 SourceHalfMask[InputFixed - SourceOffset] =
12179 IncomingInputs[0] - SourceOffset;
12180 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12182 IncomingInputs[0] = InputFixed;
12184 } else if (IncomingInputs.size() == 2) {
12185 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12186 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12187 // We have two non-adjacent or clobbered inputs we need to extract from
12188 // the source half. To do this, we need to map them into some adjacent
12189 // dword slot in the source mask.
12190 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12191 IncomingInputs[1] - SourceOffset};
12193 // If there is a free slot in the source half mask adjacent to one of
12194 // the inputs, place the other input in it. We use (Index XOR 1) to
12195 // compute an adjacent index.
12196 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12197 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12198 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12199 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12200 InputsFixed[1] = InputsFixed[0] ^ 1;
12201 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12202 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12203 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12204 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12205 InputsFixed[0] = InputsFixed[1] ^ 1;
12206 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12207 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12208 // The two inputs are in the same DWord but it is clobbered and the
12209 // adjacent DWord isn't used at all. Move both inputs to the free slot.
12211 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12212 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12213 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12214 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12216 // The only way we hit this point is if there is no clobbering
12217 // (because there are no off-half inputs to this half) and there is no
12218 // free slot adjacent to one of the inputs. In this case, we have to
12219 // swap an input with a non-input.
12220 for (int i = 0; i < 4; ++i)
12221 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
12222 "We can't handle any clobbers here!");
12223 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
12224 "Cannot have adjacent inputs here!");
12226 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12227 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
12229 // We also have to update the final source mask in this case because
12230 // it may need to undo the above swap.
12231 for (int &M : FinalSourceHalfMask)
12232 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
12233 M = InputsFixed[1] + SourceOffset;
12234 else if (M == InputsFixed[1] + SourceOffset)
12235 M = (InputsFixed[0] ^ 1) + SourceOffset;
12237 InputsFixed[1] = InputsFixed[0] ^ 1;
12240 // Point everything at the fixed inputs.
12241 for (int &M : HalfMask)
12242 if (M == IncomingInputs[0])
12243 M = InputsFixed[0] + SourceOffset;
12244 else if (M == IncomingInputs[1])
12245 M = InputsFixed[1] + SourceOffset;
12247 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
12248 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
12251 llvm_unreachable("Unhandled input size!");
12254 // Now hoist the DWord down to the right half.
12255 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
12256 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
12257 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
12258 for (int &M : HalfMask)
12259 for (int Input : IncomingInputs)
12261 M = FreeDWord * 2 + Input % 2;
12263 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
12264 /*SourceOffset*/ 4, /*DestOffset*/ 0);
12265 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
12266 /*SourceOffset*/ 0, /*DestOffset*/ 4);
12268 // Now enact all the shuffles we've computed to move the inputs into their target halves.
12270 if (!isNoopShuffleMask(PSHUFLMask))
12271 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12272 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
12273 if (!isNoopShuffleMask(PSHUFHMask))
12274 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12275 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
12276 if (!isNoopShuffleMask(PSHUFDMask))
12277 V = DAG.getBitcast(
12279 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12280 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12282 // At this point, each half should contain all its inputs, and we can then
12283 // just shuffle them into their final position.
12284 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
12285 "Failed to lift all the high half inputs to the low mask!");
12286 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
12287 "Failed to lift all the low half inputs to the high mask!");
12289 // Do a half shuffle for the low mask.
12290 if (!isNoopShuffleMask(LoMask))
12291 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12292 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12294 // Do a half shuffle with the high mask after shifting its values down.
12295 for (int &M : HiMask) if (M >= 0) M -= 4;
12298 if (!isNoopShuffleMask(HiMask))
12299 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12300 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12305 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12306 /// blend if only one input is used.
12307 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
12308 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12309 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
12311 SDValue V1Mask[16];
12312 SDValue V2Mask[16];
12316 int Size = Mask.size();
12317 int Scale = 16 / Size;
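// Each shuffle mask element expands to Scale byte selectors; e.g. a v8i16
// shuffle yields two PSHUFB byte indices per word.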
12318 for (int i = 0; i < 16; ++i) {
12319 if (Mask[i / Scale] < 0) {
12320 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
12322 const int ZeroMask = 0x80;
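// PSHUFB zeroes a destination byte whenever bit 7 of its selector is set,
// so 0x80 acts as the "write a zero" index.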
12323 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
12325 int V2Idx = Mask[i / Scale] < Size
12327 : (Mask[i / Scale] - Size) * Scale + i % Scale;
12328 if (Zeroable[i / Scale])
12329 V1Idx = V2Idx = ZeroMask;
12330 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
12331 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
12332 V1InUse |= (ZeroMask != V1Idx);
12333 V2InUse |= (ZeroMask != V2Idx);
12338 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12339 DAG.getBitcast(MVT::v16i8, V1),
12340 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
12342 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12343 DAG.getBitcast(MVT::v16i8, V2),
12344 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
12346 // If we need shuffled inputs from both, blend the two.
12348 if (V1InUse && V2InUse)
12349 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
12351 V = V1InUse ? V1 : V2;
12353 // Cast the result back to the correct type.
12354 return DAG.getBitcast(VT, V);
12357 /// Generic lowering of 8-lane i16 shuffles.
12359 /// This handles both single-input shuffles and combined shuffle/blends with
12360 /// two inputs. The single input shuffles are immediately delegated to
12361 /// a dedicated lowering routine.
12363 /// The blends are lowered in one of three fundamental ways. If there are few
12364 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
12365 /// of the input is significantly cheaper when lowered as an interleaving of
12366 /// the two inputs, try to interleave them. Otherwise, blend the low and high
12367 /// halves of the inputs separately (making them have relatively few inputs)
12368 /// and then concatenate them.
12369 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12370 const APInt &Zeroable,
12371 SDValue V1, SDValue V2,
12372 const X86Subtarget &Subtarget,
12373 SelectionDAG &DAG) {
12374 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12375 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12376 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12378 // Whenever we can lower this as a zext, that instruction is strictly faster
12379 // than any alternative.
12380 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12381 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12384 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
12386 if (NumV2Inputs == 0) {
12387 // Check for being able to broadcast a single element.
12388 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12389 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12392 // Try to use shift instructions.
12393 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
12394 Zeroable, Subtarget, DAG))
12397 // Use dedicated unpack instructions for masks that match their pattern.
12399 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12402 // Use dedicated pack instructions for masks that match their pattern.
12403 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
12407 // Try to use byte rotation instructions.
12408 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
12409 Mask, Subtarget, DAG))
12412 // Make a copy of the mask so it can be modified.
12413 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
12414 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
12415 MutableMask, Subtarget,
12419 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12420 "All single-input shuffles should be canonicalized to be V1-input "
12423 // Try to use shift instructions.
12424 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12425 Zeroable, Subtarget, DAG))
12428 // See if we can use SSE4A Extraction / Insertion.
12429 if (Subtarget.hasSSE4A())
12430 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12434 // There are special ways we can lower some single-element blends.
12435 if (NumV2Inputs == 1)
12436 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12437 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12440 // We have different paths for blend lowering, but they all must use the
12441 // *exact* same predicate.
12442 bool IsBlendSupported = Subtarget.hasSSE41();
12443 if (IsBlendSupported)
12444 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12445 Zeroable, Subtarget, DAG))
12448 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12452 // Use dedicated unpack instructions for masks that match their pattern.
12454 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12457 // Use dedicated pack instructions for masks that match their pattern.
12458 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12462 // Try to use byte rotation instructions.
12463 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12464 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12467 if (SDValue BitBlend =
12468 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12471 // Try to lower by permuting the inputs into an unpack instruction.
12472 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12476 // If we can't directly blend but can use PSHUFB, that will be better as it
12477 // can both shuffle and set up the inefficient blend.
12478 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12479 bool V1InUse, V2InUse;
12480 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12481 Zeroable, DAG, V1InUse, V2InUse);
12484 // We can always bit-blend if we have to so the fallback strategy is to
12485 // decompose into single-input permutes and blends.
12486 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12490 /// Check whether a compaction lowering can be done by dropping even
12491 /// elements and compute how many times even elements must be dropped.
12493 /// This handles shuffles which take every Nth element where N is a power of
12494 /// two. Example shuffle masks:
12496 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12497 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12498 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12499 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12500 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12501 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12503 /// Any of these lanes can of course be undef.
12505 /// This routine only supports N <= 3.
12506 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12509 /// \returns N above (the number of times even elements must be dropped) if
12510 /// the mask matches such a pattern; otherwise returns zero.
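///
/// Illustrative check: for the two-input mask
/// <0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28> the modulus is
/// 32 and every entry satisfies Mask[i] == (i << 2) & 31, so the routine
/// returns N = 2: dropping the even elements twice (an element stride of 4)
/// recovers the inputs.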
12511 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12512 bool IsSingleInput) {
12513 // The modulus for the shuffle vector entries is based on whether this is
12514 // a single input or not.
12515 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12516 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12517 "We should only be called with masks with a power-of-2 size!");
12519 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
12521 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12522 // and 2^3 simultaneously. This is because we may have ambiguity with
12523 // partially undef inputs.
12524 bool ViableForN[3] = {true, true, true};
12526 for (int i = 0, e = Mask.size(); i < e; ++i) {
12527 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12532 bool IsAnyViable = false;
12533 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12534 if (ViableForN[j]) {
12535 uint64_t N = j + 1;
12537 // The shuffle mask must be equal to (i * 2^N) % M.
12538 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12539 IsAnyViable = true;
12541 ViableForN[j] = false;
12543 // Early exit if we exhaust the possible powers of two.
12548 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12552 // Return 0 as there is no viable power of two.
12556 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12557 ArrayRef<int> Mask, SDValue V1,
12558 SDValue V2, SelectionDAG &DAG) {
12559 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12560 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12562 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12564 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12566 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12569 /// Generic lowering of v16i8 shuffles.
12571 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12572 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12573 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12574 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12575 /// back together.
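///
/// Illustration of that fallback (at the bottom of this function): UNPCKL and
/// UNPCKH of V against a zero vector zero-extend the low and high eight bytes
/// into two v8i16 halves, each half is shuffled with the v8i16 lowering, and a
/// final PACKUS truncates the results back to v16i8.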
12576 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12577 const APInt &Zeroable,
12578 SDValue V1, SDValue V2,
12579 const X86Subtarget &Subtarget,
12580 SelectionDAG &DAG) {
12581 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12582 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12583 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12585 // Try to use shift instructions.
12586 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12587 Zeroable, Subtarget, DAG))
12590 // Try to use byte rotation instructions.
12591 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12592 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12595 // Use dedicated pack instructions for masks that match their pattern.
12596 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12600 // Try to use a zext lowering.
12601 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12602 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12605 // See if we can use SSE4A Extraction / Insertion.
12606 if (Subtarget.hasSSE4A())
12607 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12611 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12613 // For single-input shuffles, there are some nicer lowering tricks we can use.
12614 if (NumV2Elements == 0) {
12615 // Check for being able to broadcast a single element.
12616 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12617 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12620 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12621 // Notably, this handles splat and partial-splat shuffles more efficiently.
12622 // However, it only makes sense if the pre-duplication shuffle simplifies
12623 // things significantly. Currently, this means we need to be able to
12624 // express the pre-duplication shuffle as an i16 shuffle.
12626 // FIXME: We should check for other patterns which can be widened into an
12627 // i16 shuffle as well.
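// For instance (illustrative), Mask = <0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7>
// passes the check below: once the (here trivial) pre-duplication i16
// shuffle has been done, the whole shuffle is just an UNPCKL of V1 with
// itself.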
12628 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12629 for (int i = 0; i < 16; i += 2)
12630 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12635 auto tryToWidenViaDuplication = [&]() -> SDValue {
12636 if (!canWidenViaDuplication(Mask))
12638 SmallVector<int, 4> LoInputs;
12639 copy_if(Mask, std::back_inserter(LoInputs),
12640 [](int M) { return M >= 0 && M < 8; });
12641 array_pod_sort(LoInputs.begin(), LoInputs.end());
12642 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12644 SmallVector<int, 4> HiInputs;
12645 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12646 array_pod_sort(HiInputs.begin(), HiInputs.end());
12647 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12650 bool TargetLo = LoInputs.size() >= HiInputs.size();
12651 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12652 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12654 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12655 SmallDenseMap<int, int, 8> LaneMap;
12656 for (int I : InPlaceInputs) {
12657 PreDupI16Shuffle[I/2] = I/2;
12660 int j = TargetLo ? 0 : 4, je = j + 4;
12661 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12662 // Check if j is already a shuffle of this input. This happens when
12663 // there are two adjacent bytes after we move the low one.
12664 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12665 // If we haven't yet mapped the input, search for a slot into which
12667 while (j < je && PreDupI16Shuffle[j] >= 0)
12671 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12674 // Map this input with the i16 shuffle.
12675 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12678 // Update the lane map based on the mapping we ended up with.
12679 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12681 V1 = DAG.getBitcast(
12683 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12684 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12686 // Unpack the bytes to form the i16s that will be shuffled into place.
12687 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12688 MVT::v16i8, V1, V1);
12690 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12691 for (int i = 0; i < 16; ++i)
12692 if (Mask[i] >= 0) {
12693 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12694 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12695 if (PostDupI16Shuffle[i / 2] < 0)
12696 PostDupI16Shuffle[i / 2] = MappedMask;
12698 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12699 "Conflicting entries in the original shuffle!");
12701 return DAG.getBitcast(
12703 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12704 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12706 if (SDValue V = tryToWidenViaDuplication())
12710 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12714 // Use dedicated unpack instructions for masks that match their pattern.
12716 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12719 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12720 // with PSHUFB. It is important to do this before we attempt to generate any
12721 // blends but after all of the single-input lowerings. If the single input
12722 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12723 // want to preserve that and we can DAG combine any longer sequences into
12724 // a PSHUFB in the end. But once we start blending from multiple inputs,
12725 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12726 // and there are *very* few patterns that would actually be faster than the
12727 // PSHUFB approach because of its ability to zero lanes.
12729 // FIXME: The only exceptions to the above are blends which are exact
12730 // interleavings with direct instructions supporting them. We currently don't
12731 // handle those well here.
12732 if (Subtarget.hasSSSE3()) {
12733 bool V1InUse = false;
12734 bool V2InUse = false;
12736 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12737 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12739 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12740 // do so. This avoids using them to handle blends-with-zero which is
12741 // important as a single pshufb is significantly faster for that.
12742 if (V1InUse && V2InUse) {
12743 if (Subtarget.hasSSE41())
12744 if (SDValue Blend = lowerVectorShuffleAsBlend(
12745 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12748 // We can use an unpack to do the blending rather than an OR in some
12749 // cases. Even though the OR may be (very marginally) more efficient, we
12750 // prefer this lowering because there are common cases where part of
12751 // the complexity of the shuffles goes away when we do the final blend as
12752 // an unpack.
12753 // FIXME: It might be worth trying to detect if the unpack-feeding
12754 // shuffles will both be pshufb, in which case we shouldn't bother with
12756 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12757 DL, MVT::v16i8, V1, V2, Mask, DAG))
12760 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12761 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12762 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12768 // There are special ways we can lower some single-element blends.
12769 if (NumV2Elements == 1)
12770 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12771 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12774 if (SDValue BitBlend =
12775 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12778 // Check whether a compaction lowering can be done. This handles shuffles
12779 // which take every Nth element for some power-of-two N. See the helper
12780 // function for details.
12782 // We special case these as they can be particularly efficiently handled with
12783 // the PACKUSWB instruction on x86 and they show up in common patterns of
12784 // rearranging bytes to truncate wide elements.
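//
// For illustration: a single-input mask that takes every other byte, e.g.
// <0, 2, 4, ..., 14, 0, 2, ..., 14>, has NumEvenDrops == 1, so we AND V1 with
// a v8i16 splat of 0x00FF to clear the odd bytes and a single PACKUS then
// produces the compacted result.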
12785 bool IsSingleInput = V2.isUndef();
12786 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12787 // NumEvenDrops is the log2 of the element stride. Another way of
12788 // thinking about it is that we need to drop the even elements this many
12789 // times to get the original input.
12791 // First we need to zero all the dropped bytes.
12792 assert(NumEvenDrops <= 3 &&
12793 "No support for dropping even elements more than 3 times.");
12794 // We use the mask type to pick which bytes are preserved based on how many
12795 // elements are dropped.
12796 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12797 SDValue ByteClearMask = DAG.getBitcast(
12798 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12799 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12800 if (!IsSingleInput)
12801 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12803 // Now pack things back together.
12804 V1 = DAG.getBitcast(MVT::v8i16, V1);
12805 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12806 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12807 for (int i = 1; i < NumEvenDrops; ++i) {
12808 Result = DAG.getBitcast(MVT::v8i16, Result);
12809 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12815 // Handle multi-input cases by blending single-input shuffles.
12816 if (NumV2Elements > 0)
12817 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12820 // The fallback path for single-input shuffles widens this into two v8i16
12821 // vectors with unpacks, shuffles those, and then pulls them back together
12822 // with a pack.
12825 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12826 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12827 for (int i = 0; i < 16; ++i)
12829 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12831 SDValue VLoHalf, VHiHalf;
12832 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12833 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12834 // i16s.
12835 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12836 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12837 // Use a mask to drop the high bytes.
12838 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12839 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12840 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12842 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12843 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12845 // Squash the masks to point directly into VLoHalf.
12846 for (int &M : LoBlendMask)
12849 for (int &M : HiBlendMask)
12853 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12854 // VHiHalf so that we can blend them as i16s.
12855 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12857 VLoHalf = DAG.getBitcast(
12858 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12859 VHiHalf = DAG.getBitcast(
12860 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12863 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12864 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12866 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12869 /// Dispatching routine to lower various 128-bit x86 vector shuffles.
12871 /// This routine breaks down the specific type of 128-bit shuffle and
12872 /// dispatches to the lowering routines accordingly.
12873 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12874 MVT VT, SDValue V1, SDValue V2,
12875 const APInt &Zeroable,
12876 const X86Subtarget &Subtarget,
12877 SelectionDAG &DAG) {
12878 switch (VT.SimpleTy) {
12880 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12882 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12884 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12886 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12888 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12890 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12893 llvm_unreachable("Unimplemented!");
12897 /// Generic routine to split vector shuffle into half-sized shuffles.
12899 /// This routine just extracts two subvectors, shuffles them independently, and
12900 /// then concatenates them back together. This should work effectively with all
12901 /// AVX vector shuffle types.
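///
/// For instance (illustrative), the v8i32 interleave Mask = <0,8,1,9,6,14,7,15>
/// splits into the v4i32 half shuffles <0,4,1,5> on the low halves and
/// <2,6,3,7> on the high halves, and the two results are concatenated back
/// into a v8i32 value.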
12902 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12903 SDValue V2, ArrayRef<int> Mask,
12904 SelectionDAG &DAG) {
12905 assert(VT.getSizeInBits() >= 256 &&
12906 "Only for 256-bit or wider vector shuffles!");
12907 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12908 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12910 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12911 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12913 int NumElements = VT.getVectorNumElements();
12914 int SplitNumElements = NumElements / 2;
12915 MVT ScalarVT = VT.getVectorElementType();
12916 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12918 // Rather than splitting build-vectors, just build two narrower build
12919 // vectors. This helps shuffling with splats and zeros.
12920 auto SplitVector = [&](SDValue V) {
12921 V = peekThroughBitcasts(V);
12923 MVT OrigVT = V.getSimpleValueType();
12924 int OrigNumElements = OrigVT.getVectorNumElements();
12925 int OrigSplitNumElements = OrigNumElements / 2;
12926 MVT OrigScalarVT = OrigVT.getVectorElementType();
12927 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12931 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12933 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12934 DAG.getIntPtrConstant(0, DL));
12935 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12936 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12939 SmallVector<SDValue, 16> LoOps, HiOps;
12940 for (int i = 0; i < OrigSplitNumElements; ++i) {
12941 LoOps.push_back(BV->getOperand(i));
12942 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12944 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12945 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12947 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12948 DAG.getBitcast(SplitVT, HiV));
12951 SDValue LoV1, HiV1, LoV2, HiV2;
12952 std::tie(LoV1, HiV1) = SplitVector(V1);
12953 std::tie(LoV2, HiV2) = SplitVector(V2);
12955 // Now create two 4-way blends of these half-width vectors.
12956 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12957 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12958 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12959 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12960 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12961 for (int i = 0; i < SplitNumElements; ++i) {
12962 int M = HalfMask[i];
12963 if (M >= NumElements) {
12964 if (M >= NumElements + SplitNumElements)
12968 V2BlendMask[i] = M - NumElements;
12969 BlendMask[i] = SplitNumElements + i;
12970 } else if (M >= 0) {
12971 if (M >= SplitNumElements)
12975 V1BlendMask[i] = M;
12980 // Because the lowering happens after all combining takes place, we need to
12981 // manually combine these blend masks as much as possible so that we create
12982 // a minimal number of high-level vector shuffle nodes.
12984 // First try just blending the halves of V1 or V2.
12985 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12986 return DAG.getUNDEF(SplitVT);
12987 if (!UseLoV2 && !UseHiV2)
12988 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12989 if (!UseLoV1 && !UseHiV1)
12990 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12992 SDValue V1Blend, V2Blend;
12993 if (UseLoV1 && UseHiV1) {
12995 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12997 // We only use half of V1 so map the usage down into the final blend mask.
12998 V1Blend = UseLoV1 ? LoV1 : HiV1;
12999 for (int i = 0; i < SplitNumElements; ++i)
13000 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
13001 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
13003 if (UseLoV2 && UseHiV2) {
13005 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
13007 // We only use half of V2 so map the usage down into the final blend mask.
13008 V2Blend = UseLoV2 ? LoV2 : HiV2;
13009 for (int i = 0; i < SplitNumElements; ++i)
13010 if (BlendMask[i] >= SplitNumElements)
13011 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
13013 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
13015 SDValue Lo = HalfBlend(LoMask);
13016 SDValue Hi = HalfBlend(HiMask);
13017 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
13020 /// Either split a vector in halves or decompose the shuffles and the blend.
13023 /// This is provided as a good fallback for many lowerings of non-single-input
13024 /// shuffles with more than one 128-bit lane. In those cases, we want to select
13025 /// between splitting the shuffle into 128-bit components and stitching those
13026 /// back together vs. extracting the single-input shuffles and blending those
13027 /// results.
13028 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
13029 SDValue V1, SDValue V2,
13030 ArrayRef<int> Mask,
13031 SelectionDAG &DAG) {
13032 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
13033 "shuffles as it could then recurse on itself.");
13034 int Size = Mask.size();
13036 // If this can be modeled as a broadcast of two elements followed by a blend,
13037 // prefer that lowering. This is especially important because broadcasts can
13038 // often fold with memory operands.
13039 auto DoBothBroadcast = [&] {
13040 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
13043 if (V2BroadcastIdx < 0)
13044 V2BroadcastIdx = M - Size;
13045 else if (M - Size != V2BroadcastIdx)
13047 } else if (M >= 0) {
13048 if (V1BroadcastIdx < 0)
13049 V1BroadcastIdx = M;
13050 else if (M != V1BroadcastIdx)
13055 if (DoBothBroadcast())
13056 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
13059 // If the inputs all stem from a single 128-bit lane of each input, then we
13060 // split them rather than blending because the split will decompose to
13061 // unusually few instructions.
13062 int LaneCount = VT.getSizeInBits() / 128;
13063 int LaneSize = Size / LaneCount;
13064 SmallBitVector LaneInputs[2];
13065 LaneInputs[0].resize(LaneCount, false);
13066 LaneInputs[1].resize(LaneCount, false);
13067 for (int i = 0; i < Size; ++i)
13069 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
13070 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
13071 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13073 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
13074 // that the decomposed single-input shuffles don't end up here.
13075 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
13078 /// Lower a vector shuffle crossing multiple 128-bit lanes as
13079 /// a permutation and blend of those lanes.
13081 /// This essentially blends the out-of-lane inputs to each lane into the lane
13082 /// from a permuted copy of the vector. This lowering strategy results in four
13083 /// instructions in the worst case for a single-input cross lane shuffle which
13084 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
13085 /// of. Special cases for each particular shuffle pattern should be handled
13086 /// prior to trying this lowering.
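///
/// Illustrative single-input v4f64 example: for Mask = <2, 1, 3, 0>, elements
/// 0 and 3 cross lanes. The vector is "flipped" by swapping its 128-bit
/// halves, and the final blend mask becomes <4, 1, 3, 6>, taking the in-lane
/// elements from V1 and the formerly cross-lane elements from the flipped
/// copy.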
13087 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
13088 SDValue V1, SDValue V2,
13089 ArrayRef<int> Mask,
13091 const X86Subtarget &Subtarget) {
13092 // FIXME: This should probably be generalized for 512-bit vectors as well.
13093 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
13094 int Size = Mask.size();
13095 int LaneSize = Size / 2;
13097 // If there are only inputs from one 128-bit lane, splitting will in fact be
13098 // less expensive. The flags track whether the given lane contains an element
13099 // that crosses to another lane.
13100 if (!Subtarget.hasAVX2()) {
13101 bool LaneCrossing[2] = {false, false};
13102 for (int i = 0; i < Size; ++i)
13103 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
13104 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
13105 if (!LaneCrossing[0] || !LaneCrossing[1])
13106 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13108 bool LaneUsed[2] = {false, false};
13109 for (int i = 0; i < Size; ++i)
13111 LaneUsed[(Mask[i] / LaneSize)] = true;
13112 if (!LaneUsed[0] || !LaneUsed[1])
13113 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13116 assert(V2.isUndef() &&
13117 "This last part of this routine only works on single input shuffles");
13119 SmallVector<int, 32> FlippedBlendMask(Size);
13120 for (int i = 0; i < Size; ++i)
13121 FlippedBlendMask[i] =
13122 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
13124 : Mask[i] % LaneSize +
13125 (i / LaneSize) * LaneSize + Size);
13127 // Flip the vector, and blend the results which should now be in-lane.
13128 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
13129 SDValue Flipped = DAG.getBitcast(PVT, V1);
13130 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
13132 Flipped = DAG.getBitcast(VT, Flipped);
13133 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
13136 /// Handle lowering 2-lane 128-bit shuffles.
13137 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
13138 SDValue V2, ArrayRef<int> Mask,
13139 const APInt &Zeroable,
13140 const X86Subtarget &Subtarget,
13141 SelectionDAG &DAG) {
13142 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
13143 if (Subtarget.hasAVX2() && V2.isUndef())
13146 SmallVector<int, 4> WidenedMask;
13147 if (!canWidenShuffleElements(Mask, WidenedMask))
13150 bool IsLowZero = (Zeroable & 0x3) == 0x3;
13151 bool IsHighZero = (Zeroable & 0xc) == 0xc;
13153 // Try to use an insert into a zero vector.
13154 if (WidenedMask[0] == 0 && IsHighZero) {
13155 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13156 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13157 DAG.getIntPtrConstant(0, DL));
13158 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
13159 getZeroVector(VT, Subtarget, DAG, DL), LoV,
13160 DAG.getIntPtrConstant(0, DL));
13163 // TODO: If minimizing size and one of the inputs is a zero vector and the
13164 // zero vector has only one use, we could use a VPERM2X128 to save the
13165 // instruction bytes needed to explicitly generate the zero vector.
13167 // Blends are faster and handle all the non-lane-crossing cases.
13168 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
13169 Zeroable, Subtarget, DAG))
13172 // If either input operand is a zero vector, use VPERM2X128 because its mask
13173 // allows us to replace the zero input with an implicit zero.
13174 if (!IsLowZero && !IsHighZero) {
13175 // Check for patterns which can be matched with a single insert of a 128-bit
13176 // subvector.
13177 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13178 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13180 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13181 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13182 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13183 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13184 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13185 OnlyUsesV1 ? V1 : V2,
13186 DAG.getIntPtrConstant(0, DL));
13187 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13188 DAG.getIntPtrConstant(2, DL));
13192 // Try to use SHUF128 if possible.
13193 if (Subtarget.hasVLX()) {
13194 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13195 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13196 ((WidenedMask[1] % 2) << 1);
13197 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
13198 DAG.getConstant(PermMask, DL, MVT::i8));
13203 // Otherwise form a 128-bit permutation. After accounting for undefs,
13204 // convert the 64-bit shuffle mask selection values into 128-bit
13205 // selection bits by dividing the indexes by 2 and shifting into positions
13206 // defined by a vperm2*128 instruction's immediate control byte.
13208 // The immediate permute control byte looks like this:
13209 // [1:0] - select 128 bits from sources for low half of destination
13211 // [3] - zero low half of destination
13212 // [5:4] - select 128 bits from sources for high half of destination
13214 // [7] - zero high half of destination
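//
// For example (illustrative): a v4i64 shuffle <2, 3, 4, 5> widens to
// WidenedMask = {1, 2}, giving PermMask = (1 << 0) | (2 << 4) = 0x21: the low
// half of the result is V1's high 128 bits and the high half is V2's low 128
// bits.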
13216 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
13218 unsigned PermMask = 0;
13219 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13220 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
13222 // Check the immediate mask and replace unused sources with undef.
13223 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13224 V1 = DAG.getUNDEF(VT);
13225 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13226 V2 = DAG.getUNDEF(VT);
13228 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
13229 DAG.getConstant(PermMask, DL, MVT::i8));
13232 /// Lower a vector shuffle by first fixing the 128-bit lanes and then
13233 /// shuffling each lane.
13235 /// This will only succeed when the result of fixing the 128-bit lanes results
13236 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
13237 /// each 128-bit lane. This handles many cases where we can quickly blend away
13238 /// the lane crosses early and then use simpler shuffles within each lane.
13240 /// FIXME: It might be worthwhile at some point to support this without
13241 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13242 /// in x86 only floating point has interesting non-repeating shuffles, and even
13243 /// those are still *marginally* more expensive.
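///
/// Illustrative two-input v8f32 example: Mask = <5, 4, 7, 6, 9, 8, 11, 10>
/// takes V1's high lane for the low half and V2's low lane for the high half,
/// with the repeating in-lane mask <1, 0, 3, 2>. The lanes are first placed
/// with a 64-bit-element lane shuffle <2, 3, 4, 5>, and the result is then
/// shuffled in-lane with <1, 0, 3, 2, 5, 4, 7, 6>.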
13244 static SDValue lowerVectorShuffleByMerging128BitLanes(
13245 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13246 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13247 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13249 int Size = Mask.size();
13250 int LaneSize = 128 / VT.getScalarSizeInBits();
13251 int NumLanes = Size / LaneSize;
13252 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13254 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13255 // check whether the in-128-bit lane shuffles share a repeating pattern.
13256 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13257 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
13258 for (int i = 0; i < Size; ++i) {
13262 int j = i / LaneSize;
13264 if (Lanes[j] < 0) {
13265 // First entry we've seen for this lane.
13266 Lanes[j] = Mask[i] / LaneSize;
13267 } else if (Lanes[j] != Mask[i] / LaneSize) {
13268 // This doesn't match the lane selected previously!
13272 // Check that within each lane we have a consistent shuffle mask.
13273 int k = i % LaneSize;
13274 if (InLaneMask[k] < 0) {
13275 InLaneMask[k] = Mask[i] % LaneSize;
13276 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
13277 // This doesn't fit a repeating in-lane mask.
13282 // First shuffle the lanes into place.
13283 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13284 VT.getSizeInBits() / 64);
13285 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13286 for (int i = 0; i < NumLanes; ++i)
13287 if (Lanes[i] >= 0) {
13288 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
13289 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
13292 V1 = DAG.getBitcast(LaneVT, V1);
13293 V2 = DAG.getBitcast(LaneVT, V2);
13294 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13296 // Cast it back to the type we actually want.
13297 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13299 // Now do a simple shuffle that isn't lane crossing.
13300 SmallVector<int, 8> NewMask((unsigned)Size, -1);
13301 for (int i = 0; i < Size; ++i)
13303 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13304 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13305 "Must not introduce lane crosses at this point!");
13307 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
13310 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13311 /// This allows for fast cases such as subvector extraction/insertion
13312 /// or shuffling smaller vector types which can lower more efficiently.
13313 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13314 SDValue V1, SDValue V2,
13315 ArrayRef<int> Mask,
13316 const X86Subtarget &Subtarget,
13317 SelectionDAG &DAG) {
13318 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13319 "Expected 256-bit or 512-bit vector");
13321 unsigned NumElts = VT.getVectorNumElements();
13322 unsigned HalfNumElts = NumElts / 2;
13323 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13325 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13326 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
13327 if (!UndefLower && !UndefUpper)
13330 // Upper half is undef and lower half is whole upper subvector.
13331 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13333 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13334 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13335 DAG.getIntPtrConstant(HalfNumElts, DL));
13336 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13337 DAG.getIntPtrConstant(0, DL));
13340 // Lower half is undef and upper half is whole lower subvector.
13341 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13343 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13344 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13345 DAG.getIntPtrConstant(0, DL));
13346 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13347 DAG.getIntPtrConstant(HalfNumElts, DL));
13350 // If the shuffle only uses two of the four halves of the input operands,
13351 // then extract them and perform the 'half' shuffle at half width.
13352 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13353 int HalfIdx1 = -1, HalfIdx2 = -1;
13354 SmallVector<int, 8> HalfMask(HalfNumElts);
13355 unsigned Offset = UndefLower ? HalfNumElts : 0;
13356 for (unsigned i = 0; i != HalfNumElts; ++i) {
13357 int M = Mask[i + Offset];
13363 // Determine which of the 4 half vectors this element is from.
13364 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13365 int HalfIdx = M / HalfNumElts;
13367 // Determine the element index into its half vector source.
13368 int HalfElt = M % HalfNumElts;
13370 // We can shuffle with up to 2 half vectors, set the new 'half'
13371 // shuffle mask accordingly.
13372 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13373 HalfMask[i] = HalfElt;
13374 HalfIdx1 = HalfIdx;
13377 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
13378 HalfMask[i] = HalfElt + HalfNumElts;
13379 HalfIdx2 = HalfIdx;
13383 // Too many half vectors referenced.
13386 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13388 // Only shuffle the halves of the inputs when useful.
13389 int NumLowerHalves =
13390 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13391 int NumUpperHalves =
13392 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13394 // uuuuXXXX - don't extract uppers just to insert again.
13395 if (UndefLower && NumUpperHalves != 0)
13398 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
13399 if (UndefUpper && NumUpperHalves == 2)
13402 // AVX2 - XXXXuuuu - always extract lowers.
13403 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
13404 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
13405 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13407 // AVX2 supports variable 32-bit element cross-lane shuffles.
13408 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13409 // XXXXuuuu - don't extract lowers and uppers.
13410 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
13415 // AVX512 - XXXXuuuu - always extract lowers.
13416 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
13419 auto GetHalfVector = [&](int HalfIdx) {
13421 return DAG.getUNDEF(HalfVT);
13422 SDValue V = (HalfIdx < 2 ? V1 : V2);
13423 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13424 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13425 DAG.getIntPtrConstant(HalfIdx, DL));
13428 SDValue Half1 = GetHalfVector(HalfIdx1);
13429 SDValue Half2 = GetHalfVector(HalfIdx2);
13430 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13431 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13432 DAG.getIntPtrConstant(Offset, DL));
13435 /// Test whether the specified input (0 or 1) is in-place blended by the
13438 /// This returns true if the elements from a particular input are already in the
13439 /// slots required by the given mask and require no permutation.
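///
/// For example (illustrative), with the v4 mask <0, 5, 2, 7> the elements
/// taken from input 0 (elements 0 and 2) are already at their final
/// positions, so this returns true for Input == 0; it also returns true for
/// Input == 1 here, since elements 5 and 7 land at positions 1 and 3.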
13440 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13441 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13442 int Size = Mask.size();
13443 for (int i = 0; i < Size; ++i)
13444 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13450 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13451 /// every lane can be represented as the same repeating mask - allowing us to
13452 /// shuffle the sources with the repeating shuffle and then permute the result
13453 /// to the destination lanes.
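///
/// Illustrative v8f32 single-input example on AVX2: Mask = <1,0,7,6,5,4,3,2>
/// crosses lanes, but every 64-bit sub-lane uses one of the repeating
/// patterns <1,0> or <3,2>, so we first do the in-lane shuffle
/// <1,0,3,2,5,4,7,6> and then permute the 64-bit sub-lanes into place
/// (a PERMPD of <0,3,2,1>, i.e. <0,1,6,7,4,5,2,3> at the f32 level).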
13454 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13455 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13456 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13457 int NumElts = VT.getVectorNumElements();
13458 int NumLanes = VT.getSizeInBits() / 128;
13459 int NumLaneElts = NumElts / NumLanes;
13461 // On AVX2 we may be able to just shuffle the lowest elements and then
13462 // broadcast the result.
13463 if (Subtarget.hasAVX2()) {
13464 for (unsigned BroadcastSize : {16, 32, 64}) {
13465 if (BroadcastSize <= VT.getScalarSizeInBits())
13467 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13469 // Attempt to match a repeating pattern every NumBroadcastElts,
13470 // accounting for UNDEFs but only references the lowest 128-bit
13471 // lane of the inputs.
13472 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13473 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13474 for (int j = 0; j != NumBroadcastElts; ++j) {
13475 int M = Mask[i + j];
13478 int &R = RepeatMask[j];
13479 if (0 != ((M % NumElts) / NumLaneElts))
13481 if (0 <= R && R != M)
13488 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13489 if (!FindRepeatingBroadcastMask(RepeatMask))
13492 // Shuffle the (lowest) repeated elements in place for broadcast.
13493 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13495 // Shuffle the actual broadcast.
13496 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13497 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13498 for (int j = 0; j != NumBroadcastElts; ++j)
13499 BroadcastMask[i + j] = j;
13500 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13505 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13506 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13509 // Bail if we already have a repeated lane shuffle mask.
13510 SmallVector<int, 8> RepeatedShuffleMask;
13511 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13514 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13515 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13516 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13517 int NumSubLanes = NumLanes * SubLaneScale;
13518 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13520 // Check that all the sources are coming from the same lane and see if we can
13521 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13522 // determine the source sub-lane for each destination sub-lane.
13523 int TopSrcSubLane = -1;
13524 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13525 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13526 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13527 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13529 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13530 // Extract the sub-lane mask, check that it all comes from the same lane
13531 // and normalize the mask entries to come from the first lane.
13533 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13534 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13535 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13538 int Lane = (M % NumElts) / NumLaneElts;
13539 if ((0 <= SrcLane) && (SrcLane != Lane))
13542 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13543 SubLaneMask[Elt] = LocalM;
13546 // Whole sub-lane is UNDEF.
13550 // Attempt to match against the candidate repeated sub-lane masks.
13551 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13552 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13553 for (int i = 0; i != NumSubLaneElts; ++i) {
13554 if (M1[i] < 0 || M2[i] < 0)
13556 if (M1[i] != M2[i])
13562 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13563 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13566 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13567 for (int i = 0; i != NumSubLaneElts; ++i) {
13568 int M = SubLaneMask[i];
13571 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13572 "Unexpected mask element");
13573 RepeatedSubLaneMask[i] = M;
13576 // Track the top most source sub-lane - by setting the remaining to UNDEF
13577 // we can greatly simplify shuffle matching.
13578 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13579 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13580 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13584 // Bail if we failed to find a matching repeated sub-lane mask.
13585 if (Dst2SrcSubLanes[DstSubLane] < 0)
13588 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13589 "Unexpected source lane");
13591 // Create a repeating shuffle mask for the entire vector.
13592 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13593 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13594 int Lane = SubLane / SubLaneScale;
13595 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13596 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13597 int M = RepeatedSubLaneMask[Elt];
13600 int Idx = (SubLane * NumSubLaneElts) + Elt;
13601 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13604 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13606 // Shuffle each source sub-lane to its destination.
13607 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13608 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13609 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13610 if (SrcSubLane < 0)
13612 for (int j = 0; j != NumSubLaneElts; ++j)
13613 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13616 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13620 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13621 unsigned &ShuffleImm,
13622 ArrayRef<int> Mask) {
13623 int NumElts = VT.getVectorNumElements();
13624 assert(VT.getScalarSizeInBits() == 64 &&
13625 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13626 "Unexpected data type for VSHUFPD");
13628 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13629 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
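//
// For example (illustrative): the v4f64 mask <0, 5, 2, 7> alternates V1 and
// V2 and never leaves the containing pair of elements, so it matches, and
// ShuffleImm collects bit i = Mask[i] % 2, giving 0b1010 (0xA).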
13631 bool ShufpdMask = true;
13632 bool CommutableMask = true;
13633 for (int i = 0; i < NumElts; ++i) {
13634 if (Mask[i] == SM_SentinelUndef)
13638 int Val = (i & 6) + NumElts * (i & 1);
13639 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13640 if (Mask[i] < Val || Mask[i] > Val + 1)
13641 ShufpdMask = false;
13642 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13643 CommutableMask = false;
13644 ShuffleImm |= (Mask[i] % 2) << i;
13649 if (CommutableMask) {
13657 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13658 ArrayRef<int> Mask, SDValue V1,
13659 SDValue V2, SelectionDAG &DAG) {
13660 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13661 "Unexpected data type for VSHUFPD");
13663 unsigned Immediate = 0;
13664 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13667 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13668 DAG.getConstant(Immediate, DL, MVT::i8));
13671 /// Handle lowering of 4-lane 64-bit floating point shuffles.
13673 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13674 /// isn't available.
13675 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13676 const APInt &Zeroable,
13677 SDValue V1, SDValue V2,
13678 const X86Subtarget &Subtarget,
13679 SelectionDAG &DAG) {
13680 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13681 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13682 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13684 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13685 Zeroable, Subtarget, DAG))
13688 if (V2.isUndef()) {
13689 // Check for being able to broadcast a single element.
13690 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13691 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13694 // Use low duplicate instructions for masks that match their pattern.
13695 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13696 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13698 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13699 // Non-half-crossing single input shuffles can be lowered with an
13700 // interleaved permutation.
13701 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13702 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13703 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13704 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13707 // With AVX2 we have direct support for this permutation.
13708 if (Subtarget.hasAVX2())
13709 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13710 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13712 // Try to create an in-lane repeating shuffle mask and then shuffle the
13713 // results into the target lanes.
13714 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13715 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13718 // Otherwise, fall back.
13719 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13723 // Use dedicated unpack instructions for masks that match their pattern.
13725 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13728 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13729 Zeroable, Subtarget, DAG))
13732 // Check if the blend happens to exactly fit that of SHUFPD.
13734 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13737 // Try to create an in-lane repeating shuffle mask and then shuffle the
13738 // results into the target lanes.
13739 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13740 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13743 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13744 // shuffle. However, if we have AVX2 and either input is already in place,
13745 // we will be able to shuffle the other input even across lanes in a single
13746 // instruction, so skip this pattern.
13747 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13748 isShuffleMaskInputInPlace(1, Mask))))
13749 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13750 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13752 // If we have VLX support, we can use VEXPAND.
13753 if (Subtarget.hasVLX())
13754 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13755 V1, V2, DAG, Subtarget))
13758 // If we have AVX2 then we always want to lower with a blend because at v4 we
13759 // can fully permute the elements.
13760 if (Subtarget.hasAVX2())
13761 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13764 // Otherwise fall back on generic lowering.
13765 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13768 /// Handle lowering of 4-lane 64-bit integer shuffles.
13770 /// This routine is only called when we have AVX2 and thus a reasonable
13771 /// instruction set for v4i64 shuffling.
13772 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13773 const APInt &Zeroable,
13774 SDValue V1, SDValue V2,
13775 const X86Subtarget &Subtarget,
13776 SelectionDAG &DAG) {
13777 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13778 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13779 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13780 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13782 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13783 Zeroable, Subtarget, DAG))
13786 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13787 Zeroable, Subtarget, DAG))
13790 // Check for being able to broadcast a single element.
13791 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13792 Mask, Subtarget, DAG))
13795 if (V2.isUndef()) {
13796 // When the shuffle is mirrored between the 128-bit lanes of the vector, we
13797 // can use lower latency instructions that will operate on both lanes.
13798 SmallVector<int, 2> RepeatedMask;
13799 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13800 SmallVector<int, 4> PSHUFDMask;
13801 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13802 return DAG.getBitcast(
13804 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13805 DAG.getBitcast(MVT::v8i32, V1),
13806 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13809 // AVX2 provides a direct instruction for permuting a single input across
13810 // lanes.
13811 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13812 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13815 // Try to use shift instructions.
13816 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13817 Zeroable, Subtarget, DAG))
13820 // If we have VLX support, we can use VALIGN or VEXPAND.
13821 if (Subtarget.hasVLX()) {
13822 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13823 Mask, Subtarget, DAG))
13826 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13827 V1, V2, DAG, Subtarget))
13831 // Try to use PALIGNR.
13832 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13833 Mask, Subtarget, DAG))
13836 // Use dedicated unpack instructions for masks that match their pattern.
13838 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13841 // Try to create an in-lane repeating shuffle mask and then shuffle the
13842 // results into the target lanes.
13843 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13844 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13847 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13848 // shuffle. However, if we have AVX2 and either input is already in place,
13849 // we will be able to shuffle the other input even across lanes in a single
13850 // instruction, so skip this pattern.
13851 if (!isShuffleMaskInputInPlace(0, Mask) &&
13852 !isShuffleMaskInputInPlace(1, Mask))
13853 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13854 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13857 // Otherwise fall back on generic blend lowering.
13858 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13862 /// Handle lowering of 8-lane 32-bit floating point shuffles.
13864 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13865 /// isn't available.
13866 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13867 const APInt &Zeroable,
13868 SDValue V1, SDValue V2,
13869 const X86Subtarget &Subtarget,
13870 SelectionDAG &DAG) {
13871 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13872 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13873 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13875 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13876 Zeroable, Subtarget, DAG))
13879 // Check for being able to broadcast a single element.
13880 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13881 Mask, Subtarget, DAG))
13884 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13885 // options to efficiently lower the shuffle.
13886 SmallVector<int, 4> RepeatedMask;
13887 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13888 assert(RepeatedMask.size() == 4 &&
13889 "Repeated masks must be half the mask width!");
13891 // Use even/odd duplicate instructions for masks that match their pattern.
13892 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13893 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13894 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13895 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13898 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13899 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13901 // Use dedicated unpack instructions for masks that match their pattern.
13903 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13906 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13907 // have already handled any direct blends.
13908 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13911 // Try to create an in-lane repeating shuffle mask and then shuffle the
13912 // results into the target lanes.
13913 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13914 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13917 // If we have a single input shuffle with different shuffle patterns in the
13918 // two 128-bit lanes use the variable mask to VPERMILPS.
13919 if (V2.isUndef()) {
13920 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13921 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13922 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13924 if (Subtarget.hasAVX2())
13925 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13927 // Otherwise, fall back.
13928 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13932 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13934 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13935 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13937 // If we have VLX support, we can use VEXPAND.
13938 if (Subtarget.hasVLX())
13939 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13940 V1, V2, DAG, Subtarget))
13943 // For non-AVX512 targets, if the mask matches a 16-bit unpack pattern within
13944 // each lane, try to split, since after the split we get more efficient code
13945 // using vpunpcklwd and vpunpckhwd than with vblend.
13946 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13947 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13951 // If we have AVX2 then we always want to lower with a blend because at v8 we
13952 // can fully permute the elements.
13953 if (Subtarget.hasAVX2())
13954 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13957 // Otherwise fall back on generic lowering.
13958 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13961 /// Handle lowering of 8-lane 32-bit integer shuffles.
13963 /// This routine is only called when we have AVX2 and thus a reasonable
13964 /// instruction set for v8i32 shuffling.
13965 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13966 const APInt &Zeroable,
13967 SDValue V1, SDValue V2,
13968 const X86Subtarget &Subtarget,
13969 SelectionDAG &DAG) {
13970 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13971 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13972 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13973 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13975 // Whenever we can lower this as a zext, that instruction is strictly faster
13976 // than any alternative. It also allows us to fold memory operands into the
13977 // shuffle in many cases.
13978 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13979 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13982 // For non-AVX512 targets, if the mask matches a 16-bit unpack pattern within
13983 // each lane, try to split, since after the split we get more efficient code
13984 // using vpunpcklwd and vpunpckhwd than with vblend.
13985 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13986 !Subtarget.hasAVX512())
13988 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13991 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13992 Zeroable, Subtarget, DAG))
13995 // Check for being able to broadcast a single element.
13996 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13997 Mask, Subtarget, DAG))
14000 // If the shuffle mask is repeated in each 128-bit lane we can use more
14001 // efficient instructions that mirror the shuffles across the two 128-bit
14003 SmallVector<int, 4> RepeatedMask;
14004 bool Is128BitLaneRepeatedShuffle =
14005 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
14006 if (Is128BitLaneRepeatedShuffle) {
14007 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14009 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
14010 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14012 // Use dedicated unpack instructions for masks that match their pattern.
14014 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
14018 // Try to use shift instructions.
14019 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
14020 Zeroable, Subtarget, DAG))
14023 // If we have VLX support, we can use VALIGN or EXPAND.
14024 if (Subtarget.hasVLX()) {
14025 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
14026 Mask, Subtarget, DAG))
14029 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
14030 V1, V2, DAG, Subtarget))
14034 // Try to use byte rotation instructions.
14035 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14036 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14039 // Try to create an in-lane repeating shuffle mask and then shuffle the
14040 // results into the target lanes.
14041 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14042 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14045 // If the shuffle patterns aren't repeated but it is a single input, directly
14046 // generate a cross-lane VPERMD instruction.
14047 if (V2.isUndef()) {
14048 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
14049 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
14052 // Assume that a single SHUFPS is faster than an alternative sequence of
14053 // multiple instructions (even if the CPU has a domain penalty).
14054 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
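// As an illustrative example (not part of the original comment): the repeated
// mask {0, 2, 4, 6} sources the low half from V1 and the high half from V2,
// so it can be encoded as the single SHUFPS immediate 0x88 once both inputs
// are bitcast to v8f32.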
14055 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14056 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
14057 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
14058 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
14059 CastV1, CastV2, DAG);
14060 return DAG.getBitcast(MVT::v8i32, ShufPS);
14063 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14065 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14066 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
14069 // Otherwise fall back on generic blend lowering.
14070 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
14074 /// Handle lowering of 16-lane 16-bit integer shuffles.
14076 /// This routine is only called when we have AVX2 and thus a reasonable
14077 /// instruction set for v16i16 shuffling.
14078 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14079 const APInt &Zeroable,
14080 SDValue V1, SDValue V2,
14081 const X86Subtarget &Subtarget,
14082 SelectionDAG &DAG) {
14083 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14084 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
14085 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14086 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
14088 // Whenever we can lower this as a zext, that instruction is strictly faster
14089 // than any alternative. It also allows us to fold memory operands into the
14090 // shuffle in many cases.
14091 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14092 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14095 // Check for being able to broadcast a single element.
14096 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
14097 Mask, Subtarget, DAG))
14100 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
14101 Zeroable, Subtarget, DAG))
14104 // Use dedicated unpack instructions for masks that match their pattern.
14106 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
14109 // Use dedicated pack instructions for masks that match their pattern.
14110 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
14114 // Try to use shift instructions.
14115 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
14116 Zeroable, Subtarget, DAG))
14119 // Try to use byte rotation instructions.
14120 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14121 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14124 // Try to create an in-lane repeating shuffle mask and then shuffle the
14125 // results into the target lanes.
14126 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14127 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14130 if (V2.isUndef()) {
14131 // There are no generalized cross-lane shuffle operations available on i16
14133 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
14134 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
14135 Mask, DAG, Subtarget);
14137 SmallVector<int, 8> RepeatedMask;
14138 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
14139 // As this is a single-input shuffle, the repeated mask should be
14140 // a strictly valid v8i16 mask that we can pass through to the v8i16
14141 // lowering to handle even the v16 case.
14142 return lowerV8I16GeneralSingleInputVectorShuffle(
14143 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
14147 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14148 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14151 // AVX512BWVL can lower to VPERMW.
14152 if (Subtarget.hasBWI() && Subtarget.hasVLX())
14153 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
14155 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14157 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14158 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
14161 // Otherwise fall back on generic lowering.
14162 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
14165 /// Handle lowering of 32-lane 8-bit integer shuffles.
14167 /// This routine is only called when we have AVX2 and thus a reasonable
14168 /// instruction set for v32i8 shuffling.
14169 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14170 const APInt &Zeroable,
14171 SDValue V1, SDValue V2,
14172 const X86Subtarget &Subtarget,
14173 SelectionDAG &DAG) {
14174 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14175 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14176 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14177 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
14179 // Whenever we can lower this as a zext, that instruction is strictly faster
14180 // than any alternative. It also allows us to fold memory operands into the
14181 // shuffle in many cases.
14182 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14183 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14186 // Check for being able to broadcast a single element.
14187 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
14188 Mask, Subtarget, DAG))
14191 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
14192 Zeroable, Subtarget, DAG))
14195 // Use dedicated unpack instructions for masks that match their pattern.
14197 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
14200 // Use dedicated pack instructions for masks that match their pattern.
14201 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
14205 // Try to use shift instructions.
14206 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
14207 Zeroable, Subtarget, DAG))
14210 // Try to use byte rotation instructions.
14211 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14212 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14215 // Try to create an in-lane repeating shuffle mask and then shuffle the
14216 // results into the target lanes.
14217 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14218 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14221 // There are no generalized cross-lane shuffle operations available on i8
14223 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
14224 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
14227 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14228 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14231 // AVX512VBMIVL can lower to VPERMB.
14232 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14233 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
14235 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14237 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14238 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14241 // Otherwise fall back on generic lowering.
14242 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
14245 /// High-level routine to lower various 256-bit x86 vector shuffles.
14247 /// This routine either breaks down the specific type of a 256-bit x86 vector
14248 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
14249 /// together based on the available instructions.
14250 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14251 MVT VT, SDValue V1, SDValue V2,
14252 const APInt &Zeroable,
14253 const X86Subtarget &Subtarget,
14254 SelectionDAG &DAG) {
14255 // If we have a single input to the zero element, insert that into V1 if we
14256 // can do so cheaply.
14257 int NumElts = VT.getVectorNumElements();
14258 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14260 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14261 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14262 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14265 // Handle special cases where the lower or upper half is UNDEF.
14267 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14270 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
14271 // can check for those subtargets here and avoid much of the subtarget
14272 // querying in the per-vector-type lowering routines. With AVX1 we have
14273 // essentially *zero* ability to manipulate a 256-bit vector with integer
14274 // types. Since we'll use floating point types there eventually, just
14275 // immediately cast everything to a float and operate entirely in that domain.
14276 if (VT.isInteger() && !Subtarget.hasAVX2()) {
14277 int ElementBits = VT.getScalarSizeInBits();
14278 if (ElementBits < 32) {
14279 // No floating point type is available; if we can't use the bit operations
14280 // for masking/blending then decompose into 128-bit vectors.
14282 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
14284 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
14286 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
14289 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
14290 VT.getVectorNumElements());
14291 V1 = DAG.getBitcast(FpVT, V1);
14292 V2 = DAG.getBitcast(FpVT, V2);
14293 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
14296 switch (VT.SimpleTy) {
14298 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14300 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14302 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14304 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14306 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14308 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14311 llvm_unreachable("Not a valid 256-bit x86 vector type!");
14315 /// Try to lower a vector shuffle as a series of 128-bit shuffles.
14316 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
14317 ArrayRef<int> Mask,
14318 const APInt &Zeroable,
14319 SDValue V1, SDValue V2,
14320 const X86Subtarget &Subtarget,
14321 SelectionDAG &DAG) {
14322 assert(VT.getScalarSizeInBits() == 64 &&
14323 "Unexpected element type size for 128bit shuffle.");
14325 // Handling a 256-bit vector here would require VLX, and most probably
14326 // lowerV2X128VectorShuffle() is the better solution for that case.
14327 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
14329 SmallVector<int, 4> WidenedMask;
14330 if (!canWidenShuffleElements(Mask, WidenedMask))
14333 // Try to use an insert into a zero vector.
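// Illustrative note (an assumption based on the 64-bit element count asserted
// above): Zeroable holds one bit per 64-bit element, so (Zeroable & 0xf0)
// covers elements 4..7 (the upper 256 bits) and (Zeroable & 0x0c) covers
// elements 2 and 3.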
14334 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
14335 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
14336 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
14337 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
14338 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14339 DAG.getIntPtrConstant(0, DL));
14340 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14341 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14342 DAG.getIntPtrConstant(0, DL));
14345 // Check for patterns which can be matched with a single insert of a 256-bit
14347 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
14348 {0, 1, 2, 3, 0, 1, 2, 3});
14349 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
14350 {0, 1, 2, 3, 8, 9, 10, 11})) {
14351 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
14352 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14353 OnlyUsesV1 ? V1 : V2,
14354 DAG.getIntPtrConstant(0, DL));
14355 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14356 DAG.getIntPtrConstant(4, DL));
14359 assert(WidenedMask.size() == 4);
14361 // See if this is an insertion of the lower 128 bits of V2 into V1.
14362 bool IsInsert = true;
14364 for (int i = 0; i < 4; ++i) {
14365 assert(WidenedMask[i] >= -1);
14366 if (WidenedMask[i] < 0)
14369 // Make sure all V1 subvectors are in place.
14370 if (WidenedMask[i] < 4) {
14371 if (WidenedMask[i] != i) {
14376 // Make sure we only have a single V2 index and it's the lowest 128 bits.
14377 if (V2Index >= 0 || WidenedMask[i] != 4) {
14384 if (IsInsert && V2Index >= 0) {
14385 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14386 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
14387 DAG.getIntPtrConstant(0, DL));
14388 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
14391 // Try to lower to vshuf64x2/vshuf32x4.
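// Illustrative example (not from the original source): the widened mask
// {0, 2, 5, 7} takes 128-bit chunks 0 and 2 from V1 and chunks 1 and 3 from
// V2, producing PermMask = 0 | (2 << 2) | (1 << 4) | (3 << 6) = 0xD8.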
14392 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
14393 unsigned PermMask = 0;
14394 // Ensure elements came from the same Op.
14395 for (int i = 0; i < 4; ++i) {
14396 assert(WidenedMask[i] >= -1);
14397 if (WidenedMask[i] < 0)
14400 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
14401 unsigned OpIndex = i / 2;
14402 if (Ops[OpIndex].isUndef())
14404 else if (Ops[OpIndex] != Op)
14407 // Convert the 128-bit shuffle mask selection values into 128-bit selection
14408 // bits defined by a vshuf64x2 instruction's immediate control byte.
14409 PermMask |= (WidenedMask[i] % 4) << (i * 2);
14412 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
14413 DAG.getConstant(PermMask, DL, MVT::i8));
14416 /// Handle lowering of 8-lane 64-bit floating point shuffles.
14417 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14418 const APInt &Zeroable,
14419 SDValue V1, SDValue V2,
14420 const X86Subtarget &Subtarget,
14421 SelectionDAG &DAG) {
14422 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14423 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14424 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14426 if (V2.isUndef()) {
14427 // Use low duplicate instructions for masks that match their pattern.
14428 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
14429 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
14431 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
14432 // Non-half-crossing single input shuffles can be lowered with an
14433 // interleaved permutation.
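// For example (illustrative): the within-pair swap mask
// {1, 0, 3, 2, 5, 4, 7, 6} sets bits 0, 2, 4 and 6 below, producing the
// immediate 0x55.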
14434 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14435 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
14436 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
14437 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
14438 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
14439 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14442 SmallVector<int, 4> RepeatedMask;
14443 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
14444 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
14445 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14448 if (SDValue Shuf128 =
14449 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
14453 if (SDValue Unpck =
14454 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
14457 // Check if the blend happens to exactly fit that of SHUFPD.
14459 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
14462 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
14463 V2, DAG, Subtarget))
14466 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
14467 Zeroable, Subtarget, DAG))
14470 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
14473 /// Handle lowering of 16-lane 32-bit floating point shuffles.
14474 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14475 const APInt &Zeroable,
14476 SDValue V1, SDValue V2,
14477 const X86Subtarget &Subtarget,
14478 SelectionDAG &DAG) {
14479 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14480 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14481 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14483 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14484 // options to efficiently lower the shuffle.
14485 SmallVector<int, 4> RepeatedMask;
14486 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
14487 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14489 // Use even/odd duplicate instructions for masks that match their pattern.
14490 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14491 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
14492 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14493 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
14496 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
14497 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14499 // Use dedicated unpack instructions for masks that match their pattern.
14500 if (SDValue Unpck =
14501 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
14504 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
14505 Zeroable, Subtarget, DAG))
14508 // Otherwise, fall back to a SHUFPS sequence.
14509 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
14512 // If we have a single input shuffle with different shuffle patterns in the
14513 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
14514 if (V2.isUndef() &&
14515 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
14516 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
14517 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
14520 // If we have AVX512F support, we can use VEXPAND.
14521 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
14522 V1, V2, DAG, Subtarget))
14525 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
14528 /// Handle lowering of 8-lane 64-bit integer shuffles.
14529 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14530 const APInt &Zeroable,
14531 SDValue V1, SDValue V2,
14532 const X86Subtarget &Subtarget,
14533 SelectionDAG &DAG) {
14534 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14535 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14536 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14538 if (V2.isUndef()) {
14539 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14540 // can use lower latency instructions that will operate on all four
14542 SmallVector<int, 2> Repeated128Mask;
14543 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
14544 SmallVector<int, 4> PSHUFDMask;
14545 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
14546 return DAG.getBitcast(
14548 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14549 DAG.getBitcast(MVT::v16i32, V1),
14550 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14553 SmallVector<int, 4> Repeated256Mask;
14554 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14555 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14556 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14559 if (SDValue Shuf128 =
14560 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
14561 V1, V2, Subtarget, DAG))
14564 // Try to use shift instructions.
14565 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14566 Zeroable, Subtarget, DAG))
14569 // Try to use VALIGN.
14570 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14571 Mask, Subtarget, DAG))
14574 // Try to use PALIGNR.
14575 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14576 Mask, Subtarget, DAG))
14579 if (SDValue Unpck =
14580 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14582 // If we have AVX512F support, we can use VEXPAND.
14583 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14584 V2, DAG, Subtarget))
14587 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14588 Zeroable, Subtarget, DAG))
14591 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14594 /// Handle lowering of 16-lane 32-bit integer shuffles.
14595 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14596 const APInt &Zeroable,
14597 SDValue V1, SDValue V2,
14598 const X86Subtarget &Subtarget,
14599 SelectionDAG &DAG) {
14600 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14601 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14602 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14604 // Whenever we can lower this as a zext, that instruction is strictly faster
14605 // than any alternative. It also allows us to fold memory operands into the
14606 // shuffle in many cases.
14607 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14608 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14611 // If the shuffle mask is repeated in each 128-bit lane we can use more
14612 // efficient instructions that mirror the shuffles across the four 128-bit
14614 SmallVector<int, 4> RepeatedMask;
14615 bool Is128BitLaneRepeatedShuffle =
14616 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14617 if (Is128BitLaneRepeatedShuffle) {
14618 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14620 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14621 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14623 // Use dedicated unpack instructions for masks that match their pattern.
14625 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14629 // Try to use shift instructions.
14630 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14631 Zeroable, Subtarget, DAG))
14634 // Try to use VALIGN.
14635 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14636 Mask, Subtarget, DAG))
14639 // Try to use byte rotation instructions.
14640 if (Subtarget.hasBWI())
14641 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14642 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14645 // Assume that a single SHUFPS is faster than using a permv shuffle.
14646 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14647 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14648 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14649 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14650 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14651 CastV1, CastV2, DAG);
14652 return DAG.getBitcast(MVT::v16i32, ShufPS);
14654 // If we have AVX512F support, we can use VEXPAND.
14655 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14656 V1, V2, DAG, Subtarget))
14659 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14660 Zeroable, Subtarget, DAG))
14662 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14665 /// Handle lowering of 32-lane 16-bit integer shuffles.
14666 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14667 const APInt &Zeroable,
14668 SDValue V1, SDValue V2,
14669 const X86Subtarget &Subtarget,
14670 SelectionDAG &DAG) {
14671 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14672 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14673 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14674 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14676 // Whenever we can lower this as a zext, that instruction is strictly faster
14677 // than any alternative. It also allows us to fold memory operands into the
14678 // shuffle in many cases.
14679 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14680 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14683 // Use dedicated unpack instructions for masks that match their pattern.
14685 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14688 // Try to use shift instructions.
14689 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14690 Zeroable, Subtarget, DAG))
14693 // Try to use byte rotation instructions.
14694 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14695 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14698 if (V2.isUndef()) {
14699 SmallVector<int, 8> RepeatedMask;
14700 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14701 // As this is a single-input shuffle, the repeated mask should be
14702 // a strictly valid v8i16 mask that we can pass through to the v8i16
14703 // lowering to handle even the v32 case.
14704 return lowerV8I16GeneralSingleInputVectorShuffle(
14705 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14709 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14710 Zeroable, Subtarget, DAG))
14713 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14714 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14717 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14720 /// Handle lowering of 64-lane 8-bit integer shuffles.
14721 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14722 const APInt &Zeroable,
14723 SDValue V1, SDValue V2,
14724 const X86Subtarget &Subtarget,
14725 SelectionDAG &DAG) {
14726 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14727 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14728 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14729 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14731 // Whenever we can lower this as a zext, that instruction is strictly faster
14732 // than any alternative. It also allows us to fold memory operands into the
14733 // shuffle in many cases.
14734 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14735 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14738 // Use dedicated unpack instructions for masks that match their pattern.
14740 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14743 // Try to use shift instructions.
14744 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14745 Zeroable, Subtarget, DAG))
14748 // Try to use byte rotation instructions.
14749 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14750 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14753 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14754 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14757 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14758 if (Subtarget.hasVBMI())
14759 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14761 // Try to create an in-lane repeating shuffle mask and then shuffle the
14762 // results into the target lanes.
14763 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14764 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14767 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14768 Zeroable, Subtarget, DAG))
14771 // FIXME: Implement direct support for this type!
14772 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14775 /// High-level routine to lower various 512-bit x86 vector shuffles.
14777 /// This routine either breaks down the specific type of a 512-bit x86 vector
14778 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14779 /// together based on the available instructions.
14780 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14781 MVT VT, SDValue V1, SDValue V2,
14782 const APInt &Zeroable,
14783 const X86Subtarget &Subtarget,
14784 SelectionDAG &DAG) {
14785 assert(Subtarget.hasAVX512() &&
14786 "Cannot lower 512-bit vectors w/ basic ISA!");
14788 // If we have a single input to the zero element, insert that into V1 if we
14789 // can do so cheaply.
14790 int NumElts = Mask.size();
14791 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14793 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14794 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14795 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14798 // Handle special cases where the lower or upper half is UNDEF.
14800 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14803 // Check for being able to broadcast a single element.
14804 if (SDValue Broadcast =
14805 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14808 // Dispatch to each element type for lowering. If we don't have support for
14809 // specific element type shuffles at 512 bits, immediately split them and
14810 // lower them. Each lowering routine of a given type is allowed to assume that
14811 // the requisite ISA extensions for that element type are available.
14812 switch (VT.SimpleTy) {
14814 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14816 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14818 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14820 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14822 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14824 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14827 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14831 // Lower vXi1 vector shuffles.
14832 // There is no dedicated instruction on AVX-512 that shuffles the masks.
14833 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
14834 // vector, shuffle, and then truncate it back.
14835 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14836 MVT VT, SDValue V1, SDValue V2,
14837 const APInt &Zeroable,
14838 const X86Subtarget &Subtarget,
14839 SelectionDAG &DAG) {
14840 unsigned NumElts = Mask.size();
14842 // Try to recognize shuffles that are just padding a subvector with zeros.
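// For example (illustrative): a v8i1 shuffle whose mask keeps elements 0..3
// in place and whose upper four elements are zeroable is lowered by
// extracting the low v4i1 subvector and inserting it into a zero vector.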
14843 unsigned SubvecElts = 0;
14844 for (int i = 0; i != (int)NumElts; ++i) {
14845 if (Mask[i] >= 0 && Mask[i] != i)
14850 assert(SubvecElts != NumElts && "Identity shuffle?");
14852 // Clip to a power of 2.
14853 SubvecElts = PowerOf2Floor(SubvecElts);
14855 // Make sure the number of zeroable bits in the top at least covers the bits
14856 // not covered by the subvector.
14857 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
14858 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
14859 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
14860 V1, DAG.getIntPtrConstant(0, DL));
14861 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14862 getZeroVector(VT, Subtarget, DAG, DL),
14863 Extract, DAG.getIntPtrConstant(0, DL));
14867 assert(Subtarget.hasAVX512() &&
14868 "Cannot lower 512-bit vectors w/o basic ISA!");
14870 switch (VT.SimpleTy) {
14872 llvm_unreachable("Expected a vector of i1 elements");
14874 ExtVT = MVT::v2i64;
14877 ExtVT = MVT::v4i32;
14880 // Take a 512-bit type for better shuffle support on KNL; if we have VLX, use a 256-bit operation instead.
14882 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14885 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14886 // 256-bit operation available.
14887 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
14890 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14891 // 256-bit operation available.
14892 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
14893 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
14896 ExtVT = MVT::v64i8;
14900 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14901 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14903 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14904 // The i1 elements were sign extended, so we can use X86ISD::CVT2MASK.
14905 int NumElems = VT.getVectorNumElements();
14906 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14907 (Subtarget.hasDQI() && (NumElems < 32)))
14908 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT),
14909 Shuffle, ISD::SETGT);
14911 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14914 /// Helper function that returns true if the shuffle mask should be
14915 /// commuted to improve canonicalization.
14916 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14917 int NumElements = Mask.size();
14919 int NumV1Elements = 0, NumV2Elements = 0;
14923 else if (M < NumElements)
14928 // Commute the shuffle as needed such that more elements come from V1 than
14929 // V2. This allows us to match the shuffle pattern strictly on how many
14930 // elements come from V1 without handling the symmetric cases.
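// For example (illustrative): the 4-element mask {4, 5, 6, 3} takes three
// elements from V2 and only one from V1, so the shuffle is commuted and the
// mask becomes {0, 1, 2, 7}.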
14931 if (NumV2Elements > NumV1Elements)
14934 assert(NumV1Elements > 0 && "No V1 indices");
14936 if (NumV2Elements == 0)
14939 // When the numbers of V1 and V2 elements are the same, try to minimize the
14940 // number of uses of V2 in the low half of the vector. When that is tied,
14941 // ensure that the sum of indices for V1 is equal to or lower than the sum
14942 // of indices for V2. When those are equal, try to ensure that the number of odd
14943 // indices for V1 is lower than the number of odd indices for V2.
14944 if (NumV1Elements == NumV2Elements) {
14945 int LowV1Elements = 0, LowV2Elements = 0;
14946 for (int M : Mask.slice(0, NumElements / 2))
14947 if (M >= NumElements)
14951 if (LowV2Elements > LowV1Elements)
14953 if (LowV2Elements == LowV1Elements) {
14954 int SumV1Indices = 0, SumV2Indices = 0;
14955 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14956 if (Mask[i] >= NumElements)
14958 else if (Mask[i] >= 0)
14960 if (SumV2Indices < SumV1Indices)
14962 if (SumV2Indices == SumV1Indices) {
14963 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14964 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14965 if (Mask[i] >= NumElements)
14966 NumV2OddIndices += i % 2;
14967 else if (Mask[i] >= 0)
14968 NumV1OddIndices += i % 2;
14969 if (NumV2OddIndices < NumV1OddIndices)
14978 /// Top-level lowering for x86 vector shuffles.
14980 /// This handles decomposition, canonicalization, and lowering of all x86
14981 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14982 /// above in helper routines. The canonicalization attempts to widen shuffles
14983 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14984 /// s.t. only one of the two inputs needs to be tested, etc.
14985 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14986 SelectionDAG &DAG) {
14987 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14988 ArrayRef<int> Mask = SVOp->getMask();
14989 SDValue V1 = Op.getOperand(0);
14990 SDValue V2 = Op.getOperand(1);
14991 MVT VT = Op.getSimpleValueType();
14992 int NumElements = VT.getVectorNumElements();
14994 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14996 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14997 "Can't lower MMX shuffles");
14999 bool V1IsUndef = V1.isUndef();
15000 bool V2IsUndef = V2.isUndef();
15001 if (V1IsUndef && V2IsUndef)
15002 return DAG.getUNDEF(VT);
15004 // When we create a shuffle node we put the UNDEF node to second operand,
15005 // but in some cases the first operand may be transformed to UNDEF.
15006 // In this case we should just commute the node.
15008 return DAG.getCommutedVectorShuffle(*SVOp);
15010 // Check for non-undef masks pointing at an undef vector and make the masks
15011 // undef as well. This makes it easier to match the shuffle based solely on
15015 if (M >= NumElements) {
15016 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
15017 for (int &M : NewMask)
15018 if (M >= NumElements)
15020 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
15023 // Check for illegal shuffle mask element index values.
15024 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
15025 assert(llvm::all_of(Mask,
15026 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
15027 "Out of bounds shuffle index");
15029 // We actually see shuffles that are entirely re-arrangements of a set of
15030 // zero inputs. This mostly happens while decomposing complex shuffles into
15031 // simple ones. Directly lower these as a buildvector of zeros.
15032 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
15033 if (Zeroable.isAllOnesValue())
15034 return getZeroVector(VT, Subtarget, DAG, DL);
15036 // Try to collapse shuffles into using a vector type with fewer elements but
15037 // wider element types. We cap this to not form integers or floating point
15038 // elements wider than 64 bits, but it might be interesting to form i128
15039 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
15040 SmallVector<int, 16> WidenedMask;
15041 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
15042 canWidenShuffleElements(Mask, WidenedMask)) {
15043 MVT NewEltVT = VT.isFloatingPoint()
15044 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
15045 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
15046 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
15047 // Make sure that the new vector type is legal. For example, v2f64 isn't
15049 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
15050 V1 = DAG.getBitcast(NewVT, V1);
15051 V2 = DAG.getBitcast(NewVT, V2);
15052 return DAG.getBitcast(
15053 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
15057 // Commute the shuffle if it will improve canonicalization.
15058 if (canonicalizeShuffleMaskWithCommute(Mask))
15059 return DAG.getCommutedVectorShuffle(*SVOp);
15062 lowerVectorShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
15065 // For each vector width, delegate to a specialized lowering routine.
15066 if (VT.is128BitVector())
15067 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15070 if (VT.is256BitVector())
15071 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15074 if (VT.is512BitVector())
15075 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15079 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
15082 llvm_unreachable("Unimplemented!");
15085 /// Try to lower a VSELECT instruction to a vector shuffle.
15086 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
15087 const X86Subtarget &Subtarget,
15088 SelectionDAG &DAG) {
15089 SDValue Cond = Op.getOperand(0);
15090 SDValue LHS = Op.getOperand(1);
15091 SDValue RHS = Op.getOperand(2);
15093 MVT VT = Op.getSimpleValueType();
15095 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
15097 auto *CondBV = cast<BuildVectorSDNode>(Cond);
15099 // Only non-legal VSELECTs reach this lowering; convert those into generic
15100 // shuffles and re-use the shuffle lowering path for blends.
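// For example (illustrative): a constant v4i32 condition of <-1, 0, -1, 0>
// becomes the shuffle mask {0, 5, 2, 7}, taking lanes 0 and 2 from LHS and
// lanes 1 and 3 from RHS.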
15101 SmallVector<int, 32> Mask;
15102 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
15103 SDValue CondElt = CondBV->getOperand(i);
15105 // We can't map undef to undef here; they have different meanings. Treat
15106 // an undef condition element the same as zero.
15107 if (CondElt.isUndef() || isNullConstant(CondElt))
15111 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
15114 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
15115 // A vselect where all conditions and data are constants can be optimized into
15116 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
15117 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
15118 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
15119 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
15122 // Try to lower this to a blend-style vector shuffle. This can handle all
15123 // constant condition cases.
15124 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
15127 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
15128 // with patterns on the mask registers on AVX-512.
15129 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
15132 // Variable blends are only legal from SSE4.1 onward.
15133 if (!Subtarget.hasSSE41())
15137 MVT VT = Op.getSimpleValueType();
15139 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
15140 // into an i1 condition so that we can use the mask-based 512-bit blend
15142 if (VT.getSizeInBits() == 512) {
15143 SDValue Cond = Op.getOperand(0);
15144 // The vNi1 condition case should be handled above as it can be trivially
15146 assert(Cond.getValueType().getScalarSizeInBits() ==
15147 VT.getScalarSizeInBits() &&
15148 "Should have a size-matched integer condition!");
15149 // Build a mask by testing the condition against zero.
15150 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
15151 SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond,
15152 getZeroVector(VT, Subtarget, DAG, dl),
15154 // Now return a new VSELECT using the mask.
15155 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
15158 // Only some types will be legal on some subtargets. If we can emit a legal
15159 // VSELECT-matching blend, return Op, but if we need to expand, return
15161 switch (VT.SimpleTy) {
15163 // Most of the vector types have blends past SSE4.1.
15167 // The byte blends for AVX vectors were introduced only in AVX2.
15168 if (Subtarget.hasAVX2())
15174 case MVT::v16i16: {
15175 // Bitcast everything to the vXi8 type and use a vXi8 vselect.
15176 MVT CastVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
15177 SDValue Cond = DAG.getBitcast(CastVT, Op->getOperand(0));
15178 SDValue LHS = DAG.getBitcast(CastVT, Op->getOperand(1));
15179 SDValue RHS = DAG.getBitcast(CastVT, Op->getOperand(2));
15180 SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS);
15181 return DAG.getBitcast(VT, Select);
15186 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
15187 MVT VT = Op.getSimpleValueType();
15190 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
15193 if (VT.getSizeInBits() == 8) {
15194 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
15195 Op.getOperand(0), Op.getOperand(1));
15196 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15199 if (VT == MVT::f32) {
15200 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
15201 // the result back to an FR32 register. It's only worth matching if the
15202 // result has a single use which is a store or a bitcast to i32. And in
15203 // the case of a store, it's not worth it if the index is a constant 0,
15204 // because a MOVSSmr can be used instead, which is smaller and faster.
15205 if (!Op.hasOneUse())
15207 SDNode *User = *Op.getNode()->use_begin();
15208 if ((User->getOpcode() != ISD::STORE ||
15209 isNullConstant(Op.getOperand(1))) &&
15210 (User->getOpcode() != ISD::BITCAST ||
15211 User->getValueType(0) != MVT::i32))
15213 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15214 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
15216 return DAG.getBitcast(MVT::f32, Extract);
15219 if (VT == MVT::i32 || VT == MVT::i64) {
15220 // ExtractPS/pextrq works with constant index.
15221 if (isa<ConstantSDNode>(Op.getOperand(1)))
15228 /// Extract one bit from mask vector, like v16i1 or v8i1.
15229 /// AVX-512 feature.
15230 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
15231 const X86Subtarget &Subtarget) {
15232 SDValue Vec = Op.getOperand(0);
15234 MVT VecVT = Vec.getSimpleValueType();
15235 SDValue Idx = Op.getOperand(1);
15236 MVT EltVT = Op.getSimpleValueType();
15238 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
15239 "Unexpected vector type in ExtractBitFromMaskVector");
15241 // A variable index can't be handled in mask registers,
15242 // so extend the vector to VR512/VR128.
15243 if (!isa<ConstantSDNode>(Idx)) {
15244 unsigned NumElts = VecVT.getVectorNumElements();
15245 // Extending v8i1/v16i1 to 512 bits gets better performance on KNL
15246 // than extending to 128/256 bits.
15247 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15248 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15249 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
15250 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
15251 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
15254 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15256 // If the kshift instructions of the correct width aren't natively supported
15257 // then we need to promote the vector to the native size to get the correct
15258 // zeroing behavior.
15259 if (VecVT.getVectorNumElements() < 16) {
15260 VecVT = MVT::v16i1;
15261 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
15262 DAG.getUNDEF(VecVT), Vec,
15263 DAG.getIntPtrConstant(0, dl));
15266 // Extracts from element 0 are always allowed.
15268 // Use kshiftr instruction to move to the lower element.
15269 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
15270 DAG.getConstant(IdxVal, dl, MVT::i8));
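// For example (illustrative): extracting bit 5 shifts the mask register
// right by 5 so the requested bit becomes element 0, which the bitcast and
// truncate below then read out.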
15273 // Shrink to v16i1 since that's always legal.
15274 if (VecVT.getVectorNumElements() > 16) {
15275 VecVT = MVT::v16i1;
15276 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
15277 DAG.getIntPtrConstant(0, dl));
15280 // Convert to a bitcast+aext/trunc.
15281 MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
15282 return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
15286 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15287 SelectionDAG &DAG) const {
15289 SDValue Vec = Op.getOperand(0);
15290 MVT VecVT = Vec.getSimpleValueType();
15291 SDValue Idx = Op.getOperand(1);
15293 if (VecVT.getVectorElementType() == MVT::i1)
15294 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
15296 if (!isa<ConstantSDNode>(Idx)) {
15297 // It's more profitable to go through memory (1 cycle throughput)
15298 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
15299 // The IACA tool was used to get the performance estimate
15300 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
15302 // example : extractelement <16 x i8> %a, i32 %i
15304 // Block Throughput: 3.00 Cycles
15305 // Throughput Bottleneck: Port5
15307 // | Num Of | Ports pressure in cycles | |
15308 // | Uops | 0 - DV | 5 | 6 | 7 | |
15309 // ---------------------------------------------
15310 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
15311 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
15312 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
15313 // Total Num Of Uops: 4
15316 // Block Throughput: 1.00 Cycles
15317 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
15319 // | | Ports pressure in cycles | |
15320 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
15321 // ---------------------------------------------------------
15322 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
15323 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
15324 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
15325 // Total Num Of Uops: 4
15330 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15332 // If this is a 256-bit vector result, first extract the 128-bit vector and
15333 // then extract the element from the 128-bit vector.
15334 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15335 // Get the 128-bit vector.
15336 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15337 MVT EltVT = VecVT.getVectorElementType();
15339 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15340 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15342 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15343 // this can be done with a mask.
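// For example (illustrative): extracting element 6 from a v8i32 first takes
// the upper 128-bit half above, and 6 & (4 - 1) == 2 then selects the element
// within that half.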
15344 IdxVal &= ElemsPerChunk - 1;
15345 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15346 DAG.getConstant(IdxVal, dl, MVT::i32));
15349 assert(VecVT.is128BitVector() && "Unexpected vector length");
15351 MVT VT = Op.getSimpleValueType();
15353 if (VT.getSizeInBits() == 16) {
15354 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15355 // we're going to zero extend the register or fold the store (SSE41 only).
15356 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15357 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15358 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15359 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15360 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15362 // Transform it so it matches pextrw, which produces a 32-bit result.
15363 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15364 Op.getOperand(0), Op.getOperand(1));
15365 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15368 if (Subtarget.hasSSE41())
15369 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15372 // TODO: We only extract a single element from v16i8, we can probably afford
15373 // to be more aggressive here before using the default approach of spilling to
15375 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15376 // Extract either the lowest i32 or any i16, and extract the sub-byte.
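// For example (illustrative): byte index 5 maps to i16 element 2
// (WordIdx = 5 / 2), a logical right shift by 8 bits (ShiftVal = (5 % 2) * 8),
// and a final truncate to i8.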
15377 int DWordIdx = IdxVal / 4;
15378 if (DWordIdx == 0) {
15379 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15380 DAG.getBitcast(MVT::v4i32, Vec),
15381 DAG.getIntPtrConstant(DWordIdx, dl));
15382 int ShiftVal = (IdxVal % 4) * 8;
15384 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15385 DAG.getConstant(ShiftVal, dl, MVT::i8));
15386 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15389 int WordIdx = IdxVal / 2;
15390 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15391 DAG.getBitcast(MVT::v8i16, Vec),
15392 DAG.getIntPtrConstant(WordIdx, dl));
15393 int ShiftVal = (IdxVal % 2) * 8;
15395 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15396 DAG.getConstant(ShiftVal, dl, MVT::i8));
15397 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15400 if (VT.getSizeInBits() == 32) {
15404 // SHUFPS the element to the lowest double word, then movss.
15405 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15406 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15407 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15408 DAG.getIntPtrConstant(0, dl));
15411 if (VT.getSizeInBits() == 64) {
15412 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15413 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15414 // to match extract_elt for f64.
15418 // UNPCKHPD the element to the lowest double word, then movsd.
15419 // Note that if the lower 64 bits of the result of the UNPCKHPD are then stored
15420 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
15421 int Mask[2] = { 1, -1 };
15422 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15423 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15424 DAG.getIntPtrConstant(0, dl));
15430 /// Insert one bit to mask vector, like v16i1 or v8i1.
15431 /// AVX-512 feature.
15432 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15433 const X86Subtarget &Subtarget) {
15435 SDValue Vec = Op.getOperand(0);
15436 SDValue Elt = Op.getOperand(1);
15437 SDValue Idx = Op.getOperand(2);
15438 MVT VecVT = Vec.getSimpleValueType();
15440 if (!isa<ConstantSDNode>(Idx)) {
15441 // Non-constant index. Extend source and destination,
15442 // insert element and then truncate the result.
15443 unsigned NumElts = VecVT.getVectorNumElements();
15444 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15445 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15446 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15447 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15448 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15449 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15452 // Copy into a k-register, extract to v1i1 and insert_subvector.
15453 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15455 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15459 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15460 SelectionDAG &DAG) const {
15461 MVT VT = Op.getSimpleValueType();
15462 MVT EltVT = VT.getVectorElementType();
15463 unsigned NumElts = VT.getVectorNumElements();
15465 if (EltVT == MVT::i1)
15466 return InsertBitToMaskVector(Op, DAG, Subtarget);
15469 SDValue N0 = Op.getOperand(0);
15470 SDValue N1 = Op.getOperand(1);
15471 SDValue N2 = Op.getOperand(2);
15472 if (!isa<ConstantSDNode>(N2))
15474 auto *N2C = cast<ConstantSDNode>(N2);
15475 unsigned IdxVal = N2C->getZExtValue();
15477 bool IsZeroElt = X86::isZeroNode(N1);
15478 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15480 // If we are inserting an element, see if we can do this more efficiently with
15481 // a blend shuffle with a rematerializable vector than a costly integer insertion.
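// For example, inserting zero into lane 2 of a v4i32 becomes a shuffle of the
// original vector with a zero vector using the mask <0, 1, 6, 3>, where lane 6
// selects element 2 of the zero vector.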
15483 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15484 16 <= EltVT.getSizeInBits()) {
15485 SmallVector<int, 8> BlendMask;
15486 for (unsigned i = 0; i != NumElts; ++i)
15487 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15488 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15489 : getOnesVector(VT, DAG, dl);
15490 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15493 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15494 // into that, and then insert the subvector back into the result.
15495 if (VT.is256BitVector() || VT.is512BitVector()) {
15496 // With a 256-bit vector, we can insert into the zero element efficiently
15497 // using a blend if we have AVX or AVX2 and the right data type.
15498 if (VT.is256BitVector() && IdxVal == 0) {
15499 // TODO: It is worthwhile to cast integer to floating point and back
15500 // and incur a domain crossing penalty if that's what we'll end up
15501 // doing anyway after extracting to a 128-bit vector.
15502 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15503 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15504 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15505 N2 = DAG.getIntPtrConstant(1, dl);
15506 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15510 // Get the desired 128-bit vector chunk.
15511 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15513 // Insert the element into the desired chunk.
15514 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15515 assert(isPowerOf2_32(NumEltsIn128));
15516 // Since NumEltsIn128 is a power of 2 we can use a mask instead of a modulo.
15517 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15519 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15520 DAG.getConstant(IdxIn128, dl, MVT::i32));
15522 // Insert the changed part back into the bigger vector
15523 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15525 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15527 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
15528 // argument. SSE41 required for pinsrb.
15529 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15531 if (VT == MVT::v8i16) {
15532 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15533 Opc = X86ISD::PINSRW;
15535 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15536 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15537 Opc = X86ISD::PINSRB;
15540 if (N1.getValueType() != MVT::i32)
15541 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15542 if (N2.getValueType() != MVT::i32)
15543 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15544 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15547 if (Subtarget.hasSSE41()) {
15548 if (EltVT == MVT::f32) {
15549 // Bits [7:6] of the constant are the source select. This will always be
15550 // zero here. The DAG Combiner may combine an extract_elt index into
15551 // these bits. For example (insert (extract, 3), 2) could be matched by
15552 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15553 // Bits [5:4] of the constant are the destination select. This is the
15554 // value of the incoming immediate.
15555 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15556 // combine either bitwise AND or insert of float 0.0 to set these bits.
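// For example, an insertion into lane 2 (IdxVal == 2) below uses the immediate
// 0x20: source select 0, destination select 2, zero mask 0.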
15558 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15559 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15560 // If this is an insertion of 32-bits into the low 32-bits of
15561 // a vector, we prefer to generate a blend with immediate rather
15562 // than an insertps. Blends are simpler operations in hardware and so
15563 // will always have equal or better performance than insertps.
15564 // But if optimizing for size and there's a load folding opportunity,
15565 // generate insertps because blendps does not have a 32-bit memory operand.
15567 N2 = DAG.getIntPtrConstant(1, dl);
15568 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15569 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15571 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15572 // Create this as a scalar_to_vector node.
15573 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15574 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15577 // PINSR* works with constant index.
15578 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15585 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15586 SelectionDAG &DAG) {
15588 MVT OpVT = Op.getSimpleValueType();
15590 // It's always cheaper to replace an xor+movd with xorps, and it simplifies further DAG combines.
15592 if (X86::isZeroNode(Op.getOperand(0)))
15593 return getZeroVector(OpVT, Subtarget, DAG, dl);
15595 // If this is a 256-bit vector result, first insert into a 128-bit
15596 // vector and then insert into the 256-bit vector.
15597 if (!OpVT.is128BitVector()) {
15598 // Insert into a 128-bit vector.
15599 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15600 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15601 OpVT.getVectorNumElements() / SizeFactor);
15603 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15605 // Insert the 128-bit vector.
15606 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15608 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15610 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15611 if (OpVT == MVT::v4i32)
15614 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15615 return DAG.getBitcast(
15616 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15619 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15620 // simple superregister reference or explicit instructions to insert
15621 // the upper bits of a vector.
15622 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15623 SelectionDAG &DAG) {
15624 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15626 return insert1BitVector(Op, DAG, Subtarget);
15629 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15630 SelectionDAG &DAG) {
15631 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15632 "Only vXi1 extract_subvectors need custom lowering");
15635 SDValue Vec = Op.getOperand(0);
15636 SDValue Idx = Op.getOperand(1);
15638 if (!isa<ConstantSDNode>(Idx))
15641 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15642 if (IdxVal == 0) // the operation is legal
15645 MVT VecVT = Vec.getSimpleValueType();
15646 unsigned NumElems = VecVT.getVectorNumElements();
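// The extract is lowered as a mask-register shift: shift the wanted bits down
// to the LSBs with KSHIFTR and then take the low subvector. For example,
// extracting <2 x i1> at index 2 from a v8i1 mask shifts right by 2 and keeps
// bits [1:0].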
15648 // Extend to natively supported kshift.
15649 MVT WideVecVT = VecVT;
15650 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15651 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15652 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15653 DAG.getUNDEF(WideVecVT), Vec,
15654 DAG.getIntPtrConstant(0, dl));
15657 // Shift to the LSB.
15658 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15659 DAG.getConstant(IdxVal, dl, MVT::i8));
15661 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15662 DAG.getIntPtrConstant(0, dl));
15665 // Returns the appropriate wrapper opcode for a global reference.
15666 unsigned X86TargetLowering::getGlobalWrapperKind(
15667 const GlobalValue *GV, const unsigned char OpFlags) const {
15668 // References to absolute symbols are never PC-relative.
15669 if (GV && GV->isAbsoluteSymbolRef())
15670 return X86ISD::Wrapper;
15672 CodeModel::Model M = getTargetMachine().getCodeModel();
15673 if (Subtarget.isPICStyleRIPRel() &&
15674 (M == CodeModel::Small || M == CodeModel::Kernel))
15675 return X86ISD::WrapperRIP;
15677 // GOTPCREL references must always use RIP.
15678 if (OpFlags == X86II::MO_GOTPCREL)
15679 return X86ISD::WrapperRIP;
15681 return X86ISD::Wrapper;
15684 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15685 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15686 // one of the above mentioned nodes. It has to be wrapped because otherwise
15687 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15688 // be used to form an addressing mode. These wrapped nodes will be selected into MOV32ri.
15691 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15692 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15694 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15695 // global base reg.
15696 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15698 auto PtrVT = getPointerTy(DAG.getDataLayout());
15699 SDValue Result = DAG.getTargetConstantPool(
15700 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15702 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15703 // With PIC, the address is actually $g + Offset.
15706 DAG.getNode(ISD::ADD, DL, PtrVT,
15707 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15713 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15714 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15716 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15717 // global base reg.
15718 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15720 auto PtrVT = getPointerTy(DAG.getDataLayout());
15721 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15723 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15725 // With PIC, the address is actually $g + Offset.
15728 DAG.getNode(ISD::ADD, DL, PtrVT,
15729 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15735 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15736 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15738 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15739 // global base reg.
15740 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15741 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15743 auto PtrVT = getPointerTy(DAG.getDataLayout());
15744 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15747 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15749 // With PIC, the address is actually $g + Offset.
15750 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15752 DAG.getNode(ISD::ADD, DL, PtrVT,
15753 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15756 // For symbols that require a load from a stub to get the address, emit the load.
15758 if (isGlobalStubReference(OpFlag))
15759 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15760 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15766 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15767 // Create the TargetBlockAddressAddress node.
15768 unsigned char OpFlags =
15769 Subtarget.classifyBlockAddressReference();
15770 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15771 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15773 auto PtrVT = getPointerTy(DAG.getDataLayout());
15774 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15775 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15777 // With PIC, the address is actually $g + Offset.
15778 if (isGlobalRelativeToPICBase(OpFlags)) {
15779 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15780 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15786 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15787 const SDLoc &dl, int64_t Offset,
15788 SelectionDAG &DAG) const {
15789 // Create the TargetGlobalAddress node, folding in the constant
15790 // offset if it is legal.
15791 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15792 CodeModel::Model M = DAG.getTarget().getCodeModel();
15793 auto PtrVT = getPointerTy(DAG.getDataLayout());
15795 if (OpFlags == X86II::MO_NO_FLAG &&
15796 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15797 // A direct static reference to a global.
15798 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15801 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15804 Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);
15806 // With PIC, the address is actually $g + Offset.
15807 if (isGlobalRelativeToPICBase(OpFlags)) {
15808 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15809 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15812 // For globals that require a load from a stub to get the address, emit the load.
15814 if (isGlobalStubReference(OpFlags))
15815 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15816 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15818 // If there was a non-zero offset that we didn't fold, create an explicit
15819 // addition for it.
15821 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15822 DAG.getConstant(Offset, dl, PtrVT));
15828 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15829 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15830 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15831 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
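// Emit a TLSADDR or TLSBASEADDR call pseudo for the given global and return
// the resulting address from ReturnReg (EAX/RAX).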
15835 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15836 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15837 unsigned char OperandFlags, bool LocalDynamic = false) {
15838 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15839 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15841 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15842 GA->getValueType(0),
15846 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15850 SDValue Ops[] = { Chain, TGA, *InFlag };
15851 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15853 SDValue Ops[] = { Chain, TGA };
15854 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15857 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
15858 MFI.setAdjustsStack(true);
15859 MFI.setHasCalls(true);
15861 SDValue Flag = Chain.getValue(1);
15862 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15865 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15867 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15870 SDLoc dl(GA); // ? function entry point might be better
15871 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15872 DAG.getNode(X86ISD::GlobalBaseReg,
15873 SDLoc(), PtrVT), InFlag);
15874 InFlag = Chain.getValue(1);
15876 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15879 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15881 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15883 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15884 X86::RAX, X86II::MO_TLSGD);
15887 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15893 // Get the start address of the TLS block for this module.
15894 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15895 .getInfo<X86MachineFunctionInfo>();
15896 MFI->incNumLocalDynamicTLSAccesses();
15900 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15901 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15904 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15905 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15906 InFlag = Chain.getValue(1);
15907 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15908 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15911 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15915 unsigned char OperandFlags = X86II::MO_DTPOFF;
15916 unsigned WrapperKind = X86ISD::Wrapper;
15917 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15918 GA->getValueType(0),
15919 GA->getOffset(), OperandFlags);
15920 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15922 // Add x@dtpoff with the base.
15923 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15926 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15927 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15928 const EVT PtrVT, TLSModel::Model model,
15929 bool is64Bit, bool isPIC) {
15932 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15933 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15934 is64Bit ? 257 : 256));
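// Address space 256 selects the GS segment and 257 selects the FS segment, so
// this null pointer dereferences %gs:0 (32-bit) or %fs:0 (64-bit).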
15936 SDValue ThreadPointer =
15937 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15938 MachinePointerInfo(Ptr));
15940 unsigned char OperandFlags = 0;
15941 // Most TLS accesses are not RIP relative, even on x86-64. One exception is the initial-exec model on 64-bit targets.
15943 unsigned WrapperKind = X86ISD::Wrapper;
15944 if (model == TLSModel::LocalExec) {
15945 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15946 } else if (model == TLSModel::InitialExec) {
15948 OperandFlags = X86II::MO_GOTTPOFF;
15949 WrapperKind = X86ISD::WrapperRIP;
15951 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15954 llvm_unreachable("Unexpected model");
15957 // emit "addl x@ntpoff,%eax" (local exec)
15958 // or "addl x@indntpoff,%eax" (initial exec)
15959 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15961 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15962 GA->getOffset(), OperandFlags);
15963 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15965 if (model == TLSModel::InitialExec) {
15966 if (isPIC && !is64Bit) {
15967 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15968 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15972 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15973 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15976 // The address of the thread local variable is the add of the thread
15977 // pointer with the offset of the variable.
15978 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15982 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15984 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15986 if (DAG.getTarget().useEmulatedTLS())
15987 return LowerToTLSEmulatedModel(GA, DAG);
15989 const GlobalValue *GV = GA->getGlobal();
15990 auto PtrVT = getPointerTy(DAG.getDataLayout());
15991 bool PositionIndependent = isPositionIndependent();
15993 if (Subtarget.isTargetELF()) {
15994 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15996 case TLSModel::GeneralDynamic:
15997 if (Subtarget.is64Bit())
15998 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15999 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
16000 case TLSModel::LocalDynamic:
16001 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
16002 Subtarget.is64Bit());
16003 case TLSModel::InitialExec:
16004 case TLSModel::LocalExec:
16005 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
16006 PositionIndependent);
16008 llvm_unreachable("Unknown TLS model.");
16011 if (Subtarget.isTargetDarwin()) {
16012 // Darwin only has one model of TLS. Lower to that.
16013 unsigned char OpFlag = 0;
16014 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
16015 X86ISD::WrapperRIP : X86ISD::Wrapper;
16017 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
16018 // global base reg.
16019 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
16021 OpFlag = X86II::MO_TLVP_PIC_BASE;
16023 OpFlag = X86II::MO_TLVP;
16025 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
16026 GA->getValueType(0),
16027 GA->getOffset(), OpFlag);
16028 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
16030 // With PIC32, the address is actually $g + Offset.
16032 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
16033 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
16036 // Lowering the machine ISD will make sure everything is in the right location.
16038 SDValue Chain = DAG.getEntryNode();
16039 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
16040 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
16041 SDValue Args[] = { Chain, Offset };
16042 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
16043 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
16044 DAG.getIntPtrConstant(0, DL, true),
16045 Chain.getValue(1), DL);
16047 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
16048 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
16049 MFI.setAdjustsStack(true);
16051 // And our return value (tls address) is in the standard call return value location.
16053 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
16054 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
16057 if (Subtarget.isTargetKnownWindowsMSVC() ||
16058 Subtarget.isTargetWindowsItanium() ||
16059 Subtarget.isTargetWindowsGNU()) {
16060 // Just use the implicit TLS architecture
16061 // Need to generate something similar to:
16062 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
16064 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
16065 // mov rcx, qword [rdx+rcx*8]
16066 // mov eax, .tls$:tlsvar
16067 // [rax+rcx] contains the address
16068 // Windows 64bit: gs:0x58
16069 // Windows 32bit: fs:__tls_array
16072 SDValue Chain = DAG.getEntryNode();
16074 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
16075 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
16076 // use its literal value of 0x2C.
16077 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
16078 ? Type::getInt8PtrTy(*DAG.getContext(),
16080 : Type::getInt32PtrTy(*DAG.getContext(),
16083 SDValue TlsArray = Subtarget.is64Bit()
16084 ? DAG.getIntPtrConstant(0x58, dl)
16085 : (Subtarget.isTargetWindowsGNU()
16086 ? DAG.getIntPtrConstant(0x2C, dl)
16087 : DAG.getExternalSymbol("_tls_array", PtrVT));
16089 SDValue ThreadPointer =
16090 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
16093 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
16094 res = ThreadPointer;
16096 // Load the _tls_index variable
16097 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
16098 if (Subtarget.is64Bit())
16099 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
16100 MachinePointerInfo(), MVT::i32);
16102 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
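// ThreadPointer holds the ThreadLocalStoragePointer array; scale the module's
// TLS index by the pointer size to address this module's slot in that array.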
16104 auto &DL = DAG.getDataLayout();
16106 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
16107 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
16109 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
16112 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
16114 // Get the offset of the variable relative to the start of the .tls section.
16115 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
16116 GA->getValueType(0),
16117 GA->getOffset(), X86II::MO_SECREL);
16118 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
16120 // The address of the thread local variable is the add of the thread
16121 // pointer with the offset of the variable.
16122 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
16125 llvm_unreachable("TLS not implemented for this target.");
16128 /// Lower SRA_PARTS and friends, which return two i32 values
16129 /// and take a 2 x i32 value to shift plus a shift amount.
16130 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
16131 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
16132 MVT VT = Op.getSimpleValueType();
16133 unsigned VTBits = VT.getSizeInBits();
16135 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
16136 SDValue ShOpLo = Op.getOperand(0);
16137 SDValue ShOpHi = Op.getOperand(1);
16138 SDValue ShAmt = Op.getOperand(2);
16139 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
16140 // generic ISD nodes don't. Insert an AND to be safe; it's optimized away otherwise.
16142 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16143 DAG.getConstant(VTBits - 1, dl, MVT::i8));
16144 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
16145 DAG.getConstant(VTBits - 1, dl, MVT::i8))
16146 : DAG.getConstant(0, dl, VT);
16148 SDValue Tmp2, Tmp3;
16149 if (Op.getOpcode() == ISD::SHL_PARTS) {
16150 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
16151 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
16153 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
16154 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
16157 // If the shift amount is larger than or equal to the width of a part, we can't
16158 // rely on the results of shld/shrd. Insert a test and select the appropriate
16159 // values for large shift amounts.
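// For example, an i64 SHL_PARTS by 40 on a 32-bit target: the test
// (ShAmt & 32) != 0 is true, so the selects produce Lo = 0 and Hi = ShOpLo << 8.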
16160 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
16161 DAG.getConstant(VTBits, dl, MVT::i8));
16162 SDValue Cond = DAG.getSetCC(dl, MVT::i8, AndNode,
16163 DAG.getConstant(0, dl, MVT::i8), ISD::SETNE);
16166 if (Op.getOpcode() == ISD::SHL_PARTS) {
16167 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16168 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16170 Lo = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp3, Tmp2);
16171 Hi = DAG.getNode(ISD::SELECT, dl, VT, Cond, Tmp1, Tmp3);
16174 return DAG.getMergeValues({ Lo, Hi }, dl);
16177 // Try to use a packed vector operation to handle i64 on 32-bit targets when
16178 // AVX512DQ is enabled.
16179 static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
16180 const X86Subtarget &Subtarget) {
16181 assert((Op.getOpcode() == ISD::SINT_TO_FP ||
16182 Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
16183 SDValue Src = Op.getOperand(0);
16184 MVT SrcVT = Src.getSimpleValueType();
16185 MVT VT = Op.getSimpleValueType();
16187 if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
16188 (VT != MVT::f32 && VT != MVT::f64))
16191 // Pack the i64 into a vector, do the operation and extract.
16193 // Use an input of at least 256 bits (v4i64) so the f32 result is a full 128-bit vector.
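// For example, with VLX an f64 (sint_to_fp i64 %x) becomes
//   extract_elt (v4f64 (sint_to_fp (v4i64 (scalar_to_vector %x)))), 0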
16194 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
16195 MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
16196 MVT VecVT = MVT::getVectorVT(VT, NumElts);
16199 SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
16200 SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
16201 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
16202 DAG.getIntPtrConstant(0, dl));
16205 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
16206 SelectionDAG &DAG) const {
16207 SDValue Src = Op.getOperand(0);
16208 MVT SrcVT = Src.getSimpleValueType();
16209 MVT VT = Op.getSimpleValueType();
16212 if (SrcVT.isVector()) {
16213 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
16214 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16215 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16216 DAG.getUNDEF(SrcVT)));
16221 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16222 "Unknown SINT_TO_FP to lower!");
16224 // These are really Legal; return the operand so the caller accepts it as legal.
16226 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
16228 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit()) {
16232 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16235 SDValue ValueToStore = Op.getOperand(0);
16236 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
16237 !Subtarget.is64Bit())
16238 // Bitcasting to f64 here allows us to do a single 64-bit store from
16239 // an SSE register, avoiding the store forwarding penalty that would come
16240 // with two 32-bit stores.
16241 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16243 unsigned Size = SrcVT.getSizeInBits()/8;
16244 MachineFunction &MF = DAG.getMachineFunction();
16245 auto PtrVT = getPointerTy(MF.getDataLayout());
16246 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16247 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16248 SDValue Chain = DAG.getStore(
16249 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16250 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16251 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
16254 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16256 SelectionDAG &DAG) const {
16260 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
16262 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16264 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16266 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16268 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16269 MachineMemOperand *MMO;
16271 int SSFI = FI->getIndex();
16272 MMO = DAG.getMachineFunction().getMachineMemOperand(
16273 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16274 MachineMemOperand::MOLoad, ByteSize, ByteSize);
16276 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16277 StackSlot = StackSlot.getOperand(1);
16279 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16280 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16282 Tys, Ops, SrcVT, MMO);
16285 Chain = Result.getValue(1);
16286 SDValue InFlag = Result.getValue(2);
16288 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16289 // shouldn't be necessary except that RFP cannot be live across
16290 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16291 MachineFunction &MF = DAG.getMachineFunction();
16292 unsigned SSFISize = Op.getValueSizeInBits()/8;
16293 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16294 auto PtrVT = getPointerTy(MF.getDataLayout());
16295 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16296 Tys = DAG.getVTList(MVT::Other);
16298 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16300 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16301 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16302 MachineMemOperand::MOStore, SSFISize, SSFISize);
16304 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16305 Ops, Op.getValueType(), MMO);
16306 Result = DAG.getLoad(
16307 Op.getValueType(), DL, Chain, StackSlot,
16308 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16314 /// 64-bit unsigned integer to double expansion.
16315 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16316 const X86Subtarget &Subtarget) {
16317 // This algorithm is not obvious. Here is what we're trying to output:
16320 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16321 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16323 haddpd %xmm0, %xmm0
16325 pshufd $0x4e, %xmm0, %xmm1
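// The punpckldq interleaves the two 32-bit halves of the input with the
// exponent patterns for 2^52 and 2^84, producing the doubles 2^52 + lo and
// 2^84 + hi * 2^32. Subtracting {2^52, 2^84} recovers lo and hi * 2^32
// exactly, and the horizontal add sums them into the final result.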
16331 LLVMContext *Context = DAG.getContext();
16333 // Build some magic constants.
16334 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16335 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16336 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16337 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16339 SmallVector<Constant*,2> CV1;
16341 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16342 APInt(64, 0x4330000000000000ULL))));
16344 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16345 APInt(64, 0x4530000000000000ULL))));
16346 Constant *C1 = ConstantVector::get(CV1);
16347 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16349 // Load the 64-bit value into an XMM register.
16350 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16353 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16354 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16355 /* Alignment = */ 16);
16357 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16360 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16361 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16362 /* Alignment = */ 16);
16363 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16364 // TODO: Are there any fast-math-flags to propagate here?
16365 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
16368 if (Subtarget.hasSSE3()) {
16369 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16370 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16372 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16373 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16374 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16375 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16378 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16379 DAG.getIntPtrConstant(0, dl));
16382 /// 32-bit unsigned integer to float expansion.
16383 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16384 const X86Subtarget &Subtarget) {
16386 // FP constant to bias correct the final result.
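// 0x4330000000000000 is the bit pattern of 2^52. OR'ing the 32-bit value into
// the low mantissa bits of that pattern yields the double 2^52 + x exactly, so
// subtracting the bias afterwards produces x.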
16387 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16390 // Load the 32-bit value into an XMM register.
16391 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16394 // Zero out the upper parts of the register.
16395 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16397 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16398 DAG.getBitcast(MVT::v2f64, Load),
16399 DAG.getIntPtrConstant(0, dl));
16401 // Or the load with the bias.
16402 SDValue Or = DAG.getNode(
16403 ISD::OR, dl, MVT::v2i64,
16404 DAG.getBitcast(MVT::v2i64,
16405 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16406 DAG.getBitcast(MVT::v2i64,
16407 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16409 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16410 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16412 // Subtract the bias.
16413 // TODO: Are there any fast-math-flags to propagate here?
16414 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
16416 // Handle final rounding.
16417 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
16420 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16421 const X86Subtarget &Subtarget,
16423 if (Op.getSimpleValueType() != MVT::v2f64)
16426 SDValue N0 = Op.getOperand(0);
16427 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16429 // Legalize to v4i32 type.
16430 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16431 DAG.getUNDEF(MVT::v2i32));
16433 if (Subtarget.hasAVX512())
16434 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16436 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16437 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
16438 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16439 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16441 // Two to the power of half-word-size.
16442 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16444 // Build LO by clearing the upper half-word and HI by shifting the lower half-word away.
16445 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16446 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
16448 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16449 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16450 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16452 // Add the two halves.
16453 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
16456 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16457 const X86Subtarget &Subtarget) {
16458 // The algorithm is the following:
16459 // #ifdef __SSE4_1__
16460 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16461 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16462 // (uint4) 0x53000000, 0xaa);
16464 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16465 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16467 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16468 // return (float4) lo + fhi;
16470 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16471 // reassociate the two FADDs, and if we do that, the algorithm fails
16472 // spectacularly (PR24512).
16473 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16474 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16475 // there's also the MachineCombiner reassociations happening on Machine IR.
16476 if (DAG.getTarget().Options.UnsafeFPMath)
16480 SDValue V = Op->getOperand(0);
16481 MVT VecIntVT = V.getSimpleValueType();
16482 bool Is128 = VecIntVT == MVT::v4i32;
16483 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16484 // If we convert to something other than the supported type, e.g., to v4f64, bail out early.
16486 if (VecFloatVT != Op->getSimpleValueType(0))
16489 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16490 "Unsupported custom type");
16492 // In the #ifdef/#else code, we have in common:
16493 // - The vector of constants:
16499 // Create the splat vector for 0x4b000000.
16500 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16501 // Create the splat vector for 0x53000000.
16502 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16504 // Create the right shift.
16505 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16506 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16509 if (Subtarget.hasSSE41()) {
16510 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16511 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16512 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16513 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16514 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
16516 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16517 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16518 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16519 // (uint4) 0x53000000, 0xaa);
16520 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16521 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16522 // High will be bitcasted right away, so do not bother bitcasting back to
16523 // its original type.
16524 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16525 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16527 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16528 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16529 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16530 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16532 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16533 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16536 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
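// 0xD3000080 is the single-precision encoding of -(2^39 + 2^23). Since hi and
// lo were formed with the 2^39 and 2^23 exponent patterns respectively, adding
// this constant to hi and then adding lo cancels both biases, leaving
// (v >> 16) * 2^16 + (v & 0xffff), i.e. the original value, as a float.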
16537 SDValue VecCstFAdd = DAG.getConstantFP(
16538 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16540 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16541 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16542 // TODO: Are there any fast-math-flags to propagate here?
16544 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16545 // return (float4) lo + fhi;
16546 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16547 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16550 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16551 const X86Subtarget &Subtarget) {
16552 SDValue N0 = Op.getOperand(0);
16553 MVT SrcVT = N0.getSimpleValueType();
16556 switch (SrcVT.SimpleTy) {
16558 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16560 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16563 assert(!Subtarget.hasAVX512());
16564 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16568 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16569 SelectionDAG &DAG) const {
16570 SDValue N0 = Op.getOperand(0);
16572 auto PtrVT = getPointerTy(DAG.getDataLayout());
16574 if (Op.getSimpleValueType().isVector())
16575 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16577 MVT SrcVT = N0.getSimpleValueType();
16578 MVT DstVT = Op.getSimpleValueType();
16580 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16581 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16582 // Conversions from unsigned i32 to f32/f64 are legal,
16583 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16587 if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
16590 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16591 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16592 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16593 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16594 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16597 // Make a 64-bit buffer, and use it to build an FILD.
16598 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16599 if (SrcVT == MVT::i32) {
16600 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16601 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16602 StackSlot, MachinePointerInfo());
16603 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16604 OffsetSlot, MachinePointerInfo());
16605 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16609 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16610 SDValue ValueToStore = Op.getOperand(0);
16611 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16612 // Bitcasting to f64 here allows us to do a single 64-bit store from
16613 // an SSE register, avoiding the store forwarding penalty that would come
16614 // with two 32-bit stores.
16615 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16616 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16617 MachinePointerInfo());
16618 // For i64 source, we need to add the appropriate power of 2 if the input
16619 // was negative. This is the same as the optimization in
16620 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16621 // we must be careful to do the computation in x87 extended precision, not
16622 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16623 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16624 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16625 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16626 MachineMemOperand::MOLoad, 8, 8);
16628 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16629 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16630 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16633 APInt FF(32, 0x5F800000ULL);
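// 0x5F800000 is 2^64 as an IEEE single. FILD interprets the i64 as signed, so
// if the original unsigned value had its sign bit set the loaded result is off
// by exactly -2^64; the fudge factor selected below adds 2^64 back in that case.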
16635 // Check whether the sign bit is set.
16636 SDValue SignSet = DAG.getSetCC(
16637 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16638 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16640 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16641 SDValue FudgePtr = DAG.getConstantPool(
16642 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16644 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16645 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16646 SDValue Four = DAG.getIntPtrConstant(4, dl);
16647 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16648 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16650 // Load the value out, extending it from f32 to f80.
16651 // FIXME: Avoid the extend by constructing the right constant pool?
16652 SDValue Fudge = DAG.getExtLoad(
16653 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16654 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16655 /* Alignment = */ 4);
16656 // Extend everything to 80 bits to force it to be done on x87.
16657 // TODO: Are there any fast-math-flags to propagate here?
16658 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16659 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16660 DAG.getIntPtrConstant(0, dl));
16663 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16664 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16665 // just return an <SDValue(), SDValue()> pair.
16666 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16667 // to i16, i32 or i64, and we lower it to a legal sequence.
16668 // If lowered to the final integer result we return a <result, SDValue()> pair.
16669 // Otherwise we lower it to a sequence ending with a FIST, return a
16670 // <FIST, StackSlot> pair, and the caller is responsible for loading
16671 // the final integer result from StackSlot.
16672 std::pair<SDValue,SDValue>
16673 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16674 bool IsSigned, bool IsReplace) const {
16677 EVT DstTy = Op.getValueType();
16678 EVT TheVT = Op.getOperand(0).getValueType();
16679 auto PtrVT = getPointerTy(DAG.getDataLayout());
16681 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16682 // f16 must be promoted before using the lowering in this routine.
16683 // fp128 does not use this lowering.
16684 return std::make_pair(SDValue(), SDValue());
16687 // If using FIST to compute an unsigned i64, we'll need some fixup
16688 // to handle values above the maximum signed i64. A FIST is always
16689 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16690 bool UnsignedFixup = !IsSigned &&
16691 DstTy == MVT::i64 &&
16692 (!Subtarget.is64Bit() ||
16693 !isScalarFPTypeInSSEReg(TheVT));
16695 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16696 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16697 // The low 32 bits of the fist result will have the correct uint32 result.
16698 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16702 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16703 DstTy.getSimpleVT() >= MVT::i16 &&
16704 "Unknown FP_TO_INT to lower!");
16706 // These are really Legal.
16707 if (DstTy == MVT::i32 &&
16708 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16709 return std::make_pair(SDValue(), SDValue());
16710 if (Subtarget.is64Bit() &&
16711 DstTy == MVT::i64 &&
16712 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16713 return std::make_pair(SDValue(), SDValue());
16715 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
16717 MachineFunction &MF = DAG.getMachineFunction();
16718 unsigned MemSize = DstTy.getSizeInBits()/8;
16719 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16720 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16723 switch (DstTy.getSimpleVT().SimpleTy) {
16724 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16725 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16726 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16727 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16730 SDValue Chain = DAG.getEntryNode();
16731 SDValue Value = Op.getOperand(0);
16732 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16734 if (UnsignedFixup) {
16736 // Conversion to unsigned i64 is implemented with a select,
16737 // depending on whether the source value fits in the range
16738 // of a signed i64. Let Thresh be the FP equivalent of
16739 // 0x8000000000000000ULL.
16741 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16742 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16743 // Fist-to-mem64 FistSrc
16744 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16745 // to XOR'ing the high 32 bits with Adjust.
16747 // Being a power of 2, Thresh is exactly representable in all FP formats.
16748 // For X87 we'd like to use the smallest FP type for this constant, but
16749 // for DAG type consistency we have to match the FP operand type.
16751 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16752 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16753 bool LosesInfo = false;
16754 if (TheVT == MVT::f64)
16755 // The rounding mode is irrelevant as the conversion should be exact.
16756 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16758 else if (TheVT == MVT::f80)
16759 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16760 APFloat::rmNearestTiesToEven, &LosesInfo);
16762 assert(Status == APFloat::opOK && !LosesInfo &&
16763 "FP conversion should have been exact");
16765 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16767 SDValue Cmp = DAG.getSetCC(DL,
16768 getSetCCResultType(DAG.getDataLayout(),
16769 *DAG.getContext(), TheVT),
16770 Value, ThreshVal, ISD::SETLT);
16771 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16772 DAG.getConstant(0, DL, MVT::i32),
16773 DAG.getConstant(0x80000000, DL, MVT::i32));
16774 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16775 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16776 *DAG.getContext(), TheVT),
16777 Value, ThreshVal, ISD::SETLT);
16778 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16781 // FIXME: This causes a redundant load/store if the SSE-class value is already
16782 // in memory, such as when it is on the call stack.
16783 if (isScalarFPTypeInSSEReg(TheVT)) {
16784 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16785 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16786 MachinePointerInfo::getFixedStack(MF, SSFI));
16787 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16789 Chain, StackSlot, DAG.getValueType(TheVT)
16792 MachineMemOperand *MMO =
16793 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16794 MachineMemOperand::MOLoad, MemSize, MemSize);
16795 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16796 Chain = Value.getValue(1);
16797 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16798 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16801 MachineMemOperand *MMO =
16802 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16803 MachineMemOperand::MOStore, MemSize, MemSize);
16805 if (UnsignedFixup) {
16807 // Insert the FIST, load its result as two i32's,
16808 // and XOR the high i32 with Adjust.
16810 SDValue FistOps[] = { Chain, Value, StackSlot };
16811 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16812 FistOps, DstTy, MMO);
16815 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16816 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16819 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16820 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16822 if (Subtarget.is64Bit()) {
16823 // Join High32 and Low32 into a 64-bit result.
16824 // (High32 << 32) | Low32
16825 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16826 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16827 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16828 DAG.getConstant(32, DL, MVT::i8));
16829 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16830 return std::make_pair(Result, SDValue());
16833 SDValue ResultOps[] = { Low32, High32 };
16835 SDValue pair = IsReplace
16836 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16837 : DAG.getMergeValues(ResultOps, DL);
16838 return std::make_pair(pair, SDValue());
16840 // Build the FP_TO_INT*_IN_MEM
16841 SDValue Ops[] = { Chain, Value, StackSlot };
16842 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16844 return std::make_pair(FIST, StackSlot);
16848 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16849 const X86Subtarget &Subtarget) {
16850 MVT VT = Op->getSimpleValueType(0);
16851 SDValue In = Op->getOperand(0);
16852 MVT InVT = In.getSimpleValueType();
16855 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
16856 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16857 "Expected same number of elements");
16858 assert((VT.getVectorElementType() == MVT::i16 ||
16859 VT.getVectorElementType() == MVT::i32 ||
16860 VT.getVectorElementType() == MVT::i64) &&
16861 "Unexpected element type");
16862 assert((InVT.getVectorElementType() == MVT::i8 ||
16863 InVT.getVectorElementType() == MVT::i16 ||
16864 InVT.getVectorElementType() == MVT::i32) &&
16865 "Unexpected element type");
16867 if (Subtarget.hasInt256())
16868 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16870 // Optimize vectors in AVX mode:
16873 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16874 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16875 // Concat upper and lower parts.
16878 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16879 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16880 // Concat upper and lower parts.
16883 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16884 SDValue Undef = DAG.getUNDEF(InVT);
16885 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16886 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16887 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16889 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16890 VT.getVectorNumElements()/2);
16892 OpLo = DAG.getBitcast(HVT, OpLo);
16893 OpHi = DAG.getBitcast(HVT, OpHi);
16895 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16898 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
16899 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16900 const SDLoc &dl, SelectionDAG &DAG) {
16901 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16902 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16903 DAG.getIntPtrConstant(0, dl));
16904 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16905 DAG.getIntPtrConstant(8, dl));
16906 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16907 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16908 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16909 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16912 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16913 const X86Subtarget &Subtarget,
16914 SelectionDAG &DAG) {
16915 MVT VT = Op->getSimpleValueType(0);
16916 SDValue In = Op->getOperand(0);
16917 MVT InVT = In.getSimpleValueType();
16918 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16920 unsigned NumElts = VT.getVectorNumElements();
16922 // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
16923 // avoids a constant pool load.
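// For example, a v8i1 -> v8i32 zero_extend becomes roughly:
//   s = sign_extend mask to v8i32   ; each lane is 0 or -1
//   r = srl s, 31                   ; each lane is now 0 or 1
// instead of selecting between constant-pool vectors of 1s and 0s.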
16924 if (VT.getVectorElementType() != MVT::i8) {
16925 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
16926 return DAG.getNode(ISD::SRL, DL, VT, Extend,
16927 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
16930 // Extend VT if BWI is not supported.
16931 MVT ExtVT = VT;
16932 if (!Subtarget.hasBWI()) {
16933 // If v16i32 is to be avoided, we'll need to split and concatenate.
16934 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16935 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16937 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16940 // Widen to 512-bits if VLX is not supported.
16941 MVT WideVT = ExtVT;
16942 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16943 NumElts *= 512 / ExtVT.getSizeInBits();
16944 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16945 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16946 In, DAG.getIntPtrConstant(0, DL));
16947 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16951 SDValue One = DAG.getConstant(1, DL, WideVT);
16952 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16954 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16956 // Truncate if we had to extend above.
16958 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
16959 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16962 // Extract back to 128/256-bit if we widened.
16964 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16965 DAG.getIntPtrConstant(0, DL));
16967 return SelectedVal;
16970 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16971 SelectionDAG &DAG) {
16972 SDValue In = Op.getOperand(0);
16973 MVT SVT = In.getSimpleValueType();
16975 if (SVT.getVectorElementType() == MVT::i1)
16976 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16978 assert(Subtarget.hasAVX() && "Expected AVX support");
16979 return LowerAVXExtend(Op, DAG, Subtarget);
16982 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16983 /// It makes use of the fact that vectors with enough leading sign/zero bits
16984 /// prevent the PACKSS/PACKUS from saturating the results.
16985 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16986 /// within each 128-bit lane.
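/// For example, if every i32 lane of a v8i32 value is already sign-extended
/// from 16 bits (at least 17 known sign bits), a single PACKSSDW of its two
/// 128-bit halves produces the v8i16 truncation without the saturation
/// changing any value.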
16987 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16988 const SDLoc &DL, SelectionDAG &DAG,
16989 const X86Subtarget &Subtarget) {
16990 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16991 "Unexpected PACK opcode");
16993 // Requires SSE2 but AVX512 has fast vector truncate.
16994 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
16997 EVT SrcVT = In.getValueType();
16999 // No truncation required, we might get here due to recursive calls.
17000 if (SrcVT == DstVT)
17003 // We only support vector truncation to 64bits or greater from a
17004 // 128bits or greater source.
17005 unsigned DstSizeInBits = DstVT.getSizeInBits();
17006 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
17007 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
17010 unsigned NumElems = SrcVT.getVectorNumElements();
17011 if (!isPowerOf2_32(NumElems))
17014 LLVMContext &Ctx = *DAG.getContext();
17015 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
17016 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
17018 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
17020 // Pack to the largest type possible:
17021 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
17022 EVT InVT = MVT::i16, OutVT = MVT::i8;
17023 if (SrcVT.getScalarSizeInBits() > 16 &&
17024 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
17029 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
17030 if (SrcVT.is128BitVector()) {
17031 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
17032 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
17033 In = DAG.getBitcast(InVT, In);
17034 SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
17035 Res = extractSubVector(Res, 0, DAG, DL, 64);
17036 return DAG.getBitcast(DstVT, Res);
17039 // Extract lower/upper subvectors.
17040 unsigned NumSubElts = NumElems / 2;
17041 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17042 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
17044 unsigned SubSizeInBits = SrcSizeInBits / 2;
17045 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
17046 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
17048 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
17049 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
17050 Lo = DAG.getBitcast(InVT, Lo);
17051 Hi = DAG.getBitcast(InVT, Hi);
17052 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17053 return DAG.getBitcast(DstVT, Res);
17056 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
17057 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
17058 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
17059 Lo = DAG.getBitcast(InVT, Lo);
17060 Hi = DAG.getBitcast(InVT, Hi);
17061 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
17063 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
17064 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
17065 Res = DAG.getBitcast(MVT::v4i64, Res);
17066 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
17068 if (DstVT.is256BitVector())
17069 return DAG.getBitcast(DstVT, Res);
17071 // If 512bit -> 128bit truncate another stage.
17072 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17073 Res = DAG.getBitcast(PackedVT, Res);
17074 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17077 // Recursively pack lower/upper subvectors, concat result and pack again.
17078 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
17079 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
17080 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
17081 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
17083 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
17084 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
17085 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
17088 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
17089 const X86Subtarget &Subtarget) {
17092 MVT VT = Op.getSimpleValueType();
17093 SDValue In = Op.getOperand(0);
17094 MVT InVT = In.getSimpleValueType();
17096 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
17098 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
17099 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
17100 if (InVT.getScalarSizeInBits() <= 16) {
17101 if (Subtarget.hasBWI()) {
17102 // Legal; this will be selected as VPMOVB2M or VPMOVW2M.
17103 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17104 // We need to shift to get the lsb into sign position.
17105 // Shifting packed bytes is not supported natively, so bitcast to words.
17106 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
17107 In = DAG.getNode(ISD::SHL, DL, ExtVT,
17108 DAG.getBitcast(ExtVT, In),
17109 DAG.getConstant(ShiftInx, DL, ExtVT));
17110 In = DAG.getBitcast(InVT, In);
17112 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17115 // Use TESTD/Q by extending the vector to packed dword/qword elements.
17116 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
17117 "Unexpected vector type.");
17118 unsigned NumElts = InVT.getVectorNumElements();
17119 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
17120 // We need to change to a wider element type that we have support for.
17121 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
17122 // For 16 element vectors we extend to v16i32 unless we are explicitly
17123 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
17124 // we need to split into two 8 element vectors which we can extend to v8i32,
17125 // truncate and concat the results. There's an additional complication if
17126 // the original type is v16i8. In that case we can't split the v16i8 so
17127 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
17128 // to v8i32, truncate that to v8i1 and concat the two halves.
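// Sketch of the v16i8 -> v16i1 case when 512-bit vectors are to be avoided:
//   v16i8 --sext--> v16i16 --split--> 2 x v8i16 --sext--> 2 x v8i32
//   --truncate--> 2 x v8i1 (re-entering this lowering) --concat--> v16i1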
17129 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
17130 if (InVT == MVT::v16i8) {
17131 // First we need to sign extend up to 256-bits so we can split that.
17132 InVT = MVT::v16i16;
17133 In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
17135 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
17136 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
17137 // We're split now, just emit two truncates and a concat. The two
17138 // truncates will trigger legalization to come back to this function.
17139 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
17140 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
17141 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
17143 // We either have 8 elements or we're allowed to use 512-bit vectors.
17144 // If we have VLX, we want to use the narrowest vector that can get the
17145 // job done so we use vXi32.
17146 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
17147 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
17148 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
17150 ShiftInx = InVT.getScalarSizeInBits() - 1;
17153 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
17154 // We need to shift to get the lsb into sign position.
17155 In = DAG.getNode(ISD::SHL, DL, InVT, In,
17156 DAG.getConstant(ShiftInx, DL, InVT));
17158 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
17159 if (Subtarget.hasDQI())
17160 return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT),
17162 return DAG.getSetCC(DL, VT, In, getZeroVector(InVT, Subtarget, DAG, DL),
17166 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
17168 MVT VT = Op.getSimpleValueType();
17169 SDValue In = Op.getOperand(0);
17170 MVT InVT = In.getSimpleValueType();
17171 unsigned InNumEltBits = InVT.getScalarSizeInBits();
17173 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
17174 "Invalid TRUNCATE operation");
17176 if (VT.getVectorElementType() == MVT::i1)
17177 return LowerTruncateVecI1(Op, DAG, Subtarget);
17179 // vpmovqb/w/d, vpmovdb/w, vpmovwb
17180 if (Subtarget.hasAVX512()) {
17181 // word to byte only under BWI
17182 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
17183 // Make sure we're allowed to promote 512-bits.
17184 if (Subtarget.canExtendTo512DQ())
17185 return DAG.getNode(ISD::TRUNCATE, DL, VT,
17186 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
17192 unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
17193 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
17195 // Truncate with PACKUS if we are truncating a vector with leading zero bits
17196 // that extend all the way to the packed/truncated value.
17197 // Pre-SSE41 we can only use PACKUSWB.
17198 KnownBits Known;
17199 DAG.computeKnownBits(In, Known);
17200 if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros())
17202 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
17205 // Truncate with PACKSS if we are truncating a vector with sign-bits that
17206 // extend all the way to the packed/truncated value.
17207 if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In))
17209 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
17212 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
17213 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
17214 if (Subtarget.hasInt256()) {
17215 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
17216 In = DAG.getBitcast(MVT::v8i32, In);
17217 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
17218 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
17219 DAG.getIntPtrConstant(0, DL));
17222 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17223 DAG.getIntPtrConstant(0, DL));
17224 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17225 DAG.getIntPtrConstant(2, DL));
17226 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17227 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17228 static const int ShufMask[] = {0, 2, 4, 6};
17229 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
17232 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
17233 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17234 if (Subtarget.hasInt256()) {
17235 In = DAG.getBitcast(MVT::v32i8, In);
17237 // The PSHUFB mask:
17238 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
17239 -1, -1, -1, -1, -1, -1, -1, -1,
17240 16, 17, 20, 21, 24, 25, 28, 29,
17241 -1, -1, -1, -1, -1, -1, -1, -1 };
17242 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
17243 In = DAG.getBitcast(MVT::v4i64, In);
17245 static const int ShufMask2[] = {0, 2, -1, -1};
17246 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
17247 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17248 DAG.getIntPtrConstant(0, DL));
17249 return DAG.getBitcast(VT, In);
17252 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17253 DAG.getIntPtrConstant(0, DL));
17255 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17256 DAG.getIntPtrConstant(4, DL));
17258 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
17259 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
17261 // The PSHUFB mask:
17262 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
17263 -1, -1, -1, -1, -1, -1, -1, -1};
17265 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
17266 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
17268 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17269 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17271 // The MOVLHPS Mask:
17272 static const int ShufMask2[] = {0, 1, 4, 5};
17273 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
17274 return DAG.getBitcast(MVT::v8i16, res);
17277 // Handle truncation of V256 to V128 using shuffles.
17278 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
17280 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
17282 unsigned NumElems = VT.getVectorNumElements();
17283 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
17285 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
17286 // Prepare truncation shuffle mask
17287 for (unsigned i = 0; i != NumElems; ++i)
17288 MaskVec[i] = i * 2;
17289 In = DAG.getBitcast(NVT, In);
17290 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
17291 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
17292 DAG.getIntPtrConstant(0, DL));
17295 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
17296 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
17297 MVT VT = Op.getSimpleValueType();
17299 if (VT.isVector()) {
17300 SDValue Src = Op.getOperand(0);
17303 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
17304 MVT ResVT = MVT::v4i32;
17305 MVT TruncVT = MVT::v4i1;
17306 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
17307 if (!IsSigned && !Subtarget.hasVLX()) {
17308 // Widen to 512-bits.
17309 ResVT = MVT::v8i32;
17310 TruncVT = MVT::v8i1;
17311 Opc = ISD::FP_TO_UINT;
17312 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
17313 DAG.getUNDEF(MVT::v8f64),
17314 Src, DAG.getIntPtrConstant(0, dl));
17316 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
17317 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
17318 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
17319 DAG.getIntPtrConstant(0, dl));
17322 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
17323 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
17324 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
17325 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
17326 DAG.getUNDEF(MVT::v2f32)));
17332 assert(!VT.isVector());
17334 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
17335 IsSigned, /*IsReplace=*/ false);
17336 SDValue FIST = Vals.first, StackSlot = Vals.second;
17337 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
17338 if (!FIST.getNode())
17341 if (StackSlot.getNode())
17342 // Load the result.
17343 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
17345 // The node is the result.
17349 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
17351 MVT VT = Op.getSimpleValueType();
17352 SDValue In = Op.getOperand(0);
17353 MVT SVT = In.getSimpleValueType();
17355 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
17357 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
17358 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
17359 In, DAG.getUNDEF(SVT)));
17362 /// The only differences between FABS and FNEG are the mask and the logic op.
17363 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
17364 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
17365 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
17366 "Wrong opcode for lowering FABS or FNEG.");
17368 bool IsFABS = (Op.getOpcode() == ISD::FABS);
17370 // If this is a FABS and it has an FNEG user, bail out to fold the combination
17371 // into an FNABS. We'll lower the FABS after that if it is still in use.
17373 for (SDNode *User : Op->uses())
17374 if (User->getOpcode() == ISD::FNEG)
17378 MVT VT = Op.getSimpleValueType();
17380 bool IsF128 = (VT == MVT::f128);
17382 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
17383 // decide if we should generate a 16-byte constant mask when we only need 4 or
17384 // 8 bytes for the scalar case.
17389 if (VT.isVector()) {
17391 EltVT = VT.getVectorElementType();
17392 } else if (IsF128) {
17393 // SSE instructions are used for optimized f128 logical operations.
17394 LogicVT = MVT::f128;
17397 // There are no scalar bitwise logical SSE/AVX instructions, so we
17398 // generate a 16-byte vector constant and logic op even for the scalar case.
17399 // Using a 16-byte mask allows folding the load of the mask with
17400 // the logic op, so it can save (~4 bytes) on code size.
17401 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17405 unsigned EltBits = EltVT.getSizeInBits();
17406 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
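// For f32 the masks below amount to, roughly:
//   fabs(x)  = bitcast(bits(x) & 0x7fffffff)
//   fneg(x)  = bitcast(bits(x) ^ 0x80000000)
//   fnabs(x) = bitcast(bits(x) | 0x80000000)   (the FNEG(FABS(x)) fold)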
17407 APInt MaskElt =
17408 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
17409 const fltSemantics &Sem =
17410 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
17411 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17412 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
17414 SDValue Op0 = Op.getOperand(0);
17415 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
17417 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
17418 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
17420 if (VT.isVector() || IsF128)
17421 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17423 // For the scalar case extend to a 128-bit vector, perform the logic op,
17424 // and extract the scalar result back out.
17425 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
17426 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17427 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
17428 DAG.getIntPtrConstant(0, dl));
17431 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
17432 SDValue Mag = Op.getOperand(0);
17433 SDValue Sign = Op.getOperand(1);
17436 // If the sign operand is smaller, extend it first.
17437 MVT VT = Op.getSimpleValueType();
17438 if (Sign.getSimpleValueType().bitsLT(VT))
17439 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
17441 // And if it is bigger, shrink it first.
17442 if (Sign.getSimpleValueType().bitsGT(VT))
17443 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
17445 // At this point the operands and the result should have the same
17446 // type, and that won't be f80 since that is not custom lowered.
17447 bool IsF128 = (VT == MVT::f128);
17448 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
17449 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
17450 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
17451 "Unexpected type in LowerFCOPYSIGN");
17453 MVT EltVT = VT.getScalarType();
17454 const fltSemantics &Sem =
17455 EltVT == MVT::f64 ? APFloat::IEEEdouble()
17456 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17458 // Perform all scalar logic operations as 16-byte vectors because there are no
17459 // scalar FP logic instructions in SSE.
17460 // TODO: This isn't necessary. If we used scalar types, we might avoid some
17461 // unnecessary splats, but we might miss load folding opportunities. Should
17462 // this decision be based on OptimizeForSize?
17463 bool IsFakeVector = !VT.isVector() && !IsF128;
17466 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17468 // The mask constants are automatically splatted for vector types.
17469 unsigned EltSizeInBits = VT.getScalarSizeInBits();
17470 SDValue SignMask = DAG.getConstantFP(
17471 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17472 SDValue MagMask = DAG.getConstantFP(
17473 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
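// In scalar f32 terms the sequence below computes, roughly:
//   copysign(mag, sign) = bitcast((bits(mag) & 0x7fffffff) |
//                                 (bits(sign) & 0x80000000))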
17475 // First, clear all bits but the sign bit from the second operand (sign).
17477 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17478 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17480 // Next, clear the sign bit from the first operand (magnitude).
17481 // TODO: If we had general constant folding for FP logic ops, this check
17482 // wouldn't be necessary.
17484 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
17485 APFloat APF = Op0CN->getValueAPF();
17487 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
17489 // If the magnitude operand wasn't a constant, we need to AND out the sign.
17491 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
17492 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
17495 // OR the magnitude value with the sign bit.
17496 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
17497 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
17498 DAG.getIntPtrConstant(0, dl));
17501 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
17502 SDValue N0 = Op.getOperand(0);
17504 MVT VT = Op.getSimpleValueType();
17506 MVT OpVT = N0.getSimpleValueType();
17507 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17508 "Unexpected type for FGETSIGN");
17510 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17511 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17512 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17513 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
17514 Res = DAG.getZExtOrTrunc(Res, dl, VT);
17515 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
17519 /// Helper for creating a X86ISD::SETCC node.
17520 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17521 SelectionDAG &DAG) {
17522 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17523 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17526 // Check whether an OR'd tree is PTEST-able.
17527 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17528 const X86Subtarget &Subtarget,
17529 SelectionDAG &DAG) {
17530 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
17532 if (!Subtarget.hasSSE41())
17535 if (!Op->hasOneUse())
17538 SDNode *N = Op.getNode();
17541 SmallVector<SDValue, 8> Opnds;
17542 DenseMap<SDValue, unsigned> VecInMap;
17543 SmallVector<SDValue, 8> VecIns;
17544 EVT VT = MVT::Other;
17546 // Recognize a special case where a vector is cast into a wide integer to test all 0s.
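// For example, in the 128-bit case the all-zeros test
//   (or (extractelt v2i64 V, 0), (extractelt V, 1)) seteq 0
// can be emitted as PTEST V, V followed by SETE, since PTEST sets ZF when
// the AND of its operands is zero.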
17548 Opnds.push_back(N->getOperand(0));
17549 Opnds.push_back(N->getOperand(1));
17551 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17552 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17553 // BFS traverse all OR'd operands.
17554 if (I->getOpcode() == ISD::OR) {
17555 Opnds.push_back(I->getOperand(0));
17556 Opnds.push_back(I->getOperand(1));
17557 // Re-evaluate the number of nodes to be traversed.
17558 e += 2; // 2 more nodes (LHS and RHS) are pushed.
17562 // Quit if this is not an EXTRACT_VECTOR_ELT.
17563 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17566 // Quit if the index is not a constant.
17567 SDValue Idx = I->getOperand(1);
17568 if (!isa<ConstantSDNode>(Idx))
17571 SDValue ExtractedFromVec = I->getOperand(0);
17572 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17573 if (M == VecInMap.end()) {
17574 VT = ExtractedFromVec.getValueType();
17575 // Quit if not 128/256-bit vector.
17576 if (!VT.is128BitVector() && !VT.is256BitVector())
17578 // Quit if not the same type.
17579 if (VecInMap.begin() != VecInMap.end() &&
17580 VT != VecInMap.begin()->first.getValueType())
17582 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17583 VecIns.push_back(ExtractedFromVec);
17585 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17588 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17589 "Not extracted from 128-/256-bit vector.");
17591 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17593 for (DenseMap<SDValue, unsigned>::const_iterator
17594 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17595 // Quit if not all elements are used.
17596 if (I->second != FullMask)
17600 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17602 // Cast all vectors into TestVT for PTEST.
17603 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17604 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17606 // If more than one full vector is evaluated, OR them first before PTEST.
17607 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17608 // Each iteration will OR 2 nodes and append the result until there is only
17609 // 1 node left, i.e. the final OR'd value of all vectors.
17610 SDValue LHS = VecIns[Slot];
17611 SDValue RHS = VecIns[Slot + 1];
17612 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
17615 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17616 VecIns.back(), VecIns.back());
17617 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17620 /// Return true if \c Op has a use that doesn't just read flags.
17621 static bool hasNonFlagsUse(SDValue Op) {
17622 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17624 SDNode *User = *UI;
17625 unsigned UOpNo = UI.getOperandNo();
17626 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
17627 // Look past the truncate.
17628 UOpNo = User->use_begin().getOperandNo();
17629 User = *User->use_begin();
17632 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17633 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
17639 /// Emit nodes that will be selected as "test Op0,Op0", or something
17641 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17642 SelectionDAG &DAG) const {
17643 // CF and OF aren't always set the way we want. Determine which
17644 // of these we need.
17645 bool NeedCF = false;
17646 bool NeedOF = false;
17649 case X86::COND_A: case X86::COND_AE:
17650 case X86::COND_B: case X86::COND_BE:
17653 case X86::COND_G: case X86::COND_GE:
17654 case X86::COND_L: case X86::COND_LE:
17655 case X86::COND_O: case X86::COND_NO: {
17656 // Check if we really need to set the
17657 // Overflow flag. If NoSignedWrap is present,
17658 // it is not actually needed.
17659 switch (Op->getOpcode()) {
17664 if (Op.getNode()->getFlags().hasNoSignedWrap())
17674 // See if we can use the EFLAGS value from the operand instead of
17675 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17676 // we prove that the arithmetic won't overflow, we can't use OF or CF.
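// For example, for (seteq (sub a, b), 0) the SUB itself already provides ZF:
//   subl %esi, %edi
//   je   <target>
// so no separate "testl %edi, %edi" is needed.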
17677 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17678 // Emit a CMP with 0, which is the TEST pattern.
17679 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17680 DAG.getConstant(0, dl, Op.getValueType()));
17682 unsigned Opcode = 0;
17683 unsigned NumOperands = 0;
17685 // Truncate operations may prevent the merge of the SETCC instruction
17686 // and the arithmetic instruction before it. Attempt to truncate the operands
17687 // of the arithmetic instruction and use a reduced bit-width instruction.
17688 bool NeedTruncation = false;
17689 SDValue ArithOp = Op;
17690 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17691 SDValue Arith = Op->getOperand(0);
17692 // Both the trunc and the arithmetic op need to have one user each.
17693 if (Arith->hasOneUse())
17694 switch (Arith.getOpcode()) {
17701 NeedTruncation = true;
17707 // Sometimes flags can be set either with an AND or with an SRL/SHL
17708 // instruction. The SRL/SHL variant should be preferred for masks longer than this number of bits.
17710 const int ShiftToAndMaxMaskWidth = 32;
17711 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17713 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
17714 // which may be the result of a CAST. We use the variable 'Op', which is the
17715 // non-casted variable when we check for possible users.
17716 switch (ArithOp.getOpcode()) {
17718 // We only want to rewrite this as a target-specific node with attached
17719 // flags if there is a reasonable chance of either using that to do custom
17720 // instructions selection that can fold some of the memory operands, or if
17721 // only the flags are used. If there are other uses, leave the node alone
17722 // and emit a test instruction.
17723 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17724 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17725 if (UI->getOpcode() != ISD::CopyToReg &&
17726 UI->getOpcode() != ISD::SETCC &&
17727 UI->getOpcode() != ISD::STORE)
17730 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17731 // An add of one will be selected as an INC.
17733 (!Subtarget.slowIncDec() ||
17734 DAG.getMachineFunction().getFunction().optForSize())) {
17735 Opcode = X86ISD::INC;
17740 // An add of negative one (subtract of one) will be selected as a DEC.
17741 if (C->isAllOnesValue() &&
17742 (!Subtarget.slowIncDec() ||
17743 DAG.getMachineFunction().getFunction().optForSize())) {
17744 Opcode = X86ISD::DEC;
17750 // Otherwise use a regular EFLAGS-setting add.
17751 Opcode = X86ISD::ADD;
17756 // If we have a constant logical shift that's only used in a comparison
17757 // against zero turn it into an equivalent AND. This allows turning it into
17758 // a TEST instruction later.
17759 if (ZeroCheck && Op->hasOneUse() &&
17760 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17761 EVT VT = Op.getValueType();
17762 unsigned BitWidth = VT.getSizeInBits();
17763 unsigned ShAmt = Op->getConstantOperandVal(1);
17764 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17766 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17767 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17768 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17769 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17771 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17772 DAG.getConstant(Mask, dl, VT));
17777 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17778 // because a TEST instruction will be better. However, AND should be
17779 // preferred if the instruction can be combined into ANDN.
17780 if (!hasNonFlagsUse(Op)) {
17781 SDValue Op0 = ArithOp->getOperand(0);
17782 SDValue Op1 = ArithOp->getOperand(1);
17783 EVT VT = ArithOp.getValueType();
17784 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17785 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17786 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17788 // If we cannot select an ANDN instruction, check if we can replace
17789 // AND+IMM64 with a shift before giving up. This is possible for masks
17790 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
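// For example, when only ZF is of interest:
//   (x & 0xFF000000)         == 0  <=>  (x >> 24) == 0
//   (x & 0x00FFFFFF)         == 0  <=>  (x << 8)  == 0
//   (x & 0xFFFFFFFF00000000) == 0  <=>  (x >> 32) == 0  (no 64-bit immediate)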
17791 if (!isProperAndn) {
17795 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17796 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17800 const APInt &Mask = CN->getAPIntValue();
17801 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17802 break; // Prefer TEST instruction.
17804 unsigned BitWidth = Mask.getBitWidth();
17805 unsigned LeadingOnes = Mask.countLeadingOnes();
17806 unsigned TrailingZeros = Mask.countTrailingZeros();
17808 if (LeadingOnes + TrailingZeros == BitWidth) {
17809 assert(TrailingZeros < VT.getSizeInBits() &&
17810 "Shift amount should be less than the type width");
17811 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17812 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17813 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17817 unsigned LeadingZeros = Mask.countLeadingZeros();
17818 unsigned TrailingOnes = Mask.countTrailingOnes();
17820 if (LeadingZeros + TrailingOnes == BitWidth) {
17821 assert(LeadingZeros < VT.getSizeInBits() &&
17822 "Shift amount should be less than the type width");
17823 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17824 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17825 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17836 // Similar to ISD::ADD above, check if the uses will preclude useful
17837 // lowering of the target-specific node.
17838 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17839 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17840 if (UI->getOpcode() != ISD::CopyToReg &&
17841 UI->getOpcode() != ISD::SETCC &&
17842 UI->getOpcode() != ISD::STORE)
17845 // Otherwise use a regular EFLAGS-setting instruction.
17846 switch (ArithOp.getOpcode()) {
17847 default: llvm_unreachable("unexpected operator!");
17848 case ISD::SUB: Opcode = X86ISD::SUB; break;
17849 case ISD::XOR: Opcode = X86ISD::XOR; break;
17850 case ISD::AND: Opcode = X86ISD::AND; break;
17851 case ISD::OR: Opcode = X86ISD::OR; break;
17863 return SDValue(Op.getNode(), 1);
17869 // If we found that truncation is beneficial, perform the truncation and update its users.
17871 if (NeedTruncation) {
17872 EVT VT = Op.getValueType();
17873 SDValue WideVal = Op->getOperand(0);
17874 EVT WideVT = WideVal.getValueType();
17875 unsigned ConvertedOp = 0;
17876 // Use a target machine opcode to prevent further DAGCombine
17877 // optimizations that may separate the arithmetic operations
17878 // from the setcc node.
17879 switch (WideVal.getOpcode()) {
17881 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17882 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17883 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17884 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17885 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17889 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17890 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17891 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17892 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17893 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17894 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17900 // Emit a CMP with 0, which is the TEST pattern.
17901 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17902 DAG.getConstant(0, dl, Op.getValueType()));
17904 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17905 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17907 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17908 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17909 return SDValue(New.getNode(), 1);
17912 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
17914 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17915 const SDLoc &dl, SelectionDAG &DAG) const {
17916 if (isNullConstant(Op1))
17917 return EmitTest(Op0, X86CC, dl, DAG);
17919 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17920 "Unexpected comparison operation for MVT::i1 operands");
17922 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17923 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17924 // Only promote the compare up to i32 if it is a 16-bit operation
17925 // with an immediate; 16-bit immediates are to be avoided.
17926 if ((Op0.getValueType() == MVT::i16 &&
17927 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17928 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17929 !Subtarget.isAtom()) {
17930 unsigned ExtendOp =
17931 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17932 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17933 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17935 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17936 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17937 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17938 return SDValue(Sub.getNode(), 1);
17940 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17943 /// Convert a comparison if required by the subtarget.
17944 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17945 SelectionDAG &DAG) const {
17946 // If the subtarget does not support the FUCOMI instruction, floating-point
17947 // comparisons have to be converted.
17948 if (Subtarget.hasCMov() ||
17949 Cmp.getOpcode() != X86ISD::CMP ||
17950 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17951 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17954 // The instruction selector will select an FUCOM instruction instead of
17955 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17956 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17957 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
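// The selected sequence is roughly:
//   fucomp              ; compare, result lands in FPSW (C0/C2/C3)
//   fnstsw  %ax         ; copy FPSW into AX
//   sahf                ; move AH into EFLAGS (CF/PF/ZF)
// after which the usual integer condition codes apply.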
17959 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17960 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17961 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17962 DAG.getConstant(8, dl, MVT::i8));
17963 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17965 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17966 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17967 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17970 /// Check if replacement of SQRT with RSQRT should be disabled.
17971 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17972 EVT VT = Op.getValueType();
17974 // We never want to use both SQRT and RSQRT instructions for the same input.
17975 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17979 return Subtarget.hasFastVectorFSQRT();
17980 return Subtarget.hasFastScalarFSQRT();
17983 /// The minimum architected relative accuracy is 2^-12. We need one
17984 /// Newton-Raphson step to have a good float result (24 bits of precision).
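/// A single Newton-Raphson step for 1/sqrt(a) is
///   x1 = x0 * (1.5 - 0.5 * a * x0 * x0)
/// where x0 is the hardware RSQRT estimate.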
17985 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17986 SelectionDAG &DAG, int Enabled,
17987 int &RefinementSteps,
17988 bool &UseOneConstNR,
17989 bool Reciprocal) const {
17990 EVT VT = Op.getValueType();
17992 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17993 // It is likely not profitable to do this for f64 because a double-precision
17994 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17995 // instructions: convert to single, rsqrtss, convert back to double, refine
17996 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17997 // along with FMA, this could be a throughput win.
17998 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17999 // after legalize types.
18000 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18001 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
18002 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
18003 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18004 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18005 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18006 RefinementSteps = 1;
18008 UseOneConstNR = false;
18009 // There is no FSQRT for 512-bits, but there is RSQRT14.
18010 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT;
18011 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18016 /// The minimum architected relative accuracy is 2^-12. We need one
18017 /// Newton-Raphson step to have a good float result (24 bits of precision).
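/// A single Newton-Raphson step for 1/a is
///   x1 = x0 * (2.0 - a * x0)
/// where x0 is the hardware RCP estimate.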
18018 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
18020 int &RefinementSteps) const {
18021 EVT VT = Op.getValueType();
18023 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
18024 // It is likely not profitable to do this for f64 because a double-precision
18025 // reciprocal estimate with refinement on x86 prior to FMA requires
18026 // 15 instructions: convert to single, rcpss, convert back to double, refine
18027 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
18028 // along with FMA, this could be a throughput win.
18030 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
18031 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
18032 (VT == MVT::v8f32 && Subtarget.hasAVX()) ||
18033 (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) {
18034 // Enable estimate codegen with 1 refinement step for vector division.
18035 // Scalar division estimates are disabled because they break too much
18036 // real-world code. These defaults are intended to match GCC behavior.
18037 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
18040 if (RefinementSteps == ReciprocalEstimate::Unspecified)
18041 RefinementSteps = 1;
18043 // There is no FSQRT for 512-bits, but there is RCP14.
18044 unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP;
18045 return DAG.getNode(Opcode, SDLoc(Op), VT, Op);
18050 /// If we have at least two divisions that use the same divisor, convert to
18051 /// multiplication by a reciprocal. This may need to be adjusted for a given
18052 /// CPU if a division's cost is not at least twice the cost of a multiplication.
18053 /// This is because we still need one division to calculate the reciprocal and
18054 /// then we need two multiplies by that reciprocal as replacements for the
18055 /// original divisions.
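/// For example:
///   a / d; b / d   ==>   t = 1.0 / d; a * t; b * t
/// i.e. one division plus two multiplies instead of two divisions.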
18056 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
18060 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
18061 /// according to equal/not-equal condition code \p CC.
18062 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
18063 const SDLoc &dl, SelectionDAG &DAG) {
18064 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
18065 // instruction. Since the shift amount is in-range-or-undefined, we know
18066 // that doing a bittest on the i32 value is ok. We extend to i32 because
18067 // the encoding for the i16 version is larger than the i32 version.
18068 // Also promote i16 to i32 for performance / code size reason.
18069 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
18070 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
18072 // See if we can use the 32-bit instruction instead of the 64-bit one for a
18073 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
18074 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
18075 // known to be zero.
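// For example, with BitNo == 33 a 64-bit BT tests bit 33, while a 32-bit BT
// would test bit 33 % 32 == 1, so the narrowing is only safe when bit 5 of
// BitNo (value 32) is known to be zero.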
18076 if (Src.getValueType() == MVT::i64 &&
18077 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
18078 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
18080 // If the operand types disagree, extend the shift amount to match. Since
18081 // BT ignores high bits (like shifts) we can use anyextend.
18082 if (Src.getValueType() != BitNo.getValueType())
18083 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
18085 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
18086 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
18087 return getSETCC(Cond, BT, dl , DAG);
18090 /// Result of 'and' is compared against zero. Change to a BT node if possible.
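/// For example, both (x & (1 << n)) != 0 and ((x >> n) & 1) != 0 become
/// BT x, n with the condition taken from CF.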
18091 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
18092 const SDLoc &dl, SelectionDAG &DAG) {
18093 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
18094 SDValue Op0 = And.getOperand(0);
18095 SDValue Op1 = And.getOperand(1);
18096 if (Op0.getOpcode() == ISD::TRUNCATE)
18097 Op0 = Op0.getOperand(0);
18098 if (Op1.getOpcode() == ISD::TRUNCATE)
18099 Op1 = Op1.getOperand(0);
18102 if (Op1.getOpcode() == ISD::SHL)
18103 std::swap(Op0, Op1);
18104 if (Op0.getOpcode() == ISD::SHL) {
18105 if (isOneConstant(Op0.getOperand(0))) {
18106 // If we looked past a truncate, check that it's only truncating away known zeros.
18108 unsigned BitWidth = Op0.getValueSizeInBits();
18109 unsigned AndBitWidth = And.getValueSizeInBits();
18110 if (BitWidth > AndBitWidth) {
18111 KnownBits Known;
18112 DAG.computeKnownBits(Op0, Known);
18113 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
18117 RHS = Op0.getOperand(1);
18119 } else if (Op1.getOpcode() == ISD::Constant) {
18120 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
18121 uint64_t AndRHSVal = AndRHS->getZExtValue();
18122 SDValue AndLHS = Op0;
18124 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
18125 LHS = AndLHS.getOperand(0);
18126 RHS = AndLHS.getOperand(1);
18128 // Use BT if the immediate can't be encoded in a TEST instruction or we
18129 // are optimizing for size and the immediate won't fit in a byte.
18130 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
18131 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
18132 isPowerOf2_64(AndRHSVal)) {
18134 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
18140 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
18145 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
18147 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
18152 // SSE Condition code mapping:
18153 //  0 - EQ
18154 //  1 - LT
18155 //  2 - LE
18156 //  3 - UNORD
18157 //  4 - NEQ
18158 //  5 - NLT
18159 //  6 - NLE
18160 //  7 - ORD
18161 switch (SetCCOpcode) {
18162 default: llvm_unreachable("Unexpected SETCC condition");
18164 case ISD::SETEQ: SSECC = 0; break;
18166 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
18168 case ISD::SETOLT: SSECC = 1; break;
18170 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
18172 case ISD::SETOLE: SSECC = 2; break;
18173 case ISD::SETUO: SSECC = 3; break;
18175 case ISD::SETNE: SSECC = 4; break;
18176 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
18177 case ISD::SETUGE: SSECC = 5; break;
18178 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
18179 case ISD::SETUGT: SSECC = 6; break;
18180 case ISD::SETO: SSECC = 7; break;
18181 case ISD::SETUEQ: SSECC = 8; break;
18182 case ISD::SETONE: SSECC = 12; break;
18185 std::swap(Op0, Op1);
18190 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
18191 /// concatenate the result back.
18192 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
18193 MVT VT = Op.getSimpleValueType();
18195 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
18196 "Unsupported value type for operation");
18198 unsigned NumElems = VT.getVectorNumElements();
18200 SDValue CC = Op.getOperand(2);
18202 // Extract the LHS vectors
18203 SDValue LHS = Op.getOperand(0);
18204 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
18205 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
18207 // Extract the RHS vectors
18208 SDValue RHS = Op.getOperand(1);
18209 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
18210 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
18212 // Issue the operation on the smaller types and concatenate the result back
18213 MVT EltVT = VT.getVectorElementType();
18214 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
18215 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
18216 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
18217 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
18220 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
18222 SDValue Op0 = Op.getOperand(0);
18223 SDValue Op1 = Op.getOperand(1);
18224 SDValue CC = Op.getOperand(2);
18225 MVT VT = Op.getSimpleValueType();
18228 assert(VT.getVectorElementType() == MVT::i1 &&
18229 "Cannot set masked compare for this operation");
18231 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
18233 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
18234 // This helps with vptestm matching.
18235 // TODO: Should we just canonicalize the setcc during DAG combine?
18236 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
18237 ISD::isBuildVectorAllZeros(Op0.getNode()))
18238 std::swap(Op0, Op1);
18240 // Prefer SETGT over SETLT.
18241 if (SetCCOpcode == ISD::SETLT) {
18242 SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode);
18243 std::swap(Op0, Op1);
18246 return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode);
18249 /// Try to turn a VSETULT into a VSETULE by modifying its second
18250 /// operand \p Op1. If non-trivial (for example because it's not constant)
18251 /// return an empty value.
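/// For example, (x <u 4) becomes (x <=u 3); elements equal to 0 cannot be
/// decremented, so that case bails out (see the underflow check below).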
18252 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
18253 SelectionDAG &DAG) {
18254 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
18258 MVT VT = Op1.getSimpleValueType();
18259 MVT EVT = VT.getVectorElementType();
18260 unsigned n = VT.getVectorNumElements();
18261 SmallVector<SDValue, 8> ULTOp1;
18263 for (unsigned i = 0; i < n; ++i) {
18264 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
18265 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
18268 // Avoid underflow.
18269 APInt Val = Elt->getAPIntValue();
18273 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
18276 return DAG.getBuildVector(VT, dl, ULTOp1);
18279 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
18281 /// t = psubus Op0, Op1
18282 /// pcmpeq t, <0..0>
18283 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
18284 ISD::CondCode Cond, const SDLoc &dl,
18285 const X86Subtarget &Subtarget,
18286 SelectionDAG &DAG) {
18287 if (!Subtarget.hasSSE2())
18290 MVT VET = VT.getVectorElementType();
18291 if (VET != MVT::i8 && VET != MVT::i16)
18297 case ISD::SETULT: {
18298 // If the comparison is against a constant we can turn this into a
18299 // setule. With psubus, setule does not require a swap. This is
18300 // beneficial because the constant in the register is no longer
18301 // clobbered as the destination, so it can be hoisted out of a loop.
18302 // Only do this pre-AVX since vpcmp* is no longer destructive.
18303 if (Subtarget.hasAVX())
18305 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
18311 // Psubus is better than flip-sign because it requires no inversion.
18313 std::swap(Op0, Op1);
18319 SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
18320 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18321 getZeroVector(VT, Subtarget, DAG, dl));
18324 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18325 SelectionDAG &DAG) {
18326 SDValue Op0 = Op.getOperand(0);
18327 SDValue Op1 = Op.getOperand(1);
18328 SDValue CC = Op.getOperand(2);
18329 MVT VT = Op.getSimpleValueType();
18330 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18331 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
18336 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18337 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
18341 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18342 assert(VT.getVectorNumElements() <= 16);
18343 Opc = X86ISD::CMPM;
18345 Opc = X86ISD::CMPP;
18346 // The SSE/AVX packed FP comparison nodes are defined with a
18347 // floating-point vector result that matches the operand type. This allows
18348 // them to work with an SSE1 target (integer vector types are not legal).
18349 VT = Op0.getSimpleValueType();
18352 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18353 // emit two comparisons and a logic op to tie them together.
18355 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18356 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18357 // LLVM predicate is SETUEQ or SETONE.
18359 unsigned CombineOpc;
18360 if (Cond == ISD::SETUEQ) {
18363 CombineOpc = X86ISD::FOR;
18365 assert(Cond == ISD::SETONE);
18368 CombineOpc = X86ISD::FAND;
18371 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18372 DAG.getConstant(CC0, dl, MVT::i8));
18373 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18374 DAG.getConstant(CC1, dl, MVT::i8));
18375 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18377 // Handle all other FP comparisons here.
18378 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18379 DAG.getConstant(SSECC, dl, MVT::i8));
18382 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18383 // result type of SETCC. The bitcast is expected to be optimized away
18384 // during combining/isel.
18385 if (Opc == X86ISD::CMPP)
18386 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
18391 MVT VTOp0 = Op0.getSimpleValueType();
18392 assert(VTOp0 == Op1.getSimpleValueType() &&
18393 "Expected operands with same type!");
18394 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18395 "Invalid number of packed elements for source and destination!");
18397 // This is being called by type legalization because v2i32 is marked custom
18398 // for result type legalization for v2f32.
18399 if (VTOp0 == MVT::v2i32)
18402 // The non-AVX512 code below works under the assumption that source and
18403 // destination types are the same.
18404 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18405 "Value types for source and destination must be the same!");
18407 // Break 256-bit integer vector compare into smaller ones.
18408 if (VT.is256BitVector() && !Subtarget.hasInt256())
18409 return Lower256IntVSETCC(Op, DAG);
18411 // The result is boolean, but operands are int/float
18412 if (VT.getVectorElementType() == MVT::i1) {
18413 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
18414 // but there is no compare instruction for i8 and i16 elements in KNL.
18415 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
18416 "Unexpected operand type");
18417 return LowerIntVSETCC_AVX512(Op, DAG);
18420 // Lower using XOP integer comparisons.
18421 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18422 // Translate compare code to XOP PCOM compare mode.
18423 unsigned CmpMode = 0;
18425 default: llvm_unreachable("Unexpected SETCC condition");
18427 case ISD::SETLT: CmpMode = 0x00; break;
18429 case ISD::SETLE: CmpMode = 0x01; break;
18431 case ISD::SETGT: CmpMode = 0x02; break;
18433 case ISD::SETGE: CmpMode = 0x03; break;
18434 case ISD::SETEQ: CmpMode = 0x04; break;
18435 case ISD::SETNE: CmpMode = 0x05; break;
18438 // Are we comparing unsigned or signed integers?
18440 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
18442 return DAG.getNode(Opc, dl, VT, Op0, Op1,
18443 DAG.getConstant(CmpMode, dl, MVT::i8));
18446 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18447 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
18448 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18449 SDValue BC0 = peekThroughBitcasts(Op0);
18450 if (BC0.getOpcode() == ISD::AND) {
18452 SmallVector<APInt, 64> EltBits;
18453 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18454 VT.getScalarSizeInBits(), UndefElts,
18455 EltBits, false, false)) {
18456 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
18458 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
18464 // If this is a SETNE against the signed minimum value, change it to SETGT.
18465 // If this is a SETNE against the signed maximum value, change it to SETLT,
18466 // which will then be swapped to SETGT.
18467 // Otherwise we use PCMPEQ+invert.
18469 if (Cond == ISD::SETNE &&
18470 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
18471 if (ConstValue.isMinSignedValue())
18473 else if (ConstValue.isMaxSignedValue())
18477 // If both operands are known non-negative, then an unsigned compare is the
18478 // same as a signed compare and there's no need to flip signbits.
18479 // TODO: We could check for more general simplifications here since we're
18480 // computing known bits.
18481 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18482 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18484 // Special case: Use min/max operations for unsigned compares. We only want
18485 // to do this for unsigned compares if we need to flip signs or if it allows
18486 // us to avoid an invert.
18487 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18488 if (ISD::isUnsignedIntSetCC(Cond) &&
18489 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18490 TLI.isOperationLegal(ISD::UMIN, VT)) {
18491 bool Invert = false;
18494 default: llvm_unreachable("Unexpected condition code");
18495 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
18496 case ISD::SETULE: Opc = ISD::UMIN; break;
18497 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
18498 case ISD::SETUGE: Opc = ISD::UMAX; break;
18501 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18502 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
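// umin(x, y) == x is equivalent to x <=u y (and umax gives x >=u y); the
// Invert flag set above handles the strict forms SETUGT/SETULT.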
18504 // If the logical-not of the result is required, perform that now.
18506 Result = DAG.getNOT(dl, Result, VT);
18511 // Try to use SUBUS and PCMPEQ.
18512 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
18515 // We are handling one of the integer comparisons here. Since SSE only has
18516 // GT and EQ comparisons for integers, swapping operands and multiple
18517 // operations may be required for some comparisons.
18518 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
18520 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18521 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18522 bool Invert = Cond == ISD::SETNE ||
18523 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
18526 std::swap(Op0, Op1);
18528 // Check that the operation in question is available (most are plain SSE2,
18529 // but PCMPGTQ and PCMPEQQ have different requirements).
18530 if (VT == MVT::v2i64) {
18531 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18532 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18534 // First cast everything to the right type.
18535 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18536 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18538 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18539 // bits of the inputs before performing those operations. The lower
18540 // compare is always unsigned.
18543 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
18545 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
18546 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
18547 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
18549 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18550 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18552 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18553 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18554 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18556 // Create masks for only the low parts/high parts of the 64 bit integers.
18557 static const int MaskHi[] = { 1, 1, 3, 3 };
18558 static const int MaskLo[] = { 0, 0, 2, 2 };
18559 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18560 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18561 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
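// The shuffles broadcast each 64-bit lane's high/low dword compare result into
// both dwords of that lane, so the AND/OR below combine the results lane-wise.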
18563 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18564 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18567 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18569 return DAG.getBitcast(VT, Result);
18572 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18573 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
18574 // pcmpeqd + pshufd + pand.
18575 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18577 // First cast everything to the right type.
18578 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18579 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18582 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18584 // Make sure the lower and upper halves are both all-ones.
18585 static const int Mask[] = { 1, 0, 3, 2 };
18586 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18587 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18590 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18592 return DAG.getBitcast(VT, Result);
18596 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18597 // bits of the inputs before performing those operations.
18599 MVT EltVT = VT.getVectorElementType();
18600 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18602 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18603 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
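// XORing with the sign mask maps unsigned order onto signed order:
// (x ^ SignMask) <s (y ^ SignMask) iff x <u y.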
18606 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18608 // If the logical-not of the result is required, perform that now.
18610 Result = DAG.getNOT(dl, Result, VT);
18615 // Try to select this as a KTEST+SETCC if possible.
18616 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18617 const SDLoc &dl, SelectionDAG &DAG,
18618 const X86Subtarget &Subtarget) {
18619 // Only support equality comparisons.
18620 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18623 // Must be a bitcast from vXi1.
18624 if (Op0.getOpcode() != ISD::BITCAST)
18627 Op0 = Op0.getOperand(0);
18628 MVT VT = Op0.getSimpleValueType();
18629 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18630 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
18631 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18634 X86::CondCode X86CC;
18635 if (isNullConstant(Op1)) {
18636 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18637 } else if (isAllOnesConstant(Op1)) {
18638 // C flag is set for all ones.
18639 X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
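// KORTEST sets ZF when the OR of the masks is all zeros and CF when it is all
// ones, which is why the all-ones compare uses the B/AE conditions.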
18643 // If the input is an OR, we can combine its operands into the KORTEST.
18646 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18647 LHS = Op0.getOperand(0);
18648 RHS = Op0.getOperand(1);
18651 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18652 return getSETCC(X86CC, KORTEST, dl, DAG);
18655 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18657 MVT VT = Op.getSimpleValueType();
18659 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18661 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18662 SDValue Op0 = Op.getOperand(0);
18663 SDValue Op1 = Op.getOperand(1);
18665 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18667 // Optimize to BT if possible.
18668 // Lower (X & (1 << N)) == 0 to BT(X, N).
18669 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18670 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
18671 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18672 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18673 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18677 // Try to use PTEST for a tree of ORs equality-compared with 0.
18678 // TODO: We could do AND tree with all 1s as well by using the C flag.
18679 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18680 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18681 if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
18685 // Try to lower using KTEST.
18686 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18689 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
18691 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18692 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18694 // If the input is a setcc, then reuse the input setcc or use a new one with
18695 // the inverted condition.
18696 if (Op0.getOpcode() == X86ISD::SETCC) {
18697 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
18698 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18702 CCode = X86::GetOppositeBranchCondition(CCode);
18703 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
18707 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18708 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18709 if (X86CC == X86::COND_INVALID)
18712 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18713 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18714 return getSETCC(X86CC, EFLAGS, dl, DAG);
18717 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18718 SDValue LHS = Op.getOperand(0);
18719 SDValue RHS = Op.getOperand(1);
18720 SDValue Carry = Op.getOperand(2);
18721 SDValue Cond = Op.getOperand(3);
18724 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18725 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18727 // Recreate the carry if needed.
18728 EVT CarryVT = Carry.getValueType();
18729 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18730 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18731 Carry, DAG.getConstant(NegOne, DL, CarryVT));
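// Adding all-ones to the carry value sets EFLAGS.CF exactly when the incoming
// carry is nonzero, recreating the borrow consumed by the SBB below.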
18733 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18734 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18735 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
18738 /// Return true if opcode is an X86 logical comparison.
18739 static bool isX86LogicalCmp(SDValue Op) {
18740 unsigned Opc = Op.getOpcode();
18741 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18742 Opc == X86ISD::SAHF)
18744 if (Op.getResNo() == 1 &&
18745 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18746 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18747 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18748 Opc == X86ISD::XOR || Opc == X86ISD::AND))
18751 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
18757 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18758 if (V.getOpcode() != ISD::TRUNCATE)
18761 SDValue VOp0 = V.getOperand(0);
18762 unsigned InBits = VOp0.getValueSizeInBits();
18763 unsigned Bits = V.getValueSizeInBits();
18764 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18767 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18768 bool AddTest = true;
18769 SDValue Cond = Op.getOperand(0);
18770 SDValue Op1 = Op.getOperand(1);
18771 SDValue Op2 = Op.getOperand(2);
18773 MVT VT = Op1.getSimpleValueType();
18776 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18777 // are available, or into a VBLENDV if AVX is available.
18778 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18779 if (Cond.getOpcode() == ISD::SETCC &&
18780 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18781 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18782 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18783 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18784 unsigned SSECC = translateX86FSETCC(
18785 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18787 if (Subtarget.hasAVX512()) {
18788 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18789 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18790 assert(!VT.isVector() && "Not a scalar type?");
18791 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18794 if (SSECC < 8 || Subtarget.hasAVX()) {
18795 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18796 DAG.getConstant(SSECC, DL, MVT::i8));
18798 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18799 // of 3 logic instructions for size savings and potentially speed.
18800 // Unfortunately, there is no scalar form of VBLENDV.
18802 // If either operand is a constant, don't try this. We can expect to
18803 // optimize away at least one of the logic instructions later in that
18804 // case, so that sequence would be faster than a variable blend.
18806 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18807 // uses XMM0 as the selection register. That may need just as many
18808 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
18811 if (Subtarget.hasAVX() &&
18812 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18814 // Convert to vectors, do a VSELECT, and convert back to scalar.
18815 // All of the conversions should be optimized away.
18817 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18818 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18819 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18820 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18822 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18823 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18825 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18827 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18828 VSel, DAG.getIntPtrConstant(0, DL));
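// Without AVX (or when an operand is constant), fall back to building the
// select as (Cmp & Op1) | (~Cmp & Op2) with FAND/FANDN/FOR.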
18830 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18831 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18832 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18836 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18837 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18838 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18839 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18842 // For v64i1 without 64-bit support we need to split and rejoin.
18843 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18844 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18845 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18846 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18847 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18848 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18849 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18850 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18851 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18854 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18856 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18857 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18858 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18859 Op1Scalar = Op1.getOperand(0);
18861 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18862 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18863 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18864 Op2Scalar = Op2.getOperand(0);
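// If both arms are available as scalar integers, select on the scalars and
// convert the result back to the i1 vector type.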
18865 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18866 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18867 Op1Scalar, Op2Scalar);
18868 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18869 return DAG.getBitcast(VT, newSelect);
18870 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18871 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18872 DAG.getIntPtrConstant(0, DL));
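// v2i1/v4i1 selects are not legal; widen both operands to v8i1, select, and
// extract the original narrow vector back out.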
18876 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18877 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18878 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18879 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18880 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18881 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18882 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18883 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18886 if (Cond.getOpcode() == ISD::SETCC) {
18887 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18889 // If the condition was updated, it's possible that the operands of the
18890 // select were also updated (for example, EmitTest has a RAUW). Refresh
18891 // the local references to the select operands in case they got stale.
18892 Op1 = Op.getOperand(1);
18893 Op2 = Op.getOperand(2);
18897 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18898 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18899 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18900 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18901 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18902 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18903 if (Cond.getOpcode() == X86ISD::SETCC &&
18904 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18905 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18906 SDValue Cmp = Cond.getOperand(1);
18907 unsigned CondCode =
18908 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18910 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18911 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18912 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18913 SDValue CmpOp0 = Cmp.getOperand(0);
18915 // Apply further optimizations for special cases
18916 // (select (x != 0), -1, 0) -> neg & sbb
18917 // (select (x == 0), 0, -1) -> neg & sbb
18918 if (isNullConstant(Y) &&
18919 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18920 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18921 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18922 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18923 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18924 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18925 SDValue(Neg.getNode(), 1));
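// Otherwise compare x against 1: the carry flag is set exactly when x == 0,
// and SETCC_CARRY (SBB) broadcasts that flag to 0 or -1.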
18929 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18930 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18931 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18933 SDValue Res = // Res = 0 or -1.
18934 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18935 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18937 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18938 Res = DAG.getNOT(DL, Res, Res.getValueType());
18940 if (!isNullConstant(Op2))
18941 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18943 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18944 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18945 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18946 SDValue CmpOp0 = Cmp.getOperand(0);
18947 SDValue Src1, Src2;
18948 // True if Op2 is an XOR or OR operator and one of its operands is equal to Op1:
18950 // ( a , a op b) || ( b , a op b)
18951 auto isOrXorPattern = [&]() {
18952 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18953 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18955 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18962 if (isOrXorPattern()) {
18964 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18965 // we need a mask of all zeros or ones with the same size as the other operand.
18967 if (CmpSz > VT.getSizeInBits())
18968 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18969 else if (CmpSz < VT.getSizeInBits())
18970 Neg = DAG.getNode(ISD::AND, DL, VT,
18971 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18972 DAG.getConstant(1, DL, VT));
18975 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18976 Neg); // -(and (x, 0x1))
18977 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18978 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18983 // Look past (and (setcc_carry (cmp ...)), 1).
18984 if (Cond.getOpcode() == ISD::AND &&
18985 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18986 isOneConstant(Cond.getOperand(1)))
18987 Cond = Cond.getOperand(0);
18989 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18990 // setting operand in place of the X86ISD::SETCC.
18991 unsigned CondOpcode = Cond.getOpcode();
18992 if (CondOpcode == X86ISD::SETCC ||
18993 CondOpcode == X86ISD::SETCC_CARRY) {
18994 CC = Cond.getOperand(0);
18996 SDValue Cmp = Cond.getOperand(1);
18997 unsigned Opc = Cmp.getOpcode();
18998 MVT VT = Op.getSimpleValueType();
19000 bool IllegalFPCMov = false;
19001 if (VT.isFloatingPoint() && !VT.isVector() &&
19002 !isScalarFPTypeInSSEReg(VT)) // FPStack?
19003 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
19005 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
19006 Opc == X86ISD::BT) { // FIXME
19010 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19011 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19012 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19013 Cond.getOperand(0).getValueType() != MVT::i8)) {
19014 SDValue LHS = Cond.getOperand(0);
19015 SDValue RHS = Cond.getOperand(1);
19016 unsigned X86Opcode;
19019 switch (CondOpcode) {
19020 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19021 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19022 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19023 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19024 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19025 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19026 default: llvm_unreachable("unexpected overflowing operator");
19028 if (CondOpcode == ISD::UMULO)
19029 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19032 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19034 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
19036 if (CondOpcode == ISD::UMULO)
19037 Cond = X86Op.getValue(2);
19039 Cond = X86Op.getValue(1);
19041 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
19046 // Look past the truncate if the high bits are known zero.
19047 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19048 Cond = Cond.getOperand(0);
19050 // We know the result of AND is compared against zero. Try to match it to BT.
19052 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19053 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
19054 CC = NewSetCC.getOperand(0);
19055 Cond = NewSetCC.getOperand(1);
19062 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
19063 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
19066 // a < b ? -1 : 0 -> RES = ~setcc_carry
19067 // a < b ? 0 : -1 -> RES = setcc_carry
19068 // a >= b ? -1 : 0 -> RES = setcc_carry
19069 // a >= b ? 0 : -1 -> RES = ~setcc_carry
19070 if (Cond.getOpcode() == X86ISD::SUB) {
19071 Cond = ConvertCmpIfNecessary(Cond, DAG);
19072 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
19074 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
19075 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
19076 (isNullConstant(Op1) || isNullConstant(Op2))) {
19077 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
19078 DAG.getConstant(X86::COND_B, DL, MVT::i8),
19080 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
19081 return DAG.getNOT(DL, Res, Res.getValueType());
19087 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
19087 // widen the cmov and push the truncate through. This avoids introducing a new
19088 // branch during isel and doesn't add any extensions.
19089 if (Op.getValueType() == MVT::i8 &&
19090 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
19091 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
19092 if (T1.getValueType() == T2.getValueType() &&
19093 // Blacklist CopyFromReg to avoid partial register stalls.
19094 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
19095 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
19097 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19101 // Promote i16 cmovs if it won't prevent folding a load.
19102 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
19103 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
19104 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
19105 SDValue Ops[] = { Op2, Op1, CC, Cond };
19106 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
19107 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
19110 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
19111 // the condition is true.
19112 SDValue Ops[] = { Op2, Op1, CC, Cond };
19113 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
19116 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
19117 const X86Subtarget &Subtarget,
19118 SelectionDAG &DAG) {
19119 MVT VT = Op->getSimpleValueType(0);
19120 SDValue In = Op->getOperand(0);
19121 MVT InVT = In.getSimpleValueType();
19122 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
19123 MVT VTElt = VT.getVectorElementType();
19126 unsigned NumElts = VT.getVectorNumElements();
19128 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
19130 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
19131 // If v16i32 is to be avoided, we'll need to split and concatenate.
19132 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
19133 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
19135 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
19138 // Widen to 512-bits if VLX is not supported.
19139 MVT WideVT = ExtVT;
19140 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
19141 NumElts *= 512 / ExtVT.getSizeInBits();
19142 InVT = MVT::getVectorVT(MVT::i1, NumElts);
19143 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
19144 In, DAG.getIntPtrConstant(0, dl));
19145 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
19149 MVT WideEltVT = WideVT.getVectorElementType();
19150 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
19151 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
19152 V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
19154 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
19155 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
19156 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
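// Without a suitable sign-extend instruction, a vselect between all-ones and
// zero yields the same result as sign-extending the i1 mask.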
19159 // Truncate if we had to extend i16/i8 above.
19161 WideVT = MVT::getVectorVT(VTElt, NumElts);
19162 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
19165 // Extract back to 128/256-bit if we widened.
19167 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
19168 DAG.getIntPtrConstant(0, dl));
19173 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19174 SelectionDAG &DAG) {
19175 SDValue In = Op->getOperand(0);
19176 MVT InVT = In.getSimpleValueType();
19178 if (InVT.getVectorElementType() == MVT::i1)
19179 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19181 assert(Subtarget.hasAVX() && "Expected AVX support");
19182 return LowerAVXExtend(Op, DAG, Subtarget);
19185 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
19186 // For sign extend this needs to handle all vector sizes and SSE4.1 and
19187 // non-SSE4.1 targets. For zero extend this should only handle inputs of
19188 // MVT::v64i8 when BWI is not supported, but AVX512 is.
19189 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
19190 const X86Subtarget &Subtarget,
19191 SelectionDAG &DAG) {
19192 SDValue In = Op->getOperand(0);
19193 MVT VT = Op->getSimpleValueType(0);
19194 MVT InVT = In.getSimpleValueType();
19195 assert(VT.getSizeInBits() == InVT.getSizeInBits());
19197 MVT SVT = VT.getVectorElementType();
19198 MVT InSVT = InVT.getVectorElementType();
19199 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
19201 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
19203 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
19205 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
19206 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
19207 !(VT.is512BitVector() && Subtarget.hasAVX512()))
19212 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19213 // For 512-bit vectors, we need 128-bits or 256-bits.
19214 if (VT.getSizeInBits() > 128) {
19215 // Input needs to be at least the same number of elements as output, and
19216 // at least 128-bits.
19217 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
19218 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
19221 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19222 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
19224 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
19225 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
19226 // need to be handled here for 256/512-bit results.
19227 if (Subtarget.hasInt256()) {
19228 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19229 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19230 X86ISD::VSEXT : X86ISD::VZEXT;
19231 return DAG.getNode(ExtOpc, dl, VT, In);
19234 // We should only get here for sign extend.
19235 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19236 "Unexpected opcode!");
19238 // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
19242 // As SRAI is only available on i16/i32 types, we expand only up to i32
19243 // and handle i64 separately.
19244 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19245 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19246 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19247 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
19248 Curr = DAG.getBitcast(CurrVT, Curr);
19251 SDValue SignExt = Curr;
19252 if (CurrVT != InVT) {
19253 unsigned SignExtShift =
19254 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19255 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19256 DAG.getConstant(SignExtShift, dl, MVT::i8));
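// The arithmetic shift right by the element-size difference replicates the
// original sign bit across the widened element.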
19262 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19263 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19264 DAG.getConstant(31, dl, MVT::i8));
19265 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
19266 return DAG.getBitcast(VT, Ext);
19272 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19273 SelectionDAG &DAG) {
19274 MVT VT = Op->getSimpleValueType(0);
19275 SDValue In = Op->getOperand(0);
19276 MVT InVT = In.getSimpleValueType();
19279 if (InVT.getVectorElementType() == MVT::i1)
19280 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19282 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19283 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19284 "Expected same number of elements");
19285 assert((VT.getVectorElementType() == MVT::i16 ||
19286 VT.getVectorElementType() == MVT::i32 ||
19287 VT.getVectorElementType() == MVT::i64) &&
19288 "Unexpected element type");
19289 assert((InVT.getVectorElementType() == MVT::i8 ||
19290 InVT.getVectorElementType() == MVT::i16 ||
19291 InVT.getVectorElementType() == MVT::i32) &&
19292 "Unexpected element type");
19294 if (Subtarget.hasInt256())
19295 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19297 // Optimize vectors in AVX mode:
19298 // sign extend v8i16 to v8i32 and v4i32 to v4i64.
19301 // Divide the input vector into two parts
19302 // (for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }),
19303 // use the vpmovsx instruction to extend each half (v4i32 -> v2i64, v8i16 -> v4i32),
19304 // and concatenate the results back to the original VT.
19306 unsigned NumElems = InVT.getVectorNumElements();
19307 SDValue Undef = DAG.getUNDEF(InVT);
19309 SmallVector<int,8> ShufMask1(NumElems, -1);
19310 for (unsigned i = 0; i != NumElems/2; ++i)
19313 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19315 SmallVector<int,8> ShufMask2(NumElems, -1);
19316 for (unsigned i = 0; i != NumElems/2; ++i)
19317 ShufMask2[i] = i + NumElems/2;
19319 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19321 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19322 VT.getVectorNumElements() / 2);
19324 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19325 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
19327 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19330 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19331 SelectionDAG &DAG) {
19332 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
19334 SDValue StoredVal = St->getValue();
19336 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
19337 assert(StoredVal.getValueType().isVector() &&
19338 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
19339 StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT");
19341 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19342 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19343 "Expected AVX512F without AVX512DQI");
19345 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19346 DAG.getUNDEF(MVT::v8i1), StoredVal,
19347 DAG.getIntPtrConstant(0, dl));
19348 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19350 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19351 St->getPointerInfo(), St->getAlignment(),
19352 St->getMemOperand()->getFlags());
19355 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19356 // may emit an illegal shuffle but the expansion is still better than scalar
19357 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
19358 // we'll emit a shuffle and an arithmetic shift.
19359 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19360 // TODO: It is possible to support ZExt by zeroing the undef values during
19361 // the shuffle phase or after the shuffle.
19362 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19363 SelectionDAG &DAG) {
19364 MVT RegVT = Op.getSimpleValueType();
19365 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19366 assert(RegVT.isInteger() &&
19367 "We only custom lower integer vector sext loads.");
19369 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
19371 EVT MemVT = Ld->getMemoryVT();
19373 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19374 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19375 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19376 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19377 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19378 "Expected AVX512F without AVX512DQI");
19380 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19381 Ld->getPointerInfo(), Ld->getAlignment(),
19382 Ld->getMemOperand()->getFlags());
19384 // Replace chain users with the new chain.
19385 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19386 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19388 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19389 DAG.getBitcast(MVT::v8i1, NewLd),
19390 DAG.getIntPtrConstant(0, dl));
19391 return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
19394 // Nothing useful we can do without SSE2 shuffles.
19395 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19397 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19398 unsigned RegSz = RegVT.getSizeInBits();
19400 ISD::LoadExtType Ext = Ld->getExtensionType();
19402 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19403 && "Only anyext and sext are currently implemented.");
19404 assert(MemVT != RegVT && "Cannot extend to the same type");
19405 assert(MemVT.isVector() && "Must load a vector from memory");
19407 unsigned NumElems = RegVT.getVectorNumElements();
19408 unsigned MemSz = MemVT.getSizeInBits();
19409 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19411 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19412 // The only way in which we have a legal 256-bit vector result but not the
19413 // integer 256-bit operations needed to directly lower a sextload is if we
19414 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19415 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19416 // correctly legalized. We do this late to allow the canonical form of
19417 // sextload to persist throughout the rest of the DAG combiner -- it wants
19418 // to fold together any extensions it can, and so will fuse a sign_extend
19419 // of an sextload into a sextload targeting a wider value.
19421 if (MemSz == 128) {
19422 // Just switch this to a normal load.
19423 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19424 "it must be a legal 128-bit vector "
19426 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19427 Ld->getPointerInfo(), Ld->getAlignment(),
19428 Ld->getMemOperand()->getFlags());
19430 assert(MemSz < 128 &&
19431 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19432 // Do an sext load to a 128-bit vector type. We want to use the same
19433 // number of elements, but elements half as wide. This will end up being
19434 // recursively lowered by this routine, but will succeed as we definitely
19435 // have all the necessary features if we're using AVX1.
19437 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19438 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
19440 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19441 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
19442 Ld->getMemOperand()->getFlags());
19445 // Replace chain users with the new chain.
19446 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19447 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19449 // Finally, do a normal sign-extend to the desired register.
19450 return DAG.getSExtOrTrunc(Load, dl, RegVT);
19453 // All sizes must be a power of two.
19454 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19455 "Non-power-of-two elements are not custom lowered!");
19457 // Attempt to load the original value using scalar loads.
19458 // Find the largest scalar type that divides the total loaded size.
19459 MVT SclrLoadTy = MVT::i8;
19460 for (MVT Tp : MVT::integer_valuetypes()) {
19461 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19466 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
19467 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19469 SclrLoadTy = MVT::f64;
19471 // Calculate the number of scalar loads that we need to perform
19472 // in order to load our vector from memory.
19473 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19475 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19476 "Can only lower sext loads with a single scalar load!");
19478 unsigned loadRegZize = RegSz;
19479 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19482 // If we don't have BWI we won't be able to create the shuffle needed for v8i64->v8i8.
19484 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19485 MemVT == MVT::v8i8)
19488 // Represent our vector as a sequence of elements which are the
19489 // largest scalar that we can load.
19490 EVT LoadUnitVecVT = EVT::getVectorVT(
19491 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19493 // Represent the data using the same element type that is stored in
19494 // memory. In practice, we "widen" MemVT.
19496 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19497 loadRegZize / MemVT.getScalarSizeInBits());
19499 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19500 "Invalid vector type");
19502 // We can't shuffle using an illegal type.
19503 assert(TLI.isTypeLegal(WideVecVT) &&
19504 "We only lower types that form legal widened vector types");
19506 SmallVector<SDValue, 8> Chains;
19507 SDValue Ptr = Ld->getBasePtr();
19508 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19509 TLI.getPointerTy(DAG.getDataLayout()));
19510 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19512 for (unsigned i = 0; i < NumLoads; ++i) {
19513 // Perform a single load.
19514 SDValue ScalarLoad =
19515 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19516 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19517 Chains.push_back(ScalarLoad.getValue(1));
19518 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19519 // another round of DAGCombining.
19521 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19523 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19524 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19526 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19529 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19531 // Bitcast the loaded value to a vector of the original element type, in
19532 // the size of the target vector type.
19533 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19534 unsigned SizeRatio = RegSz / MemSz;
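// SizeRatio is how many times wider the register elements are than the memory
// elements; it spaces the loaded elements apart in the shuffle mask built below.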
19536 if (Ext == ISD::SEXTLOAD) {
19537 // If we have SSE4.1, we can directly emit a VSEXT node.
19538 if (Subtarget.hasSSE41()) {
19539 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19540 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19544 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
19546 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19547 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19549 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19550 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19554 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19555 MemVT == MVT::v8i8) {
19556 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19557 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19561 // Redistribute the loaded elements into the different locations.
19562 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19563 for (unsigned i = 0; i != NumElems; ++i)
19564 ShuffleVec[i * SizeRatio] = i;
19566 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19567 DAG.getUNDEF(WideVecVT), ShuffleVec);
19569 // Bitcast to the requested type.
19570 Shuff = DAG.getBitcast(RegVT, Shuff);
19571 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19575 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
19576 /// each of which has no other use apart from the AND / OR.
19577 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19578 Opc = Op.getOpcode();
19579 if (Opc != ISD::OR && Opc != ISD::AND)
19581 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19582 Op.getOperand(0).hasOneUse() &&
19583 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19584 Op.getOperand(1).hasOneUse());
19587 /// Return true if node is an ISD::XOR of an X86ISD::SETCC and 1 and that the
19588 /// SETCC node has a single use.
19589 static bool isXor1OfSetCC(SDValue Op) {
19590 if (Op.getOpcode() != ISD::XOR)
19592 if (isOneConstant(Op.getOperand(1)))
19593 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19594 Op.getOperand(0).hasOneUse();
19598 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19599 bool addTest = true;
19600 SDValue Chain = Op.getOperand(0);
19601 SDValue Cond = Op.getOperand(1);
19602 SDValue Dest = Op.getOperand(2);
19605 bool Inverted = false;
19607 if (Cond.getOpcode() == ISD::SETCC) {
19608 // Check for setcc([su]{add,sub,mul}o == 0).
19609 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19610 isNullConstant(Cond.getOperand(1)) &&
19611 Cond.getOperand(0).getResNo() == 1 &&
19612 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19613 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19614 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19615 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19616 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19617 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19619 Cond = Cond.getOperand(0);
19621 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19626 // FIXME: LowerXALUO doesn't handle these!!
19627 else if (Cond.getOpcode() == X86ISD::ADD ||
19628 Cond.getOpcode() == X86ISD::SUB ||
19629 Cond.getOpcode() == X86ISD::SMUL ||
19630 Cond.getOpcode() == X86ISD::UMUL)
19631 Cond = LowerXALUO(Cond, DAG);
19634 // Look past (and (setcc_carry (cmp ...)), 1).
19635 if (Cond.getOpcode() == ISD::AND &&
19636 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19637 isOneConstant(Cond.getOperand(1)))
19638 Cond = Cond.getOperand(0);
19640 // If condition flag is set by a X86ISD::CMP, then use it as the condition
19641 // setting operand in place of the X86ISD::SETCC.
19642 unsigned CondOpcode = Cond.getOpcode();
19643 if (CondOpcode == X86ISD::SETCC ||
19644 CondOpcode == X86ISD::SETCC_CARRY) {
19645 CC = Cond.getOperand(0);
19647 SDValue Cmp = Cond.getOperand(1);
19648 unsigned Opc = Cmp.getOpcode();
19649 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19650 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19654 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19658 // These can only come from an arithmetic instruction with overflow,
19659 // e.g. SADDO, UADDO.
19660 Cond = Cond.getOperand(1);
19666 CondOpcode = Cond.getOpcode();
19667 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19668 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19669 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19670 Cond.getOperand(0).getValueType() != MVT::i8)) {
19671 SDValue LHS = Cond.getOperand(0);
19672 SDValue RHS = Cond.getOperand(1);
19673 unsigned X86Opcode;
19676 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19677 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
19679 switch (CondOpcode) {
19680 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19682 if (isOneConstant(RHS)) {
19683 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19686 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19687 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19689 if (isOneConstant(RHS)) {
19690 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19693 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19694 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19695 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19696 default: llvm_unreachable("unexpected overflowing operator");
19699 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19700 if (CondOpcode == ISD::UMULO)
19701 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19704 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19706 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19708 if (CondOpcode == ISD::UMULO)
19709 Cond = X86Op.getValue(2);
19711 Cond = X86Op.getValue(1);
19713 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19717 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19718 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19719 if (CondOpc == ISD::OR) {
19720 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19721 // two branches instead of an explicit OR instruction with a separate test.
19723 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19724 isX86LogicalCmp(Cmp)) {
19725 CC = Cond.getOperand(0).getOperand(0);
19726 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19727 Chain, Dest, CC, Cmp);
19728 CC = Cond.getOperand(1).getOperand(0);
19732 } else { // ISD::AND
19733 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19734 // two branches instead of an explicit AND instruction with a
19735 // separate test. However, we only do this if this block doesn't
19736 // have a fall-through edge, because this requires an explicit
19737 // jmp when the condition is false.
19738 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19739 isX86LogicalCmp(Cmp) &&
19740 Op.getNode()->hasOneUse()) {
19741 X86::CondCode CCode =
19742 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19743 CCode = X86::GetOppositeBranchCondition(CCode);
19744 CC = DAG.getConstant(CCode, dl, MVT::i8);
19745 SDNode *User = *Op.getNode()->use_begin();
19746 // Look for an unconditional branch following this conditional branch.
19747 // We need this because we need to reverse the successors in order
19748 // to implement FCMP_OEQ.
19749 if (User->getOpcode() == ISD::BR) {
19750 SDValue FalseBB = User->getOperand(1);
19752 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19753 assert(NewBR == User);
19757 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19758 Chain, Dest, CC, Cmp);
19759 X86::CondCode CCode =
19760 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19761 CCode = X86::GetOppositeBranchCondition(CCode);
19762 CC = DAG.getConstant(CCode, dl, MVT::i8);
19768 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19769 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
19770 // It should be transformed by the DAG combiner except when the condition
19771 // is set by an arithmetic-with-overflow node.
19772 X86::CondCode CCode =
19773 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19774 CCode = X86::GetOppositeBranchCondition(CCode);
19775 CC = DAG.getConstant(CCode, dl, MVT::i8);
19776 Cond = Cond.getOperand(0).getOperand(1);
19778 } else if (Cond.getOpcode() == ISD::SETCC &&
19779 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19780 // For FCMP_OEQ, we can emit
19781 // two branches instead of an explicit AND instruction with a
19782 // separate test. However, we only do this if this block doesn't
19783 // have a fall-through edge, because this requires an explicit
19784 // jmp when the condition is false.
19785 if (Op.getNode()->hasOneUse()) {
19786 SDNode *User = *Op.getNode()->use_begin();
19787 // Look for an unconditional branch following this conditional branch.
19788 // We need this because we need to reverse the successors in order
19789 // to implement FCMP_OEQ.
19790 if (User->getOpcode() == ISD::BR) {
19791 SDValue FalseBB = User->getOperand(1);
19793 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19794 assert(NewBR == User);
19798 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19799 Cond.getOperand(0), Cond.getOperand(1));
19800 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19801 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19802 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19803 Chain, Dest, CC, Cmp);
19804 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19809 } else if (Cond.getOpcode() == ISD::SETCC &&
19810 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19811 // For FCMP_UNE, we can emit
19812 // two branches instead of an explicit AND instruction with a
19813 // separate test. However, we only do this if this block doesn't
19814 // have a fall-through edge, because this requires an explicit
19815 // jmp when the condition is false.
19816 if (Op.getNode()->hasOneUse()) {
19817 SDNode *User = *Op.getNode()->use_begin();
19818 // Look for an unconditional branch following this conditional branch.
19819 // We need this because we need to reverse the successors in order
19820 // to implement FCMP_UNE.
19821 if (User->getOpcode() == ISD::BR) {
19822 SDValue FalseBB = User->getOperand(1);
19824 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19825 assert(NewBR == User);
19828 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19829 Cond.getOperand(0), Cond.getOperand(1));
19830 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19831 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19832 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19833 Chain, Dest, CC, Cmp);
19834 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19844 // Look past the truncate if the high bits are known zero.
19845 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19846 Cond = Cond.getOperand(0);
19848 // We know the result of AND is compared against zero. Try to match it to BT.
19850 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19851 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19852 CC = NewSetCC.getOperand(0);
19853 Cond = NewSetCC.getOperand(1);
19860 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19861 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19862 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19864 Cond = ConvertCmpIfNecessary(Cond, DAG);
19865 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19866 Chain, Dest, CC, Cond);
19869 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19870 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19871 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19872 // that the guard pages used by the OS virtual memory manager are allocated in
19873 // correct sequence.
19875 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19876 SelectionDAG &DAG) const {
19877 MachineFunction &MF = DAG.getMachineFunction();
19878 bool SplitStack = MF.shouldSplitStack();
19879 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19880 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19881 SplitStack || EmitStackProbe;
19885 SDNode *Node = Op.getNode();
19886 SDValue Chain = Op.getOperand(0);
19887 SDValue Size = Op.getOperand(1);
19888 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19889 EVT VT = Node->getValueType(0);
19891 // Chain the dynamic stack allocation so that it doesn't modify the stack
19892 // pointer when other instructions are using the stack.
19893 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19895 bool Is64Bit = Subtarget.is64Bit();
19896 MVT SPTy = getPointerTy(DAG.getDataLayout());
19900 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19901 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19902 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19903 " not tell us which reg is the stack pointer!");
19905 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19906 Chain = SP.getValue(1);
19907 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19908 unsigned StackAlign = TFI.getStackAlignment();
19909 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19910 if (Align > StackAlign)
19911 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19912 DAG.getConstant(-(uint64_t)Align, dl, VT));
19913 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19914 } else if (SplitStack) {
19915 MachineRegisterInfo &MRI = MF.getRegInfo();
19918 // The 64-bit implementation of segmented stacks needs to clobber both r10
19919 // and r11. This makes it impossible to use it along with nested parameters.
19920 const Function &F = MF.getFunction();
19921 for (const auto &A : F.args()) {
19922 if (A.hasNestAttr())
19923 report_fatal_error("Cannot use segmented stacks with functions that "
19924 "have nested arguments.");
19928 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19929 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19930 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19931 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19932 DAG.getRegister(Vreg, SPTy));
19934 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19935 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19936 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19938 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19939 unsigned SPReg = RegInfo->getStackRegister();
19940 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19941 Chain = SP.getValue(1);
19944 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19945 DAG.getConstant(-(uint64_t)Align, dl, VT));
19946 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19952 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19953 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19955 SDValue Ops[2] = {Result, Chain};
19956 return DAG.getMergeValues(Ops, dl);
19959 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19960 MachineFunction &MF = DAG.getMachineFunction();
19961 auto PtrVT = getPointerTy(MF.getDataLayout());
19962 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19964 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19967 if (!Subtarget.is64Bit() ||
19968 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19969 // vastart just stores the address of the VarArgsFrameIndex slot into the
19970 // memory location argument.
19971 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19972 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19973 MachinePointerInfo(SV));
19977 // The va_list layout is: gp_offset (0 - 6 * 8), fp_offset (48 - 48 + 8 * 16),
19978 // overflow_arg_area (a pointer to parameters passed in memory), and
19979 // reg_save_area (a pointer to the register save area).
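// Rough C-level sketch of one SysV x86-64 va_list element (illustrative;
// the field names follow the ABI and are not defined in this file):
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };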
19981 SmallVector<SDValue, 8> MemOps;
19982 SDValue FIN = Op.getOperand(1);
19984 SDValue Store = DAG.getStore(
19985 Op.getOperand(0), DL,
19986 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19987 MachinePointerInfo(SV));
19988 MemOps.push_back(Store);
19991 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19992 Store = DAG.getStore(
19993 Op.getOperand(0), DL,
19994 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19995 MachinePointerInfo(SV, 4));
19996 MemOps.push_back(Store);
19998 // Store ptr to overflow_arg_area
19999 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
20000 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
20002 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
20003 MemOps.push_back(Store);
20005 // Store ptr to reg_save_area.
20006 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
20007 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
20008 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
20009 Store = DAG.getStore(
20010 Op.getOperand(0), DL, RSFIN, FIN,
20011 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
20012 MemOps.push_back(Store);
20013 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
20016 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
20017 assert(Subtarget.is64Bit() &&
20018 "LowerVAARG only handles 64-bit va_arg!");
20019 assert(Op.getNumOperands() == 4);
20021 MachineFunction &MF = DAG.getMachineFunction();
20022 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
20023 // The Win64 ABI uses char* instead of a structure.
20024 return DAG.expandVAArg(Op.getNode());
20026 SDValue Chain = Op.getOperand(0);
20027 SDValue SrcPtr = Op.getOperand(1);
20028 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
20029 unsigned Align = Op.getConstantOperandVal(3);
20032 EVT ArgVT = Op.getNode()->getValueType(0);
20033 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20034 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
20037 // Decide which area this value should be read from.
20038 // TODO: Implement the AMD64 ABI in its entirety. This simple
20039 // selection mechanism works only for the basic types.
20040 if (ArgVT == MVT::f80) {
20041 llvm_unreachable("va_arg for f80 not yet implemented");
20042 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
20043 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
20044 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
20045 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
20047 llvm_unreachable("Unhandled argument type in LowerVAARG");
20050 if (ArgMode == 2) {
20051 // Sanity Check: Make sure using fp_offset makes sense.
20052 assert(!Subtarget.useSoftFloat() &&
20053 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
20054 Subtarget.hasSSE1());
20057 // Insert VAARG_64 node into the DAG
20058 // VAARG_64 returns two values: Variable Argument Address, Chain
20059 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
20060 DAG.getConstant(ArgMode, dl, MVT::i8),
20061 DAG.getConstant(Align, dl, MVT::i32)};
20062 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
20063 SDValue VAARG = DAG.getMemIntrinsicNode(
20064 X86ISD::VAARG_64, dl,
20065 VTs, InstOps, MVT::i64,
20066 MachinePointerInfo(SV),
20068 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
20069 Chain = VAARG.getValue(1);
20071 // Load the next argument and return it
20072 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
20075 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
20076 SelectionDAG &DAG) {
20077 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
20078 // where a va_list is still an i8*.
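// (Illustrative arithmetic: 4 + 4 + 8 + 8 = 24 bytes, which is the length of
// the memcpy emitted below.)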
20079 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
20080 if (Subtarget.isCallingConvWin64(
20081 DAG.getMachineFunction().getFunction().getCallingConv()))
20082 // Probably a Win64 va_copy.
20083 return DAG.expandVACopy(Op.getNode());
20085 SDValue Chain = Op.getOperand(0);
20086 SDValue DstPtr = Op.getOperand(1);
20087 SDValue SrcPtr = Op.getOperand(2);
20088 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
20089 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20092 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
20093 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
20095 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
20098 /// Handle vector element shifts where the shift amount is a constant.
20099 /// Takes immediate version of shift as input.
20100 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
20101 SDValue SrcOp, uint64_t ShiftAmt,
20102 SelectionDAG &DAG) {
20103 MVT ElementType = VT.getVectorElementType();
20105 // Bitcast the source vector to the output type; this is mainly necessary for
20106 // vXi8/vXi64 shifts.
20107 if (VT != SrcOp.getSimpleValueType())
20108 SrcOp = DAG.getBitcast(VT, SrcOp);
20110 // Fold this packed shift into its first operand if ShiftAmt is 0.
20114 // Check for ShiftAmt >= element width
20115 if (ShiftAmt >= ElementType.getSizeInBits()) {
20116 if (Opc == X86ISD::VSRAI)
20117 ShiftAmt = ElementType.getSizeInBits() - 1;
20119 return DAG.getConstant(0, dl, VT);
20122 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
20123 && "Unknown target vector shift-by-constant node");
20125 // Fold this packed vector shift into a build vector if SrcOp is a
20126 // vector of Constants or UNDEFs.
20127 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
20128 SmallVector<SDValue, 8> Elts;
20129 unsigned NumElts = SrcOp->getNumOperands();
20130 ConstantSDNode *ND;
20133 default: llvm_unreachable("Unknown opcode!");
20134 case X86ISD::VSHLI:
20135 for (unsigned i=0; i!=NumElts; ++i) {
20136 SDValue CurrentOp = SrcOp->getOperand(i);
20137 if (CurrentOp->isUndef()) {
20138 Elts.push_back(CurrentOp);
20141 ND = cast<ConstantSDNode>(CurrentOp);
20142 const APInt &C = ND->getAPIntValue();
20143 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
20146 case X86ISD::VSRLI:
20147 for (unsigned i=0; i!=NumElts; ++i) {
20148 SDValue CurrentOp = SrcOp->getOperand(i);
20149 if (CurrentOp->isUndef()) {
20150 Elts.push_back(CurrentOp);
20153 ND = cast<ConstantSDNode>(CurrentOp);
20154 const APInt &C = ND->getAPIntValue();
20155 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
20158 case X86ISD::VSRAI:
20159 for (unsigned i=0; i!=NumElts; ++i) {
20160 SDValue CurrentOp = SrcOp->getOperand(i);
20161 if (CurrentOp->isUndef()) {
20162 Elts.push_back(CurrentOp);
20165 ND = cast<ConstantSDNode>(CurrentOp);
20166 const APInt &C = ND->getAPIntValue();
20167 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
20172 return DAG.getBuildVector(VT, dl, Elts);
20175 return DAG.getNode(Opc, dl, VT, SrcOp,
20176 DAG.getConstant(ShiftAmt, dl, MVT::i8));
20179 /// Handle vector element shifts where the shift amount may or may not be a
20180 /// constant. Takes immediate version of shift as input.
20181 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
20182 SDValue SrcOp, SDValue ShAmt,
20183 const X86Subtarget &Subtarget,
20184 SelectionDAG &DAG) {
20185 MVT SVT = ShAmt.getSimpleValueType();
20186 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
20188 // Catch shift-by-constant.
20189 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
20190 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
20191 CShAmt->getZExtValue(), DAG);
20193 // Change opcode to non-immediate version
20195 default: llvm_unreachable("Unknown target vector shift node");
20196 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
20197 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
20198 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
20201 // Need to build a vector containing shift amount.
20202 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
20203 // +=================+============+=======================================+
20204 // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
20205 // +=================+============+=======================================+
20206 // | i64             | Yes, No    | Use ShAmt as lowest elt               |
20207 // | i32             | Yes        | zero-extend in-reg                    |
20208 // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
20209 // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)  |
20210 // +=================+============+=======================================+
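// Illustrative example (assumed, not part of the table): lowering a v8i16
// VSHL by an i32 amount on an SSE2-only target builds
// v4i32 (ShAmt, 0, undef, undef), which is then bitcast below to the 128-bit
// shift-amount type expected by the target node.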
20212 if (SVT == MVT::i64)
20213 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20214 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20215 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
20216 ShAmt = ShAmt.getOperand(0);
20217 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20218 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20219 } else if (Subtarget.hasSSE41() &&
20220 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20221 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
20222 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20224 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
20225 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
20226 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
20229 // The return type has to be a 128-bit type with the same element
20230 // type as the input type.
20231 MVT EltVT = VT.getVectorElementType();
20232 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20234 ShAmt = DAG.getBitcast(ShVT, ShAmt);
20235 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
20238 /// Return \p Mask with the necessary casting or extending
20239 /// according to \p MaskVT when lowering masking intrinsics.
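/// For example (illustrative): an i8 mask arriving for a v4i1 \p MaskVT is
/// bitcast to v8i1 and its low 4 lanes are extracted; a mask narrower than
/// \p MaskVT is first any-extended to the matching integer width.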
20240 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
20241 const X86Subtarget &Subtarget, SelectionDAG &DAG,
20244 if (isAllOnesConstant(Mask))
20245 return DAG.getConstant(1, dl, MaskVT);
20246 if (X86::isZeroNode(Mask))
20247 return DAG.getConstant(0, dl, MaskVT);
20249 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20250 // Mask should be extended
20251 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
20252 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
20255 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20256 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20257 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
20258 // In 32-bit mode a bitcast of i64 is illegal; split it into two i32 halves.
20260 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20261 DAG.getConstant(0, dl, MVT::i32));
20262 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
20263 DAG.getConstant(1, dl, MVT::i32));
20265 Lo = DAG.getBitcast(MVT::v32i1, Lo);
20266 Hi = DAG.getBitcast(MVT::v32i1, Hi);
20268 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
20270 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20271 Mask.getSimpleValueType().getSizeInBits());
20272 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
20273 // are extracted with EXTRACT_SUBVECTOR.
20274 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
20275 DAG.getBitcast(BitcastVT, Mask),
20276 DAG.getIntPtrConstant(0, dl));
20280 /// Return (and \p Op, \p Mask) for compare instructions or
20281 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20282 /// necessary casting or extending for \p Mask when lowering masking intrinsics
20283 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20284 SDValue PreservedSrc,
20285 const X86Subtarget &Subtarget,
20286 SelectionDAG &DAG) {
20287 MVT VT = Op.getSimpleValueType();
20288 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20289 unsigned OpcodeSelect = ISD::VSELECT;
20292 if (isAllOnesConstant(Mask))
20295 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20297 switch (Op.getOpcode()) {
20300 case X86ISD::CMPM_RND:
20301 case X86ISD::VPSHUFBITQMB:
20302 case X86ISD::VFPCLASS:
20303 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20304 case ISD::TRUNCATE:
20305 case X86ISD::VTRUNC:
20306 case X86ISD::VTRUNCS:
20307 case X86ISD::VTRUNCUS:
20308 case X86ISD::CVTPS2PH:
20309 // We can't use ISD::VSELECT here because it is not always "Legal"
20310 // for the destination type. For example vpmovqb requires only AVX512,
20311 // while a vselect on byte elements requires BWI.
20312 OpcodeSelect = X86ISD::SELECT;
20315 if (PreservedSrc.isUndef())
20316 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20317 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
20320 /// Creates an SDNode for a predicated scalar operation.
20321 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
20322 /// The mask comes in as MVT::i8 and should be transformed
20323 /// to MVT::v1i1 while lowering masking intrinsics.
20324 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20325 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20326 /// for a scalar instruction.
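/// For example (illustrative): a constant i8 mask with bit 0 set simply
/// returns \p Op, while a non-constant i8 mask is wrapped into a v1i1 via
/// SCALAR_TO_VECTOR and used as the X86ISD::SELECTS condition (or ANDed with
/// \p Op for the compare/fpclass opcodes handled below).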
20327 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20328 SDValue PreservedSrc,
20329 const X86Subtarget &Subtarget,
20330 SelectionDAG &DAG) {
20332 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
20333 if (MaskConst->getZExtValue() & 0x1)
20336 MVT VT = Op.getSimpleValueType();
20339 assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
20340 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
20341 if (Op.getOpcode() == X86ISD::FSETCCM ||
20342 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20343 Op.getOpcode() == X86ISD::VFPCLASSS)
20344 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
20346 if (PreservedSrc.isUndef())
20347 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20348 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
20351 static int getSEHRegistrationNodeSize(const Function *Fn) {
20352 if (!Fn->hasPersonalityFn())
20353 report_fatal_error(
20354 "querying registration node size for function without personality");
20355 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20356 // WinEHStatePass for the full struct definition.
20357 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20358 case EHPersonality::MSVC_X86SEH: return 24;
20359 case EHPersonality::MSVC_CXX: return 16;
20362 report_fatal_error(
20363 "can only recover FP for 32-bit MSVC EH personality functions");
20366 /// When the MSVC runtime transfers control to us, either to an outlined
20367 /// function or when returning to a parent frame after catching an exception, we
20368 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20369 /// Here's the math:
20370 /// RegNodeBase = EntryEBP - RegNodeSize
20371 /// ParentFP = RegNodeBase - ParentFrameOffset
20372 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20373 /// subtracting the offset (negative on x86) takes us back to the parent FP.
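/// Worked example with made-up numbers: for a 32-bit MSVC C++ EH personality
/// RegNodeSize is 16, so EntryEBP = 0x1000 gives RegNodeBase = 0xFF0, and a
/// ParentFrameOffset of -0x20 gives ParentFP = 0xFF0 - (-0x20) = 0x1010.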
20374 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20375 SDValue EntryEBP) {
20376 MachineFunction &MF = DAG.getMachineFunction();
20379 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20380 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20382 // It's possible that the parent function no longer has a personality function
20383 // if the exceptional code was optimized away, in which case we just return
20384 // the incoming EBP.
20385 if (!Fn->hasPersonalityFn())
20388 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20389 // registration, or the .set_setframe offset.
20390 MCSymbol *OffsetSym =
20391 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20392 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20393 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20394 SDValue ParentFrameOffset =
20395 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20397 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20398 // prologue to RBP in the parent function.
20399 const X86Subtarget &Subtarget =
20400 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20401 if (Subtarget.is64Bit())
20402 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
20404 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20405 // RegNodeBase = EntryEBP - RegNodeSize
20406 // ParentFP = RegNodeBase - ParentFrameOffset
20407 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20408 DAG.getConstant(RegNodeSize, dl, PtrVT));
20409 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
20412 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20413 SelectionDAG &DAG) const {
20414 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20415 auto isRoundModeCurDirection = [](SDValue Rnd) {
20416 if (!isa<ConstantSDNode>(Rnd))
20419 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
20420 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
20424 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20425 MVT VT = Op.getSimpleValueType();
20426 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
20428 switch(IntrData->Type) {
20429 case INTR_TYPE_1OP: {
20430 // We specify 2 possible opcodes for intrinsics with rounding modes.
20431 // First, we check if the intrinsic may have non-default rounding mode,
20432 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20433 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20434 if (IntrWithRoundingModeOpcode != 0) {
20435 SDValue Rnd = Op.getOperand(2);
20436 if (!isRoundModeCurDirection(Rnd)) {
20437 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20438 Op.getOperand(1), Rnd);
20441 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
20443 case INTR_TYPE_2OP:
20444 case INTR_TYPE_2OP_IMM8: {
20445 SDValue Src2 = Op.getOperand(2);
20447 if (IntrData->Type == INTR_TYPE_2OP_IMM8)
20448 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20450 // We specify 2 possible opcodes for intrinsics with rounding modes.
20451 // First, we check if the intrinsic may have non-default rounding mode,
20452 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20453 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20454 if (IntrWithRoundingModeOpcode != 0) {
20455 SDValue Rnd = Op.getOperand(3);
20456 if (!isRoundModeCurDirection(Rnd)) {
20457 return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(),
20458 Op.getOperand(1), Src2, Rnd);
20462 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20463 Op.getOperand(1), Src2);
20465 case INTR_TYPE_3OP:
20466 case INTR_TYPE_3OP_IMM8: {
20467 SDValue Src1 = Op.getOperand(1);
20468 SDValue Src2 = Op.getOperand(2);
20469 SDValue Src3 = Op.getOperand(3);
20471 if (IntrData->Type == INTR_TYPE_3OP_IMM8)
20472 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20474 // We specify 2 possible opcodes for intrinsics with rounding modes.
20475 // First, we check if the intrinsic may have non-default rounding mode,
20476 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20477 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20478 if (IntrWithRoundingModeOpcode != 0) {
20479 SDValue Rnd = Op.getOperand(4);
20480 if (!isRoundModeCurDirection(Rnd)) {
20481 return DAG.getNode(IntrWithRoundingModeOpcode,
20482 dl, Op.getValueType(),
20483 Src1, Src2, Src3, Rnd);
20487 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20490 case INTR_TYPE_4OP:
20491 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20492 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20493 case INTR_TYPE_1OP_MASK_RM: {
20494 SDValue Src = Op.getOperand(1);
20495 SDValue PassThru = Op.getOperand(2);
20496 SDValue Mask = Op.getOperand(3);
20497 SDValue RoundingMode;
20498 // We always add rounding mode to the Node.
20499 // If the rounding mode is not specified, we add the
20500 // "current direction" mode.
20501 if (Op.getNumOperands() == 4)
20503 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20505 RoundingMode = Op.getOperand(4);
20506 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20507 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20509 Mask, PassThru, Subtarget, DAG);
20511 case INTR_TYPE_1OP_MASK: {
20512 SDValue Src = Op.getOperand(1);
20513 SDValue PassThru = Op.getOperand(2);
20514 SDValue Mask = Op.getOperand(3);
20515 // We add rounding mode to the Node when
20516 // - RM Opcode is specified and
20517 // - RM is not "current direction".
20518 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20519 if (IntrWithRoundingModeOpcode != 0) {
20520 SDValue Rnd = Op.getOperand(4);
20521 if (!isRoundModeCurDirection(Rnd)) {
20522 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20523 dl, Op.getValueType(),
20525 Mask, PassThru, Subtarget, DAG);
20528 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20529 Mask, PassThru, Subtarget, DAG);
20531 case INTR_TYPE_SCALAR_MASK: {
20532 SDValue Src1 = Op.getOperand(1);
20533 SDValue Src2 = Op.getOperand(2);
20534 SDValue passThru = Op.getOperand(3);
20535 SDValue Mask = Op.getOperand(4);
20536 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20537 // There are 2 kinds of intrinsics in this group:
20538 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20539 // (2) With rounding mode and sae - 7 operands.
20540 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20541 if (Op.getNumOperands() == (5U + HasRounding)) {
20543 SDValue Rnd = Op.getOperand(5);
20544 if (!isRoundModeCurDirection(Rnd))
20545 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20546 dl, VT, Src1, Src2, Rnd),
20547 Mask, passThru, Subtarget, DAG);
20549 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20551 Mask, passThru, Subtarget, DAG);
20554 assert(Op.getNumOperands() == (6U + HasRounding) &&
20555 "Unexpected intrinsic form");
20556 SDValue RoundingMode = Op.getOperand(5);
20558 SDValue Sae = Op.getOperand(6);
20559 if (!isRoundModeCurDirection(Sae))
20560 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20561 dl, VT, Src1, Src2,
20562 RoundingMode, Sae),
20563 Mask, passThru, Subtarget, DAG);
20565 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20566 Src2, RoundingMode),
20567 Mask, passThru, Subtarget, DAG);
20569 case INTR_TYPE_SCALAR_MASK_RM: {
20570 SDValue Src1 = Op.getOperand(1);
20571 SDValue Src2 = Op.getOperand(2);
20572 SDValue Src0 = Op.getOperand(3);
20573 SDValue Mask = Op.getOperand(4);
20574 // There are 2 kinds of intrinsics in this group:
20575 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
20576 // (2) With rounding mode and sae - 7 operands.
20577 if (Op.getNumOperands() == 6) {
20578 SDValue Sae = Op.getOperand(5);
20579 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20581 Mask, Src0, Subtarget, DAG);
20583 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20584 SDValue RoundingMode = Op.getOperand(5);
20585 SDValue Sae = Op.getOperand(6);
20586 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20587 RoundingMode, Sae),
20588 Mask, Src0, Subtarget, DAG);
20590 case INTR_TYPE_2OP_MASK: {
20591 SDValue Src1 = Op.getOperand(1);
20592 SDValue Src2 = Op.getOperand(2);
20593 SDValue PassThru = Op.getOperand(3);
20594 SDValue Mask = Op.getOperand(4);
20596 // We specify 2 possible opcodes for intrinsics with rounding modes.
20597 // First, we check if the intrinsic may have non-default rounding mode,
20598 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20599 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20600 if (IntrWithRoundingModeOpcode != 0) {
20601 SDValue Rnd = Op.getOperand(5);
20602 if (!isRoundModeCurDirection(Rnd)) {
20603 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20604 dl, Op.getValueType(),
20606 Mask, PassThru, Subtarget, DAG);
20609 // TODO: Intrinsics should have fast-math-flags to propagate.
20610 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20611 Mask, PassThru, Subtarget, DAG);
20613 case INTR_TYPE_2OP_MASK_RM: {
20614 SDValue Src1 = Op.getOperand(1);
20615 SDValue Src2 = Op.getOperand(2);
20616 SDValue PassThru = Op.getOperand(3);
20617 SDValue Mask = Op.getOperand(4);
20618 // We specify 2 possible modes for intrinsics, with/without rounding
20619 // mode.
20620 // First, we check if the intrinsic has a rounding mode (6 operands);
20621 // if not, we set the rounding mode to "current".
20623 if (Op.getNumOperands() == 6)
20624 Rnd = Op.getOperand(5);
20626 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20627 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20629 Mask, PassThru, Subtarget, DAG);
20631 case INTR_TYPE_3OP_SCALAR_MASK: {
20632 SDValue Src1 = Op.getOperand(1);
20633 SDValue Src2 = Op.getOperand(2);
20634 SDValue Src3 = Op.getOperand(3);
20635 SDValue PassThru = Op.getOperand(4);
20636 SDValue Mask = Op.getOperand(5);
20638 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20639 if (IntrWithRoundingModeOpcode != 0) {
20640 SDValue Rnd = Op.getOperand(6);
20641 if (!isRoundModeCurDirection(Rnd))
20642 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20643 dl, VT, Src1, Src2, Src3, Rnd),
20644 Mask, PassThru, Subtarget, DAG);
20646 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20648 Mask, PassThru, Subtarget, DAG);
20650 case INTR_TYPE_3OP_MASK: {
20651 SDValue Src1 = Op.getOperand(1);
20652 SDValue Src2 = Op.getOperand(2);
20653 SDValue Src3 = Op.getOperand(3);
20654 SDValue PassThru = Op.getOperand(4);
20655 SDValue Mask = Op.getOperand(5);
20657 // We specify 2 possible opcodes for intrinsics with rounding modes.
20658 // First, we check if the intrinsic may have non-default rounding mode,
20659 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20660 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20661 if (IntrWithRoundingModeOpcode != 0) {
20662 SDValue Rnd = Op.getOperand(6);
20663 if (!isRoundModeCurDirection(Rnd)) {
20664 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20665 dl, Op.getValueType(),
20666 Src1, Src2, Src3, Rnd),
20667 Mask, PassThru, Subtarget, DAG);
20670 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20672 Mask, PassThru, Subtarget, DAG);
20675 SDValue Src1 = Op.getOperand(1);
20676 SDValue Src2 = Op.getOperand(2);
20678 // Swap Src1 and Src2 in the node creation
20679 return DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1);
20682 case FMA_OP_MASK: {
20683 SDValue Src1 = Op.getOperand(1);
20684 SDValue Src2 = Op.getOperand(2);
20685 SDValue Src3 = Op.getOperand(3);
20686 SDValue Mask = Op.getOperand(4);
20687 MVT VT = Op.getSimpleValueType();
20688 SDValue PassThru = SDValue();
20690 // set PassThru element
20691 if (IntrData->Type == FMA_OP_MASKZ)
20692 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20696 // We specify 2 possible opcodes for intrinsics with rounding modes.
20697 // First, we check if the intrinsic may have non-default rounding mode,
20698 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20699 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20700 if (IntrWithRoundingModeOpcode != 0) {
20701 SDValue Rnd = Op.getOperand(5);
20702 if (!isRoundModeCurDirection(Rnd))
20703 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20704 dl, Op.getValueType(),
20705 Src1, Src2, Src3, Rnd),
20706 Mask, PassThru, Subtarget, DAG);
20708 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20709 dl, Op.getValueType(),
20711 Mask, PassThru, Subtarget, DAG);
20713 case FMA_OP_SCALAR_MASK:
20714 case FMA_OP_SCALAR_MASK3:
20715 case FMA_OP_SCALAR_MASKZ: {
20716 SDValue Src1 = Op.getOperand(1);
20717 SDValue Src2 = Op.getOperand(2);
20718 SDValue Src3 = Op.getOperand(3);
20719 SDValue Mask = Op.getOperand(4);
20720 MVT VT = Op.getSimpleValueType();
20721 SDValue PassThru = SDValue();
20723 // set PassThru element
20724 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20725 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20726 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20731 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20732 if (IntrWithRoundingModeOpcode != 0) {
20733 SDValue Rnd = Op.getOperand(5);
20734 if (!isRoundModeCurDirection(Rnd))
20735 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20736 Op.getValueType(), Src1, Src2,
20738 Mask, PassThru, Subtarget, DAG);
20741 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20742 Op.getValueType(), Src1, Src2,
20744 Mask, PassThru, Subtarget, DAG);
20747 // NOTE: We need to swizzle the operands to pass the multiply operands
20748 // first.
20749 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20750 Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
20752 // ISD::FP_ROUND has a second argument that indicates if the truncation
20753 // does not change the value. Set it to 0 since it can change.
20754 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20755 DAG.getIntPtrConstant(0, dl));
20756 case CVTPD2PS_MASK: {
20757 SDValue Src = Op.getOperand(1);
20758 SDValue PassThru = Op.getOperand(2);
20759 SDValue Mask = Op.getOperand(3);
20760 // We add rounding mode to the Node when
20761 // - RM Opcode is specified and
20762 // - RM is not "current direction".
20763 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20764 if (IntrWithRoundingModeOpcode != 0) {
20765 SDValue Rnd = Op.getOperand(4);
20766 if (!isRoundModeCurDirection(Rnd)) {
20767 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20768 dl, Op.getValueType(),
20770 Mask, PassThru, Subtarget, DAG);
20773 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20774 // ISD::FP_ROUND has a second argument that indicates if the truncation
20775 // does not change the value. Set it to 0 since it can change.
20776 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20777 DAG.getIntPtrConstant(0, dl)),
20778 Mask, PassThru, Subtarget, DAG);
20781 // FPclass intrinsics
20782 SDValue Src1 = Op.getOperand(1);
20783 MVT MaskVT = Op.getSimpleValueType();
20784 SDValue Imm = Op.getOperand(2);
20785 return DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20788 SDValue Src1 = Op.getOperand(1);
20789 SDValue Imm = Op.getOperand(2);
20790 SDValue Mask = Op.getOperand(3);
20791 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20792 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20794 // Need to fill with zeros to ensure the bitcast will produce zeroes
20795 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20796 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20797 DAG.getConstant(0, dl, MVT::v8i1),
20798 FPclassMask, DAG.getIntPtrConstant(0, dl));
20799 return DAG.getBitcast(MVT::i8, Ins);
20802 // Comparison intrinsics with masks.
20803 // Example of transformation:
20804 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20805 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20806 // (i8 (bitcast
20807 // (v8i1 (insert_subvector zero,
20808 // (v2i1 (and (PCMPEQM %a, %b),
20809 // (extract_subvector
20810 // (v8i1 (bitcast %mask)), 0))), 0))))
20811 MVT VT = Op.getOperand(1).getSimpleValueType();
20812 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20813 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20814 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20815 Mask.getSimpleValueType().getSizeInBits());
20816 SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20818 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20820 // Need to fill with zeros to ensure the bitcast will produce zeroes
20821 // for the upper bits in the v2i1/v4i1 case.
20822 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20823 DAG.getConstant(0, dl, BitcastVT),
20824 CmpMask, DAG.getIntPtrConstant(0, dl));
20825 return DAG.getBitcast(Op.getValueType(), Res);
20828 case CMP_MASK_CC: {
20829 MVT MaskVT = Op.getSimpleValueType();
20831 SDValue CC = Op.getOperand(3);
20832 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20833 // We specify 2 possible opcodes for intrinsics with rounding modes.
20834 // First, we check if the intrinsic may have non-default rounding mode,
20835 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20836 if (IntrData->Opc1 != 0) {
20837 SDValue Rnd = Op.getOperand(4);
20838 if (!isRoundModeCurDirection(Rnd))
20839 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20840 Op.getOperand(2), CC, Rnd);
20842 // Default rounding mode.
20843 if (!Cmp.getNode())
20844 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20845 Op.getOperand(2), CC);
20849 case CMP_MASK_SCALAR_CC: {
20850 SDValue Src1 = Op.getOperand(1);
20851 SDValue Src2 = Op.getOperand(2);
20852 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20853 SDValue Mask = Op.getOperand(4);
20856 if (IntrData->Opc1 != 0) {
20857 SDValue Rnd = Op.getOperand(5);
20858 if (!isRoundModeCurDirection(Rnd))
20859 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
20861 // Default rounding mode.
20863 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20865 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20867 // Need to fill with zeros to ensure the bitcast will produce zeroes
20868 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20869 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20870 DAG.getConstant(0, dl, MVT::v8i1),
20871 CmpMask, DAG.getIntPtrConstant(0, dl));
20872 return DAG.getBitcast(MVT::i8, Ins);
20874 case COMI: { // Comparison intrinsics
20875 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20876 SDValue LHS = Op.getOperand(1);
20877 SDValue RHS = Op.getOperand(2);
20878 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20879 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
20882 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
20883 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
20884 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
20885 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
20888 case ISD::SETNE: { // (ZF = 1 or PF = 1)
20889 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
20890 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
20891 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
20894 case ISD::SETGT: // (CF = 0 and ZF = 0)
20895 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
20897 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
20898 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
20901 case ISD::SETGE: // CF = 0
20902 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
20904 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
20905 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
20908 llvm_unreachable("Unexpected illegal condition!");
20910 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20912 case COMI_RM: { // Comparison intrinsics with Sae
20913 SDValue LHS = Op.getOperand(1);
20914 SDValue RHS = Op.getOperand(2);
20915 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20916 SDValue Sae = Op.getOperand(4);
20919 if (isRoundModeCurDirection(Sae))
20920 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20921 DAG.getConstant(CondVal, dl, MVT::i8));
20923 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20924 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20925 // Need to fill with zeros to ensure the bitcast will produce zeroes
20926 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20927 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20928 DAG.getConstant(0, dl, MVT::v16i1),
20929 FCmp, DAG.getIntPtrConstant(0, dl));
20930 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
20931 DAG.getBitcast(MVT::i16, Ins));
20934 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20935 Op.getOperand(1), Op.getOperand(2), Subtarget,
20937 case COMPRESS_EXPAND_IN_REG: {
20938 SDValue Mask = Op.getOperand(3);
20939 SDValue DataToCompress = Op.getOperand(1);
20940 SDValue PassThru = Op.getOperand(2);
20941 if (isAllOnesConstant(Mask)) // return data as is
20942 return Op.getOperand(1);
20944 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20946 Mask, PassThru, Subtarget, DAG);
20949 case FIXUPIMMS_MASKZ:
20951 case FIXUPIMM_MASKZ:{
20952 SDValue Src1 = Op.getOperand(1);
20953 SDValue Src2 = Op.getOperand(2);
20954 SDValue Src3 = Op.getOperand(3);
20955 SDValue Imm = Op.getOperand(4);
20956 SDValue Mask = Op.getOperand(5);
20957 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20958 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
20959 // We specify 2 possible modes for intrinsics, with/without rounding
20960 // mode.
20961 // First, we check if the intrinsic has a rounding mode (7 operands);
20962 // if not, we set the rounding mode to "current".
20964 if (Op.getNumOperands() == 7)
20965 Rnd = Op.getOperand(6);
20967 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20968 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20969 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20970 Src1, Src2, Src3, Imm, Rnd),
20971 Mask, Passthru, Subtarget, DAG);
20972 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20973 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20974 Src1, Src2, Src3, Imm, Rnd),
20975 Mask, Passthru, Subtarget, DAG);
20978 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20979 // Clear the upper bits of the rounding immediate so that the legacy
20980 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20981 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20983 DAG.getConstant(0xf, dl, MVT::i32));
20984 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20985 Op.getOperand(1), RoundingMode);
20988 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20989 // Clear the upper bits of the rounding immediate so that the legacy
20990 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20991 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20993 DAG.getConstant(0xf, dl, MVT::i32));
20994 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20995 Op.getOperand(1), Op.getOperand(2), RoundingMode);
21003 default: return SDValue(); // Don't custom lower most intrinsics.
21005 // ptest and testp intrinsics. The intrinsics these come from are designed to
21006 // return an integer value, not just an instruction, so lower them to the ptest
21007 // or testp pattern and a setcc for the result.
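// For example (illustrative): @llvm.x86.sse41.ptestz(%a, %b) becomes a
// PTEST node followed by a COND_E setcc, zero-extended to the i32 result.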
21008 case Intrinsic::x86_sse41_ptestz:
21009 case Intrinsic::x86_sse41_ptestc:
21010 case Intrinsic::x86_sse41_ptestnzc:
21011 case Intrinsic::x86_avx_ptestz_256:
21012 case Intrinsic::x86_avx_ptestc_256:
21013 case Intrinsic::x86_avx_ptestnzc_256:
21014 case Intrinsic::x86_avx_vtestz_ps:
21015 case Intrinsic::x86_avx_vtestc_ps:
21016 case Intrinsic::x86_avx_vtestnzc_ps:
21017 case Intrinsic::x86_avx_vtestz_pd:
21018 case Intrinsic::x86_avx_vtestc_pd:
21019 case Intrinsic::x86_avx_vtestnzc_pd:
21020 case Intrinsic::x86_avx_vtestz_ps_256:
21021 case Intrinsic::x86_avx_vtestc_ps_256:
21022 case Intrinsic::x86_avx_vtestnzc_ps_256:
21023 case Intrinsic::x86_avx_vtestz_pd_256:
21024 case Intrinsic::x86_avx_vtestc_pd_256:
21025 case Intrinsic::x86_avx_vtestnzc_pd_256: {
21026 bool IsTestPacked = false;
21027 X86::CondCode X86CC;
21029 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
21030 case Intrinsic::x86_avx_vtestz_ps:
21031 case Intrinsic::x86_avx_vtestz_pd:
21032 case Intrinsic::x86_avx_vtestz_ps_256:
21033 case Intrinsic::x86_avx_vtestz_pd_256:
21034 IsTestPacked = true;
21036 case Intrinsic::x86_sse41_ptestz:
21037 case Intrinsic::x86_avx_ptestz_256:
21039 X86CC = X86::COND_E;
21041 case Intrinsic::x86_avx_vtestc_ps:
21042 case Intrinsic::x86_avx_vtestc_pd:
21043 case Intrinsic::x86_avx_vtestc_ps_256:
21044 case Intrinsic::x86_avx_vtestc_pd_256:
21045 IsTestPacked = true;
21047 case Intrinsic::x86_sse41_ptestc:
21048 case Intrinsic::x86_avx_ptestc_256:
21050 X86CC = X86::COND_B;
21052 case Intrinsic::x86_avx_vtestnzc_ps:
21053 case Intrinsic::x86_avx_vtestnzc_pd:
21054 case Intrinsic::x86_avx_vtestnzc_ps_256:
21055 case Intrinsic::x86_avx_vtestnzc_pd_256:
21056 IsTestPacked = true;
21058 case Intrinsic::x86_sse41_ptestnzc:
21059 case Intrinsic::x86_avx_ptestnzc_256:
21061 X86CC = X86::COND_A;
21065 SDValue LHS = Op.getOperand(1);
21066 SDValue RHS = Op.getOperand(2);
21067 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
21068 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
21069 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
21070 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21073 case Intrinsic::x86_sse42_pcmpistria128:
21074 case Intrinsic::x86_sse42_pcmpestria128:
21075 case Intrinsic::x86_sse42_pcmpistric128:
21076 case Intrinsic::x86_sse42_pcmpestric128:
21077 case Intrinsic::x86_sse42_pcmpistrio128:
21078 case Intrinsic::x86_sse42_pcmpestrio128:
21079 case Intrinsic::x86_sse42_pcmpistris128:
21080 case Intrinsic::x86_sse42_pcmpestris128:
21081 case Intrinsic::x86_sse42_pcmpistriz128:
21082 case Intrinsic::x86_sse42_pcmpestriz128: {
21084 X86::CondCode X86CC;
21086 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
21087 case Intrinsic::x86_sse42_pcmpistria128:
21088 Opcode = X86ISD::PCMPISTR;
21089 X86CC = X86::COND_A;
21091 case Intrinsic::x86_sse42_pcmpestria128:
21092 Opcode = X86ISD::PCMPESTR;
21093 X86CC = X86::COND_A;
21095 case Intrinsic::x86_sse42_pcmpistric128:
21096 Opcode = X86ISD::PCMPISTR;
21097 X86CC = X86::COND_B;
21099 case Intrinsic::x86_sse42_pcmpestric128:
21100 Opcode = X86ISD::PCMPESTR;
21101 X86CC = X86::COND_B;
21103 case Intrinsic::x86_sse42_pcmpistrio128:
21104 Opcode = X86ISD::PCMPISTR;
21105 X86CC = X86::COND_O;
21107 case Intrinsic::x86_sse42_pcmpestrio128:
21108 Opcode = X86ISD::PCMPESTR;
21109 X86CC = X86::COND_O;
21111 case Intrinsic::x86_sse42_pcmpistris128:
21112 Opcode = X86ISD::PCMPISTR;
21113 X86CC = X86::COND_S;
21115 case Intrinsic::x86_sse42_pcmpestris128:
21116 Opcode = X86ISD::PCMPESTR;
21117 X86CC = X86::COND_S;
21119 case Intrinsic::x86_sse42_pcmpistriz128:
21120 Opcode = X86ISD::PCMPISTR;
21121 X86CC = X86::COND_E;
21123 case Intrinsic::x86_sse42_pcmpestriz128:
21124 Opcode = X86ISD::PCMPESTR;
21125 X86CC = X86::COND_E;
21128 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21129 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21130 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2);
21131 SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG);
21132 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
21135 case Intrinsic::x86_sse42_pcmpistri128:
21136 case Intrinsic::x86_sse42_pcmpestri128: {
21138 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
21139 Opcode = X86ISD::PCMPISTR;
21141 Opcode = X86ISD::PCMPESTR;
21143 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21144 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21145 return DAG.getNode(Opcode, dl, VTs, NewOps);
21148 case Intrinsic::x86_sse42_pcmpistrm128:
21149 case Intrinsic::x86_sse42_pcmpestrm128: {
21151 if (IntNo == Intrinsic::x86_sse42_pcmpistrm128)
21152 Opcode = X86ISD::PCMPISTR;
21154 Opcode = X86ISD::PCMPESTR;
21156 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
21157 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32);
21158 return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1);
21161 case Intrinsic::eh_sjlj_lsda: {
21162 MachineFunction &MF = DAG.getMachineFunction();
21163 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21164 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21165 auto &Context = MF.getMMI().getContext();
21166 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21167 Twine(MF.getFunctionNumber()));
21168 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
21169 DAG.getMCSymbol(S, PtrVT));
21172 case Intrinsic::x86_seh_lsda: {
21173 // Compute the symbol for the LSDA. We know it'll get emitted later.
21174 MachineFunction &MF = DAG.getMachineFunction();
21175 SDValue Op1 = Op.getOperand(1);
21176 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21177 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21178 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21180 // Generate a simple absolute symbol reference. This intrinsic is only
21181 // supported on 32-bit Windows, which isn't PIC.
21182 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
21183 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
21186 case Intrinsic::x86_seh_recoverfp: {
21187 SDValue FnOp = Op.getOperand(1);
21188 SDValue IncomingFPOp = Op.getOperand(2);
21189 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
21190 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
21192 report_fatal_error(
21193 "llvm.x86.seh.recoverfp must take a function as the first argument");
21194 return recoverFramePointer(DAG, Fn, IncomingFPOp);
21197 case Intrinsic::localaddress: {
21198 // Returns one of the stack, base, or frame pointer registers, depending on
21199 // which is used to reference local variables.
21200 MachineFunction &MF = DAG.getMachineFunction();
21201 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21203 if (RegInfo->hasBasePointer(MF))
21204 Reg = RegInfo->getBaseRegister();
21205 else // This function handles the SP or FP case.
21206 Reg = RegInfo->getPtrSizedFrameRegister(MF);
21207 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
21212 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21213 SDValue Src, SDValue Mask, SDValue Base,
21214 SDValue Index, SDValue ScaleOp, SDValue Chain,
21215 const X86Subtarget &Subtarget) {
21217 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21218 // Scale must be constant.
21221 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21222 EVT MaskVT = Mask.getValueType();
21223 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21224 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21225 SDValue Segment = DAG.getRegister(0, MVT::i32);
21226 // If source is undef or we know it won't be used, use a zero vector
21227 // to break register dependency.
21228 // TODO: use undef instead and let BreakFalseDeps deal with it?
21229 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21230 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21231 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21232 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21233 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21234 return DAG.getMergeValues(RetOps, dl);
21237 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21238 SDValue Src, SDValue Mask, SDValue Base,
21239 SDValue Index, SDValue ScaleOp, SDValue Chain,
21240 const X86Subtarget &Subtarget) {
21242 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21243 // Scale must be constant.
21246 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21247 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21248 Index.getSimpleValueType().getVectorNumElements());
21250 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21251 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21252 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21253 SDValue Segment = DAG.getRegister(0, MVT::i32);
21254 // If source is undef or we know it won't be used, use a zero vector
21255 // to break register dependency.
21256 // TODO: use undef instead and let BreakFalseDeps deal with it?
21257 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21258 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21259 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21260 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21261 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21262 return DAG.getMergeValues(RetOps, dl);
21265 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21266 SDValue Src, SDValue Mask, SDValue Base,
21267 SDValue Index, SDValue ScaleOp, SDValue Chain,
21268 const X86Subtarget &Subtarget) {
21270 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21271 // Scale must be constant.
21274 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21275 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21276 SDValue Segment = DAG.getRegister(0, MVT::i32);
21277 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21278 Index.getSimpleValueType().getVectorNumElements());
21280 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21281 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21282 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21283 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21284 return SDValue(Res, 1);
21287 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21288 SDValue Mask, SDValue Base, SDValue Index,
21289 SDValue ScaleOp, SDValue Chain,
21290 const X86Subtarget &Subtarget) {
21292 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21293 // Scale must be constant.
21296 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21297 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21298 SDValue Segment = DAG.getRegister(0, MVT::i32);
21300 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21301 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21302 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21303 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
21304 return SDValue(Res, 0);
21307 /// Handles the lowering of builtin intrinsics that return the value
21308 /// of the extended control register.
21309 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
21311 const X86Subtarget &Subtarget,
21312 SmallVectorImpl<SDValue> &Results) {
21313 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21314 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21317 // The ECX register is used to select the index of the XCR register to
21318 // return.
21320 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
21321 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
21322 Chain = SDValue(N1, 0);
21324 // Reads the content of XCR and returns it in registers EDX:EAX.
21325 if (Subtarget.is64Bit()) {
21326 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
21327 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21330 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
21331 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21334 Chain = HI.getValue(1);
21336 if (Subtarget.is64Bit()) {
21337 // Merge the two 32-bit values into a 64-bit one.
21338 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21339 DAG.getConstant(32, DL, MVT::i8));
21340 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21341 Results.push_back(Chain);
21345 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21346 SDValue Ops[] = { LO, HI };
21347 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21348 Results.push_back(Pair);
21349 Results.push_back(Chain);
21352 /// Handles the lowering of builtin intrinsics that read performance monitor
21353 /// counters (x86_rdpmc).
21354 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
21356 const X86Subtarget &Subtarget,
21357 SmallVectorImpl<SDValue> &Results) {
21358 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21359 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21362 // The ECX register is used to select the index of the performance counter
21363 // to read.
21364 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
21366 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21368 // Reads the content of a 64-bit performance counter and returns it in the
21369 // registers EDX:EAX.
21370 if (Subtarget.is64Bit()) {
21371 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21372 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21375 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21376 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21379 Chain = HI.getValue(1);
21381 if (Subtarget.is64Bit()) {
21382 // The EAX register is loaded with the low-order 32 bits. The EDX register
21383 // is loaded with the supported high-order bits of the counter.
21384 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21385 DAG.getConstant(32, DL, MVT::i8));
21386 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21387 Results.push_back(Chain);
21391 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21392 SDValue Ops[] = { LO, HI };
21393 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21394 Results.push_back(Pair);
21395 Results.push_back(Chain);
21398 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21399 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21400 /// READCYCLECOUNTER nodes.
21401 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
21403 const X86Subtarget &Subtarget,
21404 SmallVectorImpl<SDValue> &Results) {
21405 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21406 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
21409 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21410 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21411 // and the EAX register is loaded with the low-order 32 bits.
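// In other words (illustrative): TSC = ((uint64_t)EDX << 32) | EAX.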
21412 if (Subtarget.is64Bit()) {
21413 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21414 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21417 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21418 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21421 SDValue Chain = HI.getValue(1);
21423 if (Opcode == X86ISD::RDTSCP_DAG) {
21424 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21426 // Instruction RDTSCP loads the IA32_TSC_AUX MSR (address C000_0103H) into
21427 // the ECX register. Add 'ecx' explicitly to the chain.
21428 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21430 // Explicitly store the content of ECX at the location passed as input
21431 // to the 'rdtscp' intrinsic.
21432 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21433 MachinePointerInfo());
21436 if (Subtarget.is64Bit()) {
21437 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21438 // the EAX register is loaded with the low-order 32 bits.
21439 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21440 DAG.getConstant(32, DL, MVT::i8));
21441 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21442 Results.push_back(Chain);
21446 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21447 SDValue Ops[] = { LO, HI };
21448 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21449 Results.push_back(Pair);
21450 Results.push_back(Chain);
21453 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21454 SelectionDAG &DAG) {
21455 SmallVector<SDValue, 2> Results;
21457 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21459 return DAG.getMergeValues(Results, DL);
21462 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21463 MachineFunction &MF = DAG.getMachineFunction();
21464 SDValue Chain = Op.getOperand(0);
21465 SDValue RegNode = Op.getOperand(2);
21466 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21468 report_fatal_error("EH registrations only live in functions using WinEH");
21470 // Cast the operand to an alloca, and remember the frame index.
21471 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
21473 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
21474 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
21476 // Return the chain operand without making any DAG nodes.
21480 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21481 MachineFunction &MF = DAG.getMachineFunction();
21482 SDValue Chain = Op.getOperand(0);
21483 SDValue EHGuard = Op.getOperand(2);
21484 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
21486 report_fatal_error("EHGuard only live in functions using WinEH");
21488 // Cast the operand to an alloca, and remember the frame index.
21489 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
21491 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
21492 EHInfo->EHGuardFrameIndex = FINode->getIndex();
21494 // Return the chain operand without making any DAG nodes.
21498 /// Emit Truncating Store with signed or unsigned saturation.
21500 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21501 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21502 SelectionDAG &DAG) {
21504 SDVTList VTs = DAG.getVTList(MVT::Other);
21505 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21506 SDValue Ops[] = { Chain, Val, Ptr, Undef };
21508 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21509 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21512 /// Emit Masked Truncating Store with signed or unsigned saturation.
21514 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21515 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21516 MachineMemOperand *MMO, SelectionDAG &DAG) {
21518 SDVTList VTs = DAG.getVTList(MVT::Other);
21519 SDValue Ops[] = { Chain, Ptr, Mask, Val };
21521 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
21522 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
21525 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21526 SelectionDAG &DAG) {
21527 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21529 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21532 case llvm::Intrinsic::x86_seh_ehregnode:
21533 return MarkEHRegistrationNode(Op, DAG);
21534 case llvm::Intrinsic::x86_seh_ehguard:
21535 return MarkEHGuard(Op, DAG);
21536 case llvm::Intrinsic::x86_flags_read_u32:
21537 case llvm::Intrinsic::x86_flags_read_u64:
21538 case llvm::Intrinsic::x86_flags_write_u32:
21539 case llvm::Intrinsic::x86_flags_write_u64: {
21540 // We need a frame pointer because this will get lowered to a PUSH/POP sequence.
21542 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21543 MFI.setHasCopyImplyingStackAdjustment(true);
21544 // Don't do anything here; we will expand these intrinsics out later
21545 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21548 case Intrinsic::x86_lwpins32:
21549 case Intrinsic::x86_lwpins64:
21550 case Intrinsic::x86_umwait:
21551 case Intrinsic::x86_tpause: {
21553 SDValue Chain = Op->getOperand(0);
21554 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21558 default: llvm_unreachable("Impossible intrinsic");
21559 case Intrinsic::x86_umwait:
21560 Opcode = X86ISD::UMWAIT;
21562 case Intrinsic::x86_tpause:
21563 Opcode = X86ISD::TPAUSE;
21565 case Intrinsic::x86_lwpins32:
21566 case Intrinsic::x86_lwpins64:
21567 Opcode = X86ISD::LWPINS;
21571 SDValue Operation =
21572 DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2),
21573 Op->getOperand(3), Op->getOperand(4));
21574 SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG);
21575 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21576 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21577 Operation.getValue(1));
21584 switch(IntrData->Type) {
21585 default: llvm_unreachable("Unknown Intrinsic Type");
21588 // Emit the node with the right value type.
21589 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21590 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21592 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21593 // Otherwise return the value from Rand, which is always 0, cast to i32.
21594 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21595 DAG.getConstant(1, dl, Op->getValueType(1)),
21596 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21597 SDValue(Result.getNode(), 1) };
21598 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21600 // Return { result, isValid, chain }.
21601 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21602 SDValue(Result.getNode(), 2));
21604 case GATHER_AVX2: {
21605 SDValue Chain = Op.getOperand(0);
21606 SDValue Src = Op.getOperand(2);
21607 SDValue Base = Op.getOperand(3);
21608 SDValue Index = Op.getOperand(4);
21609 SDValue Mask = Op.getOperand(5);
21610 SDValue Scale = Op.getOperand(6);
21611 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21612 Scale, Chain, Subtarget);
21615 //gather(v1, mask, index, base, scale);
21616 SDValue Chain = Op.getOperand(0);
21617 SDValue Src = Op.getOperand(2);
21618 SDValue Base = Op.getOperand(3);
21619 SDValue Index = Op.getOperand(4);
21620 SDValue Mask = Op.getOperand(5);
21621 SDValue Scale = Op.getOperand(6);
21622 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21626 //scatter(base, mask, index, v1, scale);
21627 SDValue Chain = Op.getOperand(0);
21628 SDValue Base = Op.getOperand(2);
21629 SDValue Mask = Op.getOperand(3);
21630 SDValue Index = Op.getOperand(4);
21631 SDValue Src = Op.getOperand(5);
21632 SDValue Scale = Op.getOperand(6);
21633 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21634 Scale, Chain, Subtarget);
21637 SDValue Hint = Op.getOperand(6);
21638 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21639 assert((HintVal == 2 || HintVal == 3) &&
21640 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21641 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21642 SDValue Chain = Op.getOperand(0);
21643 SDValue Mask = Op.getOperand(2);
21644 SDValue Index = Op.getOperand(3);
21645 SDValue Base = Op.getOperand(4);
21646 SDValue Scale = Op.getOperand(5);
21647 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21650 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21652 SmallVector<SDValue, 2> Results;
21653 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21655 return DAG.getMergeValues(Results, dl);
21657 // Read Performance Monitoring Counters.
21659 SmallVector<SDValue, 2> Results;
21660 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21661 return DAG.getMergeValues(Results, dl);
21663 // Get Extended Control Register.
21665 SmallVector<SDValue, 2> Results;
21666 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21667 return DAG.getMergeValues(Results, dl);
21669 // XTEST intrinsics.
21671 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21672 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21674 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21675 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21676 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21677 Ret, SDValue(InTrans.getNode(), 1));
21681 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21682 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
21683 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21684 DAG.getConstant(-1, dl, MVT::i8));
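// Illustrative note on the trick above: adding 0xFF to the i8 carry-in
// operand overflows, and therefore sets CF, exactly when the carry-in is
// non-zero, so GenCF.getValue(1) re-materializes the incoming carry/borrow
// flag for the flag-consuming node built below.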
21685 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21686 Op.getOperand(4), GenCF.getValue(1));
21687 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21688 Op.getOperand(5), MachinePointerInfo());
21689 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21690 SDValue Results[] = { SetCC, Store };
21691 return DAG.getMergeValues(Results, dl);
21693 case TRUNCATE_TO_MEM_VI8:
21694 case TRUNCATE_TO_MEM_VI16:
21695 case TRUNCATE_TO_MEM_VI32: {
21696 SDValue Mask = Op.getOperand(4);
21697 SDValue DataToTruncate = Op.getOperand(3);
21698 SDValue Addr = Op.getOperand(2);
21699 SDValue Chain = Op.getOperand(0);
21701 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21702 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21704 EVT MemVT = MemIntr->getMemoryVT();
21706 uint16_t TruncationOp = IntrData->Opc0;
21707 switch (TruncationOp) {
21708 case X86ISD::VTRUNC: {
21709 if (isAllOnesConstant(Mask)) // return just a truncate store
21710 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21711 MemIntr->getMemOperand());
21713 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21714 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21716 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21717 MemIntr->getMemOperand(), true /* truncating */);
21719 case X86ISD::VTRUNCUS:
21720 case X86ISD::VTRUNCS: {
21721 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21722 if (isAllOnesConstant(Mask))
21723 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21724 MemIntr->getMemOperand(), DAG);
21726 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21727 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21729 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21730 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21733 llvm_unreachable("Unsupported truncstore intrinsic");
21739 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21740 SelectionDAG &DAG) const {
21741 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21742 MFI.setReturnAddressIsTaken(true);
21744 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21747 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21749 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21752 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21753 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21754 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21755 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21756 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21757 MachinePointerInfo());
21760 // Just load the return address.
21761 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21762 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21763 MachinePointerInfo());
21766 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21767 SelectionDAG &DAG) const {
21768 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21769 return getReturnAddressFrameIndex(DAG);
21772 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21773 MachineFunction &MF = DAG.getMachineFunction();
21774 MachineFrameInfo &MFI = MF.getFrameInfo();
21775 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21776 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21777 EVT VT = Op.getValueType();
21779 MFI.setFrameAddressIsTaken(true);
21781 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21782 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21783 // is not possible to crawl up the stack without looking at the unwind codes.
21785 int FrameAddrIndex = FuncInfo->getFAIndex();
21786 if (!FrameAddrIndex) {
21787 // Set up a frame object for the return address.
21788 unsigned SlotSize = RegInfo->getSlotSize();
21789 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21790 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21791 FuncInfo->setFAIndex(FrameAddrIndex);
21793 return DAG.getFrameIndex(FrameAddrIndex, VT);
21796 unsigned FrameReg =
21797 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21798 SDLoc dl(Op); // FIXME probably not meaningful
21799 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21800 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21801 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21802 "Invalid Frame Register!");
21803 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21805 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21806 MachinePointerInfo());
21810 // FIXME? Maybe this could be a TableGen attribute on some registers and
21811 // this table could be generated automatically from RegInfo.
21812 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21813 SelectionDAG &DAG) const {
21814 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21815 const MachineFunction &MF = DAG.getMachineFunction();
21817 unsigned Reg = StringSwitch<unsigned>(RegName)
21818 .Case("esp", X86::ESP)
21819 .Case("rsp", X86::RSP)
21820 .Case("ebp", X86::EBP)
21821 .Case("rbp", X86::RBP)
21824 if (Reg == X86::EBP || Reg == X86::RBP) {
21825 if (!TFI.hasFP(MF))
21826 report_fatal_error("register " + StringRef(RegName) +
21827 " is allocatable: function has no frame pointer");
21830 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21831 unsigned FrameReg =
21832 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21833 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21834 "Invalid Frame Register!");
21842 report_fatal_error("Invalid register name global variable");
21845 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21846 SelectionDAG &DAG) const {
21847 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21848 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21851 unsigned X86TargetLowering::getExceptionPointerRegister(
21852 const Constant *PersonalityFn) const {
21853 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21854 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21856 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21859 unsigned X86TargetLowering::getExceptionSelectorRegister(
21860 const Constant *PersonalityFn) const {
21861 // Funclet personalities don't use selectors (the runtime does the selection).
21862 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21863 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21866 bool X86TargetLowering::needsFixedCatchObjects() const {
21867 return Subtarget.isTargetWin64();
21870 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21871 SDValue Chain = Op.getOperand(0);
21872 SDValue Offset = Op.getOperand(1);
21873 SDValue Handler = Op.getOperand(2);
21876 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21877 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21878 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21879 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21880 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21881 "Invalid Frame Register!");
21882 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21883 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21885 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21886 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21888 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21889 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21890 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21892 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21893 DAG.getRegister(StoreAddrReg, PtrVT));
21896 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21897 SelectionDAG &DAG) const {
21899 // If the subtarget is not 64-bit, we may need the global base reg
21900 // after isel expand pseudo, i.e., after CGBR pass ran.
21901 // Therefore, ask for the GlobalBaseReg now, so that the pass
21902 // inserts the code for us in case we need it.
21903 // Otherwise, we will end up in a situation where we will
21904 // reference a virtual register that is not defined!
21905 if (!Subtarget.is64Bit()) {
21906 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21907 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21909 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21910 DAG.getVTList(MVT::i32, MVT::Other),
21911 Op.getOperand(0), Op.getOperand(1));
21914 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21915 SelectionDAG &DAG) const {
21917 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21918 Op.getOperand(0), Op.getOperand(1));
21921 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21922 SelectionDAG &DAG) const {
21924 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
21928 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21929 return Op.getOperand(0);
21932 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21933 SelectionDAG &DAG) const {
21934 SDValue Root = Op.getOperand(0);
21935 SDValue Trmp = Op.getOperand(1); // trampoline
21936 SDValue FPtr = Op.getOperand(2); // nested function
21937 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21940 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21941 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21943 if (Subtarget.is64Bit()) {
21944 SDValue OutChains[6];
21946 // Large code-model.
21947 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21948 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21950 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21951 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21953 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
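// Roughly, the bytes stored below form the following 23-byte stub
// (offsets in bytes; the r10/r11 low-3-bit encodings are assumed to be 2 and 3):
//    0: 49 BB <imm64 = FPtr>   movabsq $FPtr, %r11
//   10: 49 BA <imm64 = Nest>   movabsq $Nest, %r10
//   20: 49 FF E3               jmpq    *%r11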
21955 // Load the pointer to the nested function into R11.
21956 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21957 SDValue Addr = Trmp;
21958 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21959 Addr, MachinePointerInfo(TrmpAddr));
21961 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21962 DAG.getConstant(2, dl, MVT::i64));
21964 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21965 /* Alignment = */ 2);
21967 // Load the 'nest' parameter value into R10.
21968 // R10 is specified in X86CallingConv.td
21969 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21970 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21971 DAG.getConstant(10, dl, MVT::i64));
21972 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21973 Addr, MachinePointerInfo(TrmpAddr, 10));
21975 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21976 DAG.getConstant(12, dl, MVT::i64));
21978 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21979 /* Alignment = */ 2);
21981 // Jump to the nested function.
21982 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21983 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21984 DAG.getConstant(20, dl, MVT::i64));
21985 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21986 Addr, MachinePointerInfo(TrmpAddr, 20));
21988 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21989 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21990 DAG.getConstant(22, dl, MVT::i64));
21991 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21992 Addr, MachinePointerInfo(TrmpAddr, 22));
21994 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21996 const Function *Func =
21997 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21998 CallingConv::ID CC = Func->getCallingConv();
22003 llvm_unreachable("Unsupported calling convention");
22004 case CallingConv::C:
22005 case CallingConv::X86_StdCall: {
22006 // Pass 'nest' parameter in ECX.
22007 // Must be kept in sync with X86CallingConv.td
22008 NestReg = X86::ECX;
22010 // Check that ECX wasn't needed by an 'inreg' parameter.
22011 FunctionType *FTy = Func->getFunctionType();
22012 const AttributeList &Attrs = Func->getAttributes();
22014 if (!Attrs.isEmpty() && !Func->isVarArg()) {
22015 unsigned InRegCount = 0;
22018 for (FunctionType::param_iterator I = FTy->param_begin(),
22019 E = FTy->param_end(); I != E; ++I, ++Idx)
22020 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
22021 auto &DL = DAG.getDataLayout();
22022 // FIXME: should only count parameters that are lowered to integers.
22023 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
22026 if (InRegCount > 2) {
22027 report_fatal_error("Nest register in use - reduce number of inreg"
22033 case CallingConv::X86_FastCall:
22034 case CallingConv::X86_ThisCall:
22035 case CallingConv::Fast:
22036 // Pass 'nest' parameter in EAX.
22037 // Must be kept in sync with X86CallingConv.td
22038 NestReg = X86::EAX;
22042 SDValue OutChains[4];
22043 SDValue Addr, Disp;
22045 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22046 DAG.getConstant(10, dl, MVT::i32));
22047 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
22049 // This is storing the opcode for MOV32ri.
22050 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
22051 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
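// Roughly, the bytes stored below form the following 10-byte stub
// (offsets in bytes):
//    0: B8+reg <imm32 = Nest>  movl $Nest, %ecx or %eax
//    5: E9     <rel32 = Disp>  jmp  FPtr   ; Disp = FPtr - (Trmp + 10)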
22053 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
22054 Trmp, MachinePointerInfo(TrmpAddr));
22056 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22057 DAG.getConstant(1, dl, MVT::i32));
22059 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
22060 /* Alignment = */ 1);
22062 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
22063 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22064 DAG.getConstant(5, dl, MVT::i32));
22065 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
22066 Addr, MachinePointerInfo(TrmpAddr, 5),
22067 /* Alignment = */ 1);
22069 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
22070 DAG.getConstant(6, dl, MVT::i32));
22072 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
22073 /* Alignment = */ 1);
22075 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
22079 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
22080 SelectionDAG &DAG) const {
22082 /* The rounding mode is in bits 11:10 of FPSR, and has the following settings:
22084 00 Round to nearest,  01 Round to -inf,  10 Round to +inf,  11 Round to 0
22089 FLT_ROUNDS, on the other hand, expects the following: -1 Undefined, 0 Round to 0, 1 Round to nearest, 2 Round to +inf, 3 Round to -inf
22096 To perform the conversion, we do:
22097 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)  */
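// Worked check of the conversion above: RC=00 -> ((0|0)+1)&3 = 1 (nearest),
// RC=01 -> ((0|2)+1)&3 = 3 (-inf), RC=10 -> ((1|0)+1)&3 = 2 (+inf),
// RC=11 -> ((1|2)+1)&3 = 0 (to zero). Bit 11 lands in bit 0 and bit 10 in
// bit 1, i.e. the two RC bits are swapped before the increment.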
22100 MachineFunction &MF = DAG.getMachineFunction();
22101 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
22102 unsigned StackAlignment = TFI.getStackAlignment();
22103 MVT VT = Op.getSimpleValueType();
22106 // Save FP Control Word to stack slot
22107 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
22108 SDValue StackSlot =
22109 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
22111 MachineMemOperand *MMO =
22112 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
22113 MachineMemOperand::MOStore, 2, 2);
22115 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
22116 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
22117 DAG.getVTList(MVT::Other),
22118 Ops, MVT::i16, MMO);
22120 // Load FP Control Word from stack slot
22122 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
22124 // Transform as necessary
22126 DAG.getNode(ISD::SRL, DL, MVT::i16,
22127 DAG.getNode(ISD::AND, DL, MVT::i16,
22128 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
22129 DAG.getConstant(11, DL, MVT::i8));
22131 DAG.getNode(ISD::SRL, DL, MVT::i16,
22132 DAG.getNode(ISD::AND, DL, MVT::i16,
22133 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
22134 DAG.getConstant(9, DL, MVT::i8));
22137 DAG.getNode(ISD::AND, DL, MVT::i16,
22138 DAG.getNode(ISD::ADD, DL, MVT::i16,
22139 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
22140 DAG.getConstant(1, DL, MVT::i16)),
22141 DAG.getConstant(3, DL, MVT::i16));
22143 return DAG.getNode((VT.getSizeInBits() < 16 ?
22144 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
22147 // Split a unary integer op into 2 half-sized ops.
22148 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
22149 MVT VT = Op.getSimpleValueType();
22150 unsigned NumElems = VT.getVectorNumElements();
22151 unsigned SizeInBits = VT.getSizeInBits();
22152 MVT EltVT = VT.getVectorElementType();
22153 SDValue Src = Op.getOperand(0);
22154 assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
22155 "Src and Op should have the same element type!");
22157 // Extract the Lo/Hi vectors
22159 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
22160 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
22162 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
22163 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22164 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
22165 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
22168 // Decompose 256-bit ops into smaller 128-bit ops.
22169 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
22170 assert(Op.getSimpleValueType().is256BitVector() &&
22171 Op.getSimpleValueType().isInteger() &&
22172 "Only handle AVX 256-bit vector integer operation");
22173 return LowerVectorIntUnary(Op, DAG);
22176 // Decompose 512-bit ops into smaller 256-bit ops.
22177 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
22178 assert(Op.getSimpleValueType().is512BitVector() &&
22179 Op.getSimpleValueType().isInteger() &&
22180 "Only handle AVX 512-bit vector integer operation");
22181 return LowerVectorIntUnary(Op, DAG);
22184 /// Lower a vector CTLZ using a natively supported vector CTLZ instruction.
22186 // i8/i16 vectors are implemented using the dword LZCNT vector instruction
22187 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
22188 // split the vector, perform the operation on its Lo and Hi parts and
22189 // concatenate the results.
22190 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
22191 const X86Subtarget &Subtarget) {
22192 assert(Op.getOpcode() == ISD::CTLZ);
22194 MVT VT = Op.getSimpleValueType();
22195 MVT EltVT = VT.getVectorElementType();
22196 unsigned NumElems = VT.getVectorNumElements();
22198 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
22199 "Unsupported element type");
22201 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
22202 if (NumElems > 16 ||
22203 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
22204 return LowerVectorIntUnary(Op, DAG);
22206 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
22207 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
22208 "Unsupported value type for operation");
22210 // Use the natively supported vector instruction vplzcntd.
22211 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
22212 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
22213 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
22214 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
22216 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
22219 // Lower CTLZ using a PSHUFB lookup table implementation.
22220 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
22221 const X86Subtarget &Subtarget,
22222 SelectionDAG &DAG) {
22223 MVT VT = Op.getSimpleValueType();
22224 int NumElts = VT.getVectorNumElements();
22225 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
22226 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
22228 // Per-nibble leading zero PSHUFB lookup table.
22229 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
22230 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
22231 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
22232 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
22234 SmallVector<SDValue, 64> LUTVec;
22235 for (int i = 0; i < NumBytes; ++i)
22236 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22237 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
22239 // Begin by bitcasting the input to a byte vector, then split those bytes
22240 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
22241 // If the hi input nibble is zero then we add both results together, otherwise
22242 // we just take the hi result (by masking the lo result to zero before the add).
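// Worked example of the per-nibble LUT: for the byte 0x1C the hi nibble 0x1
// is non-zero, so the result is LUT[0x1] = 3 = ctlz_i8(0x1C); for 0x05 the hi
// nibble is zero, so we add LUT[0x0] + LUT[0x5] = 4 + 1 = 5 = ctlz_i8(0x05).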
22244 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
22245 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
22247 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
22248 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
22249 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
22250 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
22252 if (CurrVT.is512BitVector()) {
22253 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22254 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
22255 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22257 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
22260 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
22261 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
22262 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
22263 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
22265 // Merge result back from vXi8 back to VT, working on the lo/hi halves
22266 // of the current vector width in the same way we did for the nibbles.
22267 // If the upper half of the input element is zero then add the halves'
22268 // leading zero counts together, otherwise just use the upper half's.
22269 // Double the width of the result until we are at target width.
22270 while (CurrVT != VT) {
22271 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
22272 int CurrNumElts = CurrVT.getVectorNumElements();
22273 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
22274 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
22275 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
22277 // Check if the upper half of the input element is zero.
22278 if (CurrVT.is512BitVector()) {
22279 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22280 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
22281 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22282 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22284 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
22285 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22287 HiZ = DAG.getBitcast(NextVT, HiZ);
22289 // Move the upper/lower halves to the lower bits as we'll be extending to
22290 // NextVT. Mask the lower result to zero if HiZ is true and add the results together.
22292 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
22293 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
22294 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
22295 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
22296 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
22303 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
22304 const X86Subtarget &Subtarget,
22305 SelectionDAG &DAG) {
22306 MVT VT = Op.getSimpleValueType();
22308 if (Subtarget.hasCDI() &&
22309 // vXi8 vectors need to be promoted to 512-bits for vXi32.
22310 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
22311 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
22313 // Decompose 256-bit ops into smaller 128-bit ops.
22314 if (VT.is256BitVector() && !Subtarget.hasInt256())
22315 return Lower256IntUnary(Op, DAG);
22317 // Decompose 512-bit ops into smaller 256-bit ops.
22318 if (VT.is512BitVector() && !Subtarget.hasBWI())
22319 return Lower512IntUnary(Op, DAG);
22321 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
22322 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
22325 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
22326 SelectionDAG &DAG) {
22327 MVT VT = Op.getSimpleValueType();
22329 unsigned NumBits = VT.getSizeInBits();
22331 unsigned Opc = Op.getOpcode();
22334 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22336 Op = Op.getOperand(0);
22337 if (VT == MVT::i8) {
22338 // Zero extend to i32 since there is no i8 bsr.
22340 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22343 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22344 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22345 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22347 if (Opc == ISD::CTLZ) {
22348 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22351 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22352 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22355 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22358 // Finally xor with NumBits-1.
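// BSR returns the bit index of the most significant set bit, so for a
// non-zero input ctlz = (NumBits-1) - bsr, which equals (NumBits-1) ^ bsr
// since bsr <= NumBits-1; the zero-input CMOV constant 2*NumBits-1 likewise
// becomes NumBits after this xor.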
22359 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22360 DAG.getConstant(NumBits - 1, dl, OpVT));
22363 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
22367 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22368 MVT VT = Op.getSimpleValueType();
22369 unsigned NumBits = VT.getScalarSizeInBits();
22372 if (VT.isVector()) {
22373 SDValue N0 = Op.getOperand(0);
22374 SDValue Zero = DAG.getConstant(0, dl, VT);
22376 // lsb(x) = (x & -x)
22377 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22378 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22380 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22381 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22382 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22383 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22384 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22387 // cttz(x) = ctpop(lsb - 1)
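// Worked example: for x = 0b01101000, lsb = 0b00001000, so (in i8)
// cttz_undef = 7 - ctlz(0b00001000) = 7 - 4 = 3 and
// cttz       = ctpop(0b00000111)    = 3.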
22388 SDValue One = DAG.getConstant(1, dl, VT);
22389 return DAG.getNode(ISD::CTPOP, dl, VT,
22390 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
22393 assert(Op.getOpcode() == ISD::CTTZ &&
22394 "Only scalar CTTZ requires custom lowering");
22396 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22397 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22398 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22400 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22403 DAG.getConstant(NumBits, dl, VT),
22404 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22407 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
22410 /// Break a 256-bit integer operation into two new 128-bit ones and then
22411 /// concatenate the result back.
22412 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
22413 MVT VT = Op.getSimpleValueType();
22415 assert(VT.is256BitVector() && VT.isInteger() &&
22416 "Unsupported value type for operation");
22418 unsigned NumElems = VT.getVectorNumElements();
22421 // Extract the LHS vectors
22422 SDValue LHS = Op.getOperand(0);
22423 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
22424 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
22426 // Extract the RHS vectors
22427 SDValue RHS = Op.getOperand(1);
22428 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
22429 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
22431 MVT EltVT = VT.getVectorElementType();
22432 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22434 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22435 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22436 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22439 /// Break a 512-bit integer operation into two new 256-bit ones and then
22440 /// concatenate the result back.
22441 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
22442 MVT VT = Op.getSimpleValueType();
22444 assert(VT.is512BitVector() && VT.isInteger() &&
22445 "Unsupported value type for operation");
22447 unsigned NumElems = VT.getVectorNumElements();
22450 // Extract the LHS vectors
22451 SDValue LHS = Op.getOperand(0);
22452 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22453 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22455 // Extract the RHS vectors
22456 SDValue RHS = Op.getOperand(1);
22457 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22458 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22460 MVT EltVT = VT.getVectorElementType();
22461 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22463 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22464 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22465 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22468 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22469 MVT VT = Op.getSimpleValueType();
22470 if (VT.getScalarType() == MVT::i1)
22471 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22472 Op.getOperand(0), Op.getOperand(1));
22473 assert(Op.getSimpleValueType().is256BitVector() &&
22474 Op.getSimpleValueType().isInteger() &&
22475 "Only handle AVX 256-bit vector integer operation");
22476 return Lower256IntArith(Op, DAG);
22479 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22480 MVT VT = Op.getSimpleValueType();
22481 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22482 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22483 // 8-bit integer abs to NEG and CMOV.
22485 SDValue N0 = Op.getOperand(0);
22486 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22487 DAG.getConstant(0, DL, VT), N0);
22488 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22489 SDValue(Neg.getNode(), 1)};
22490 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22493 assert(Op.getSimpleValueType().is256BitVector() &&
22494 Op.getSimpleValueType().isInteger() &&
22495 "Only handle AVX 256-bit vector integer operation");
22496 return Lower256IntUnary(Op, DAG);
22499 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22500 MVT VT = Op.getSimpleValueType();
22502 // For AVX1 cases, split to use legal ops (everything but v4i64).
22503 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
22504 return Lower256IntArith(Op, DAG);
22507 unsigned Opcode = Op.getOpcode();
22508 SDValue N0 = Op.getOperand(0);
22509 SDValue N1 = Op.getOperand(1);
22511 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
22512 // using the SMIN/SMAX instructions and flipping the signbit back.
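// This works because XOR with 0x8000 maps unsigned i16 order onto signed
// order (0x0000..0xFFFF becomes 0x8000..0x7FFF monotonically), so
// umin(a,b) == smin(a ^ 0x8000, b ^ 0x8000) ^ 0x8000, and likewise for umax.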
22513 if (VT == MVT::v8i16) {
22514 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
22515 "Unexpected MIN/MAX opcode");
22516 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
22517 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
22518 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
22519 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
22520 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
22521 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
22524 // Else, expand to a compare/select.
22527 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
22528 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
22529 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
22530 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
22531 default: llvm_unreachable("Unknown MINMAX opcode");
22534 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
22535 return DAG.getSelect(DL, VT, Cond, N0, N1);
22538 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22539 SelectionDAG &DAG) {
22541 MVT VT = Op.getSimpleValueType();
22543 if (VT.getScalarType() == MVT::i1)
22544 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22546 // Decompose 256-bit ops into smaller 128-bit ops.
22547 if (VT.is256BitVector() && !Subtarget.hasInt256())
22548 return Lower256IntArith(Op, DAG);
22550 SDValue A = Op.getOperand(0);
22551 SDValue B = Op.getOperand(1);
22553 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22554 // vector pairs, multiply and truncate.
22555 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22556 if (Subtarget.hasInt256()) {
22557 // For 512-bit vectors, split into 256-bit vectors to allow the
22558 // sign-extension to occur.
22559 if (VT == MVT::v64i8)
22560 return Lower512IntArith(Op, DAG);
22562 // For 256-bit vectors, split into 128-bit vectors to allow the
22563 // sign-extension to occur. We don't need this on AVX512BW as we can
22564 // safely sign-extend to v32i16.
22565 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22566 return Lower256IntArith(Op, DAG);
22568 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22569 return DAG.getNode(
22570 ISD::TRUNCATE, dl, VT,
22571 DAG.getNode(ISD::MUL, dl, ExVT,
22572 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22573 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22576 assert(VT == MVT::v16i8 &&
22577 "Pre-AVX2 support only supports v16i8 multiplication");
22578 MVT ExVT = MVT::v8i16;
22580 // Extract the lo parts and sign extend to i16
22581 // We're going to mask off the low byte of each result element of the
22582 // pmullw, so it doesn't matter what's in the high byte of each 16-bit element.
22584 const int LoShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
22585 4, -1, 5, -1, 6, -1, 7, -1};
22586 SDValue ALo = DAG.getVectorShuffle(VT, dl, A, A, LoShufMask);
22587 SDValue BLo = DAG.getVectorShuffle(VT, dl, B, B, LoShufMask);
22588 ALo = DAG.getBitcast(ExVT, ALo);
22589 BLo = DAG.getBitcast(ExVT, BLo);
22591 // Extract the hi parts and sign extend to i16
22592 // We're going to mask off the low byte of each result element of the
22593 // pmullw, so it doesn't matter what's in the high byte of each 16-bit
22595 const int HiShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
22596 12, -1, 13, -1, 14, -1, 15, -1};
22597 SDValue AHi = DAG.getVectorShuffle(VT, dl, A, A, HiShufMask);
22598 SDValue BHi = DAG.getVectorShuffle(VT, dl, B, B, HiShufMask);
22599 AHi = DAG.getBitcast(ExVT, AHi);
22600 BHi = DAG.getBitcast(ExVT, BHi);
22602 // Multiply, mask the lower 8 bits of the lo/hi results and pack
22603 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22604 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22605 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22606 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22607 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22610 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22611 if (VT == MVT::v4i32) {
22612 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22613 "Should not custom lower when pmulld is available!");
22615 // Extract the odd parts.
22616 static const int UnpackMask[] = { 1, -1, 3, -1 };
22617 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22618 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22620 // Multiply the even parts.
22621 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22622 DAG.getBitcast(MVT::v2i64, A),
22623 DAG.getBitcast(MVT::v2i64, B));
22624 // Now multiply odd parts.
22625 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22626 DAG.getBitcast(MVT::v2i64, Aodds),
22627 DAG.getBitcast(MVT::v2i64, Bodds));
22629 Evens = DAG.getBitcast(VT, Evens);
22630 Odds = DAG.getBitcast(VT, Odds);
22632 // Merge the two vectors back together with a shuffle. This expands into 2 instructions.
22634 static const int ShufMask[] = { 0, 4, 2, 6 };
22635 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22638 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22639 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22640 assert(!Subtarget.hasDQI() && "DQI should use MULLQ");
22642 // Ahi = psrlqi(a, 32);
22643 // Bhi = psrlqi(b, 32);
22645 // AloBlo = pmuludq(a, b);
22646 // AloBhi = pmuludq(a, Bhi);
22647 // AhiBlo = pmuludq(Ahi, b);
22649 // Hi = psllqi(AloBhi + AhiBlo, 32);
22650 // return AloBlo + Hi;
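// This is schoolbook multiplication on 32-bit halves:
//   a * b = (2^32*Ahi + Alo) * (2^32*Bhi + Blo)
//         = AloBlo + 2^32*(AloBhi + AhiBlo) + 2^64*AhiBhi,
// and the 2^64 term vanishes modulo 2^64, leaving the sum formed below.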
22651 KnownBits AKnown, BKnown;
22652 DAG.computeKnownBits(A, AKnown);
22653 DAG.computeKnownBits(B, BKnown);
22655 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22656 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
22657 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
22659 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22660 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
22661 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
22663 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22665 // Only multiply lo/hi halves that aren't known to be zero.
22666 SDValue AloBlo = Zero;
22667 if (!ALoIsZero && !BLoIsZero)
22668 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
22670 SDValue AloBhi = Zero;
22671 if (!ALoIsZero && !BHiIsZero) {
22672 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22673 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
22676 SDValue AhiBlo = Zero;
22677 if (!AHiIsZero && !BLoIsZero) {
22678 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22679 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
22682 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22683 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22685 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22688 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22689 SelectionDAG &DAG) {
22691 MVT VT = Op.getSimpleValueType();
22693 // Decompose 256-bit ops into smaller 128-bit ops.
22694 if (VT.is256BitVector() && !Subtarget.hasInt256())
22695 return Lower256IntArith(Op, DAG);
22697 // Only i8 vectors should need custom lowering after this.
22698 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22699 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22700 "Unsupported vector type");
22702 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22703 // logical shift down the upper half and pack back to i8.
22704 SDValue A = Op.getOperand(0);
22705 SDValue B = Op.getOperand(1);
22707 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22708 // and then ashr/lshr the upper bits down to the lower bits before the multiply.
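// For example, the pre-SSE41 lo-half shuffle mask <-1,0,-1,1,...,-1,7> used
// below places byte k in the high byte of i16 element k (low byte undef);
// shifting each i16 right by 8, arithmetically for MULHS or logically for
// MULHU, then yields the sign- or zero-extended value.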
22709 unsigned Opcode = Op.getOpcode();
22710 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22711 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22713 // For 512-bit vectors, split into 256-bit vectors to allow the
22714 // sign-extension to occur.
22715 if (VT == MVT::v64i8)
22716 return Lower512IntArith(Op, DAG);
22718 // AVX2 implementations - extend xmm subvectors to ymm.
22719 if (Subtarget.hasInt256()) {
22720 unsigned NumElems = VT.getVectorNumElements();
22721 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22722 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22724 if (VT == MVT::v32i8) {
22725 if (Subtarget.canExtendTo512BW()) {
22726 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22727 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22728 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22729 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22730 DAG.getConstant(8, dl, MVT::v32i16));
22731 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22733 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22734 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22735 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22736 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22737 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22738 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22739 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22740 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22741 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22742 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22743 DAG.getConstant(8, dl, MVT::v16i16));
22744 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22745 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22746 DAG.getConstant(8, dl, MVT::v16i16));
22747 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22748 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22749 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22750 16, 17, 18, 19, 20, 21, 22, 23};
22751 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22752 24, 25, 26, 27, 28, 29, 30, 31};
22753 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22754 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22755 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22758 assert(VT == MVT::v16i8 && "Unexpected VT");
22760 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22761 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22762 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22763 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22764 DAG.getConstant(8, dl, MVT::v16i16));
22765 // If we have BWI we can use truncate instruction.
22766 if (Subtarget.hasBWI())
22767 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22768 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22769 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22770 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22773 assert(VT == MVT::v16i8 &&
22774 "Pre-AVX2 support only supports v16i8 multiplication");
22775 MVT ExVT = MVT::v8i16;
22776 unsigned ExSSE41 = ISD::MULHU == Opcode ? ISD::ZERO_EXTEND_VECTOR_INREG
22777 : ISD::SIGN_EXTEND_VECTOR_INREG;
22779 // Extract the lo parts and zero/sign extend to i16.
22781 if (Subtarget.hasSSE41()) {
22782 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
22783 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
22785 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22786 -1, 4, -1, 5, -1, 6, -1, 7};
22787 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22788 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22789 ALo = DAG.getBitcast(ExVT, ALo);
22790 BLo = DAG.getBitcast(ExVT, BLo);
22791 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22792 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22795 // Extract the hi parts and zero/sign extend to i16.
22797 if (Subtarget.hasSSE41()) {
22798 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22799 -1, -1, -1, -1, -1, -1, -1, -1};
22800 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22801 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22802 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
22803 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
22805 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22806 -1, 12, -1, 13, -1, 14, -1, 15};
22807 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22808 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22809 AHi = DAG.getBitcast(ExVT, AHi);
22810 BHi = DAG.getBitcast(ExVT, BHi);
22811 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22812 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22815 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
22816 // pack back to v16i8.
22817 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22818 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22819 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22820 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22821 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22824 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22825 assert(Subtarget.isTargetWin64() && "Unexpected target");
22826 EVT VT = Op.getValueType();
22827 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22828 "Unexpected return type for lowering");
22832 switch (Op->getOpcode()) {
22833 default: llvm_unreachable("Unexpected request for libcall!");
22834 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22835 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22836 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22837 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22838 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22839 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22843 SDValue InChain = DAG.getEntryNode();
22845 TargetLowering::ArgListTy Args;
22846 TargetLowering::ArgListEntry Entry;
22847 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22848 EVT ArgVT = Op->getOperand(i).getValueType();
22849 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22850 "Unexpected argument type for lowering");
22851 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22852 Entry.Node = StackPtr;
22853 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22854 MachinePointerInfo(), /* Alignment = */ 16);
22855 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22856 Entry.Ty = PointerType::get(ArgTy,0);
22857 Entry.IsSExt = false;
22858 Entry.IsZExt = false;
22859 Args.push_back(Entry);
22862 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22863 getPointerTy(DAG.getDataLayout()));
22865 TargetLowering::CallLoweringInfo CLI(DAG);
22866 CLI.setDebugLoc(dl)
22869 getLibcallCallingConv(LC),
22870 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22873 .setSExtResult(isSigned)
22874 .setZExtResult(!isSigned);
22876 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22877 return DAG.getBitcast(VT, CallInfo.first);
22880 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22881 SelectionDAG &DAG) {
22882 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22883 MVT VT = Op0.getSimpleValueType();
22886 // Decompose 256-bit ops into smaller 128-bit ops.
22887 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22888 unsigned Opcode = Op.getOpcode();
22889 unsigned NumElems = VT.getVectorNumElements();
22890 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22891 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22892 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22893 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22894 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22895 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22896 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22898 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22899 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22901 return DAG.getMergeValues(Ops, dl);
22904 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22905 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22906 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22908 int NumElts = VT.getVectorNumElements();
22910 // PMULxD operations multiply each even value (starting at 0) of LHS with
22911 // the related value of RHS and produce a widened result.
22912 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22913 // => <2 x i64> <ae|cg>
22915 // In other words, to have all the results, we need to perform two PMULxD:
22916 // 1. one with the even values.
22917 // 2. one with the odd values.
22918 // To achieve #2, we need to place the odd values at an even position.
22920 // Place the odd value at an even position (basically, shift all values 1
22921 // step to the left):
22922 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22923 // <a|b|c|d> => <b|undef|d|undef>
22924 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22925 makeArrayRef(&Mask[0], NumElts));
22926 // <e|f|g|h> => <f|undef|h|undef>
22927 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22928 makeArrayRef(&Mask[0], NumElts));
22930 // Emit two multiplies, one for the lower 2 ints and one for the higher 2 ints.
22932 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22933 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22935 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22936 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22937 // => <2 x i64> <ae|cg>
22938 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22939 DAG.getBitcast(MulVT, Op0),
22940 DAG.getBitcast(MulVT, Op1)));
22941 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22942 // => <2 x i64> <bf|dh>
22943 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22944 DAG.getBitcast(MulVT, Odd0),
22945 DAG.getBitcast(MulVT, Odd1)));
22947 // Shuffle it back into the right order.
22948 SmallVector<int, 16> HighMask(NumElts);
22949 SmallVector<int, 16> LowMask(NumElts);
22950 for (int i = 0; i != NumElts; ++i) {
22951 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22952 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22955 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22956 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
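// E.g. for v4i32 (NumElts == 4) this gives LowMask = <0, 4, 2, 6> and
// HighMask = <1, 5, 3, 7>, interleaving the low and high i32 halves of the
// <ae|cg> and <bf|dh> products back into <ae, bf, cg, dh> low/high vectors.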
22958 // If we have a signed multiply but no PMULDQ, fix up the high parts of
22959 // an unsigned multiply.
22960 if (IsSigned && !Subtarget.hasSSE41()) {
22961 SDValue ShAmt = DAG.getConstant(
22963 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22964 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22965 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22966 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22967 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22969 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22970 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
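// This is the usual signed-from-unsigned fixup: for each lane,
// mulhs(a, b) == mulhu(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0), and the
// arithmetic shift by 31 materializes the (x < 0) conditions as all-ones masks.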
22973 // The first result of MUL_LOHI is actually the low value, followed by the high value.
22975 SDValue Ops[] = {Lows, Highs};
22976 return DAG.getMergeValues(Ops, dl);
22979 // Return true if the required (according to Opcode) shift-imm form is natively
22980 // supported by the Subtarget
22981 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22983 if (VT.getScalarSizeInBits() < 16)
22986 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22987 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22990 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22991 (VT.is256BitVector() && Subtarget.hasInt256());
22993 bool AShift = LShift && (Subtarget.hasAVX512() ||
22994 (VT != MVT::v2i64 && VT != MVT::v4i64));
22995 return (Opcode == ISD::SRA) ? AShift : LShift;
22998 // The shift amount is a variable, but it is the same for all vector lanes.
22999 // These instructions are defined together with shift-immediate.
23001 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
23003 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
23006 // Return true if the required (according to Opcode) variable-shift form is
23007 // natively supported by the Subtarget
23008 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
23011 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
23014 // vXi16 supported only on AVX-512, BWI
23015 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
23018 if (Subtarget.hasAVX512())
23021 bool LShift = VT.is128BitVector() || VT.is256BitVector();
23022 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
23023 return (Opcode == ISD::SRA) ? AShift : LShift;
23026 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
23027 const X86Subtarget &Subtarget) {
23028 MVT VT = Op.getSimpleValueType();
23030 SDValue R = Op.getOperand(0);
23031 SDValue Amt = Op.getOperand(1);
23033 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
23034 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23036 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
23037 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
23038 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
23039 SDValue Ex = DAG.getBitcast(ExVT, R);
23041 // ashr(R, 63) === cmp_slt(R, 0)
23042 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
23043 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
23044 "Unsupported PCMPGT op");
23045 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
23046 getZeroVector(VT, Subtarget, DAG, dl), R);
23049 if (ShiftAmt >= 32) {
23050 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
23052 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
23053 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23054 ShiftAmt - 32, DAG);
23055 if (VT == MVT::v2i64)
23056 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
23057 if (VT == MVT::v4i64)
23058 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23059 {9, 1, 11, 3, 13, 5, 15, 7});
23061 // SRA upper i32, SHL whole i64 and select lower i32.
23062 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
23065 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
23066 Lower = DAG.getBitcast(ExVT, Lower);
23067 if (VT == MVT::v2i64)
23068 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
23069 if (VT == MVT::v4i64)
23070 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
23071 {8, 1, 10, 3, 12, 5, 14, 7});
23073 return DAG.getBitcast(VT, Ex);
23076 // Optimize shl/srl/sra with constant shift amount.
23077 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23078 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
23079 uint64_t ShiftAmt = ShiftConst->getZExtValue();
23081 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23082 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23084 // i64 SRA needs to be performed as partial shifts.
23085 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
23086 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
23087 Op.getOpcode() == ISD::SRA)
23088 return ArithmeticShiftRight64(ShiftAmt);
23090 if (VT == MVT::v16i8 ||
23091 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
23092 VT == MVT::v64i8) {
23093 unsigned NumElts = VT.getVectorNumElements();
23094 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
23096 // Simple i8 add case
23097 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
23098 return DAG.getNode(ISD::ADD, dl, VT, R, R);
23100 // ashr(R, 7) === cmp_slt(R, 0)
23101 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
23102 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
23103 if (VT.is512BitVector()) {
23104 assert(VT == MVT::v64i8 && "Unexpected element type!");
23105 SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R,
23107 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
23109 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
23112 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
23113 if (VT == MVT::v16i8 && Subtarget.hasXOP())
23116 if (Op.getOpcode() == ISD::SHL) {
23117 // Make a large shift.
23118 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
23120 SHL = DAG.getBitcast(VT, SHL);
23121 // Zero out the rightmost bits.
23122 return DAG.getNode(ISD::AND, dl, VT, SHL,
23123 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
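// E.g. for ShiftAmt == 3 the mask is uint8_t(-1U << 3) == 0xF8, clearing the
// low bits of each byte that the wider i16 shift pulled in from the byte below.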
23125 if (Op.getOpcode() == ISD::SRL) {
23126 // Make a large shift.
23127 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
23129 SRL = DAG.getBitcast(VT, SRL);
23130 // Zero out the leftmost bits.
23131 return DAG.getNode(ISD::AND, dl, VT, SRL,
23132 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
23134 if (Op.getOpcode() == ISD::SRA) {
23135 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
23136 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23138 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
23139 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
23140 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
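// E.g. for ShiftAmt == 2, Mask == 0x20: the logical shift leaves each byte's
// sign bit at bit 5, and xor'ing with then subtracting 0x20 sign-extends it
// through the upper bits (0xFC >> 2 == 0x3F; 0x3F ^ 0x20 == 0x1F; 0x1F - 0x20
// == 0xFF == -1, as expected for ashr(-4, 2)).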
23143 llvm_unreachable("Unknown shift opcode.");
23148 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23149 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
23150 if (!Subtarget.hasXOP() &&
23151 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
23152 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
23154 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
23155 unsigned SubVectorScale = 1;
23156 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23158 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
23159 Amt = Amt.getOperand(0);
23162 // Peek through any splat that was introduced for i64 shift vectorization.
23163 int SplatIndex = -1;
23164 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
23165 if (SVN->isSplat()) {
23166 SplatIndex = SVN->getSplatIndex();
23167 Amt = Amt.getOperand(0);
23168 assert(SplatIndex < (int)VT.getVectorNumElements() &&
23169 "Splat shuffle referencing second operand");
23172 if (Amt.getOpcode() != ISD::BITCAST ||
23173 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
23176 Amt = Amt.getOperand(0);
23177 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23178 (SubVectorScale * VT.getVectorNumElements());
23179 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
23180 uint64_t ShiftAmt = 0;
23181 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
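// E.g. a v2i64 shift whose amount came from a v4i32 BUILD_VECTOR gives
// Ratio == 2 and RatioInLog2 == 1, so each i32 constant below is packed into
// ShiftAmt at bit offset i * 32.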
23182 for (unsigned i = 0; i != Ratio; ++i) {
23183 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
23187 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
23190 // Check remaining shift amounts (if not a splat).
23191 if (SplatIndex < 0) {
23192 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23193 uint64_t ShAmt = 0;
23194 for (unsigned j = 0; j != Ratio; ++j) {
23195 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
23199 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
23201 if (ShAmt != ShiftAmt)
23206 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23207 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23209 if (Op.getOpcode() == ISD::SRA)
23210 return ArithmeticShiftRight64(ShiftAmt);
23216 // Determine if V is a splat value, and return the scalar.
23217 static SDValue IsSplatValue(MVT VT, SDValue V, const SDLoc &dl,
23218 SelectionDAG &DAG, const X86Subtarget &Subtarget,
23220 V = peekThroughEXTRACT_SUBVECTORs(V);
23222 // Check if this is a splat build_vector node.
23223 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V)) {
23224 SDValue SplatAmt = BV->getSplatValue();
23225 if (SplatAmt && SplatAmt.isUndef())
23230 // Check for SUB(SPLAT_BV, SPLAT) cases from rotate patterns.
23231 if (V.getOpcode() == ISD::SUB &&
23232 !SupportedVectorVarShift(VT, Subtarget, Opcode)) {
23233 SDValue LHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(0));
23234 SDValue RHS = peekThroughEXTRACT_SUBVECTORs(V.getOperand(1));
23236 // Ensure that the corresponding splat BV element is not UNDEF.
23237 BitVector UndefElts;
23238 BuildVectorSDNode *BV0 = dyn_cast<BuildVectorSDNode>(LHS);
23239 ShuffleVectorSDNode *SVN1 = dyn_cast<ShuffleVectorSDNode>(RHS);
23240 if (BV0 && SVN1 && BV0->getSplatValue(&UndefElts) && SVN1->isSplat()) {
23241 unsigned SplatIdx = (unsigned)SVN1->getSplatIndex();
23242 if (!UndefElts[SplatIdx])
23243 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23244 VT.getVectorElementType(), V,
23245 DAG.getIntPtrConstant(SplatIdx, dl));
23249 // Check if this is a shuffle node doing a splat.
23250 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(V);
23251 if (!SVN || !SVN->isSplat())
23254 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
23255 SDValue InVec = V.getOperand(0);
23256 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
23257 assert((SplatIdx < VT.getVectorNumElements()) &&
23258 "Unexpected shuffle index found!");
23259 return InVec.getOperand(SplatIdx);
23260 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
23261 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(InVec.getOperand(2)))
23262 if (C->getZExtValue() == SplatIdx)
23263 return InVec.getOperand(1);
23266 // Avoid introducing an extract element from a shuffle.
23267 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
23268 VT.getVectorElementType(), InVec,
23269 DAG.getIntPtrConstant(SplatIdx, dl));
23272 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
23273 const X86Subtarget &Subtarget) {
23274 MVT VT = Op.getSimpleValueType();
23276 SDValue R = Op.getOperand(0);
23277 SDValue Amt = Op.getOperand(1);
23278 unsigned Opcode = Op.getOpcode();
23280 unsigned X86OpcI = (Opcode == ISD::SHL) ? X86ISD::VSHLI :
23281 (Opcode == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23283 unsigned X86OpcV = (Opcode == ISD::SHL) ? X86ISD::VSHL :
23284 (Opcode == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
23286 Amt = peekThroughEXTRACT_SUBVECTORs(Amt);
23288 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) {
23289 if (SDValue BaseShAmt = IsSplatValue(VT, Amt, dl, DAG, Subtarget, Opcode)) {
23290 MVT EltVT = VT.getVectorElementType();
23291 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
23292 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
23293 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
23294 else if (EltVT.bitsLT(MVT::i32))
23295 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
23297 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
23301 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23302 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
23303 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
23304 Amt = Amt.getOperand(0);
23305 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23306 VT.getVectorNumElements();
23307 std::vector<SDValue> Vals(Ratio);
23308 for (unsigned i = 0; i != Ratio; ++i)
23309 Vals[i] = Amt.getOperand(i);
23310 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23311 for (unsigned j = 0; j != Ratio; ++j)
23312 if (Vals[j] != Amt.getOperand(i + j))
23316 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
23317 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
23322 // Convert a shift/rotate left amount to a multiplication scale factor.
23323 static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
23324 const X86Subtarget &Subtarget,
23325 SelectionDAG &DAG) {
23326 MVT VT = Amt.getSimpleValueType();
23327 if (!(VT == MVT::v8i16 || VT == MVT::v4i32 ||
23328 (Subtarget.hasInt256() && VT == MVT::v16i16) ||
23329 (!Subtarget.hasAVX512() && VT == MVT::v16i8)))
23332 if (ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
23333 SmallVector<SDValue, 8> Elts;
23334 MVT SVT = VT.getVectorElementType();
23335 unsigned SVTBits = SVT.getSizeInBits();
23336 APInt One(SVTBits, 1);
23337 unsigned NumElems = VT.getVectorNumElements();
23339 for (unsigned i = 0; i != NumElems; ++i) {
23340 SDValue Op = Amt->getOperand(i);
23341 if (Op->isUndef()) {
23342 Elts.push_back(Op);
23346 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23347 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23348 uint64_t ShAmt = C.getZExtValue();
23349 if (ShAmt >= SVTBits) {
23350 Elts.push_back(DAG.getUNDEF(SVT));
23353 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23355 return DAG.getBuildVector(VT, dl, Elts);
23358 // If the target doesn't support variable shifts, use either FP conversion
23359 // or integer multiplication to avoid shifting each element individually.
23360 if (VT == MVT::v4i32) {
23361 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23362 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt,
23363 DAG.getConstant(0x3f800000U, dl, VT));
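// The constant 0x3f800000 is 1.0f, so for a lane with Amt == 5 this forms
// (5 << 23) + 0x3f800000 == 0x42000000 == 32.0f, and the FP_TO_SINT below
// recovers the scale factor 2^5 == 32.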
23364 Amt = DAG.getBitcast(MVT::v4f32, Amt);
23365 return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt);
23368 // AVX2 can more effectively perform this as a zext/trunc to/from v8i32.
23369 if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) {
23370 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23371 SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z));
23372 SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z));
23373 Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG);
23374 Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG);
23375 if (Subtarget.hasSSE41())
23376 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23378 return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, Lo),
23379 DAG.getBitcast(VT, Hi),
23380 {0, 2, 4, 6, 8, 10, 12, 14});
23386 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
23387 SelectionDAG &DAG) {
23388 MVT VT = Op.getSimpleValueType();
23390 SDValue R = Op.getOperand(0);
23391 SDValue Amt = Op.getOperand(1);
23392 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23394 assert(VT.isVector() && "Custom lowering only for vector shifts!");
23395 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
23397 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
23400 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
23403 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
23406 // XOP has 128-bit variable logical/arithmetic shifts.
23407 // +ve/-ve Amt = shift left/right.
23408 if (Subtarget.hasXOP() &&
23409 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
23410 VT == MVT::v8i16 || VT == MVT::v16i8)) {
23411 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
23412 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
23413 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
23415 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
23416 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
23417 if (Op.getOpcode() == ISD::SRA)
23418 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
23421 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
23422 // shifts per-lane and then shuffle the partial results back together.
23423 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23424 // Splat the shift amounts so the scalar shifts above will catch it.
23425 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23426 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23427 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23428 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23429 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23432 // i64 vector arithmetic shift can be emulated with the transform:
23433 // M = lshr(SIGN_MASK, Amt)
23434 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
23435 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23436 Op.getOpcode() == ISD::SRA) {
23437 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23438 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23439 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23440 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23441 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
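// E.g. for Amt == 4 each lane's M is 0x0800000000000000: the single bit where
// the sign lands after the logical shift, so the xor/sub pair sign-extends it,
// mirroring the byte-sized version of this identity used above.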
23445 // If possible, lower this shift as a sequence of two shifts by
23446 // constant plus a BLENDing shuffle instead of scalarizing it.
23448 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23450 // Could be rewritten as:
23451 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23453 // The advantage is that the two shifts from the example would be
23454 // lowered as X86ISD::VSRLI nodes in parallel before blending.
23455 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23456 (VT == MVT::v16i16 && Subtarget.hasInt256()))) {
23457 SDValue Amt1, Amt2;
23458 unsigned NumElts = VT.getVectorNumElements();
23459 SmallVector<int, 8> ShuffleMask;
23460 for (unsigned i = 0; i != NumElts; ++i) {
23461 SDValue A = Amt->getOperand(i);
23463 ShuffleMask.push_back(SM_SentinelUndef);
23466 if (!Amt1 || Amt1 == A) {
23467 ShuffleMask.push_back(i);
23471 if (!Amt2 || Amt2 == A) {
23472 ShuffleMask.push_back(i + NumElts);
23479 // Only perform this blend if we can perform it without loading a mask.
23480 if (ShuffleMask.size() == NumElts && Amt1 && Amt2 &&
23481 isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2) &&
23482 (VT != MVT::v16i16 ||
23483 is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) &&
23484 (VT == MVT::v4i32 || Subtarget.hasSSE41() ||
23485 Op.getOpcode() != ISD::SHL || canWidenShuffleElements(ShuffleMask))) {
23487 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23488 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23490 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23491 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23492 return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
23496 // If possible, lower this packed shift into a vector multiply instead of
23497 // expanding it into a sequence of scalar shifts.
23498 if (Op.getOpcode() == ISD::SHL)
23499 if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG))
23500 return DAG.getNode(ISD::MUL, dl, VT, R, Scale);
23502 // v4i32 Non Uniform Shifts.
23503 // If the shift amount is constant we can shift each lane using the SSE2
23504 // immediate shifts, else we need to zero-extend each lane to the lower i64
23505 // and shift using the SSE2 variable shifts.
23506 // The separate results can then be blended together.
23507 if (VT == MVT::v4i32) {
23508 unsigned Opc = Op.getOpcode();
23509 SDValue Amt0, Amt1, Amt2, Amt3;
23511 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23512 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23513 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23514 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23516 // ISD::SHL is handled above but we include it here for completeness.
23519 llvm_unreachable("Unknown target vector shift node");
23521 Opc = X86ISD::VSHL;
23524 Opc = X86ISD::VSRL;
23527 Opc = X86ISD::VSRA;
23530 // The SSE2 shifts use the lower i64 as the same shift amount for
23531 // all lanes and the upper i64 is ignored. On AVX we're better off
23532 // just zero-extending, but for SSE just duplicating the top 16-bits is
23533 // cheaper and has the same effect for out of range values.
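// E.g. for lane 0 the {0, 1, 1, 1, ...} v8i16 shuffle below yields a low i64
// of <amt0.lo16, amt0.hi16, amt0.hi16, amt0.hi16>; any in-range amount has a
// zero upper half, so the i64 still equals amt0, while out-of-range amounts
// stay out of range and are treated the same by the shift.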
23534 if (Subtarget.hasAVX()) {
23535 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23536 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23537 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23538 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23539 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23541 SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt);
23542 SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23543 {4, 5, 6, 7, -1, -1, -1, -1});
23544 Amt0 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23545 {0, 1, 1, 1, -1, -1, -1, -1});
23546 Amt1 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01,
23547 {2, 3, 3, 3, -1, -1, -1, -1});
23548 Amt2 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23549 {0, 1, 1, 1, -1, -1, -1, -1});
23550 Amt3 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt23, Amt23,
23551 {2, 3, 3, 3, -1, -1, -1, -1});
23555 SDValue R0 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt0));
23556 SDValue R1 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt1));
23557 SDValue R2 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt2));
23558 SDValue R3 = DAG.getNode(Opc, dl, VT, R, DAG.getBitcast(VT, Amt3));
23560 // Merge the shifted lane results optimally with/without PBLENDW.
23561 // TODO - ideally shuffle combining would handle this.
23562 if (Subtarget.hasSSE41()) {
23563 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23564 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23565 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23567 SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5});
23568 SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7});
23569 return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7});
23572 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23573 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23574 // make the existing SSE solution better.
23575 // NOTE: We honor preferred vector width before promoting to 512-bits.
23576 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23577 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23578 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23579 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23580 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23581 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23582 "Unexpected vector type");
23583 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23584 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23586 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23587 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23588 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23589 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23590 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23593 if (VT == MVT::v16i8 ||
23594 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23595 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23596 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23597 unsigned ShiftOpcode = Op->getOpcode();
23599 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23600 if (VT.is512BitVector()) {
23601 // On AVX512BW targets we make use of the fact that VSELECT lowers
23602 // to a masked blend which selects bytes based just on the sign bit
23603 // extracted to a mask.
23604 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23605 V0 = DAG.getBitcast(VT, V0);
23606 V1 = DAG.getBitcast(VT, V1);
23607 Sel = DAG.getBitcast(VT, Sel);
23608 Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel,
23610 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23611 } else if (Subtarget.hasSSE41()) {
23612 // On SSE41 targets we make use of the fact that VSELECT lowers
23613 // to PBLENDVB which selects bytes based just on the sign bit.
23614 V0 = DAG.getBitcast(VT, V0);
23615 V1 = DAG.getBitcast(VT, V1);
23616 Sel = DAG.getBitcast(VT, Sel);
23617 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23619 // On pre-SSE41 targets we test for the sign bit by comparing to
23620 // zero - a negative value will set all bits of the lanes to true
23621 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23622 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23623 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23624 return DAG.getSelect(dl, SelVT, C, V0, V1);
23627 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23628 // We can safely do this using i16 shifts as we're only interested in
23629 // the 3 lower bits of each byte.
23630 Amt = DAG.getBitcast(ExtVT, Amt);
23631 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23632 Amt = DAG.getBitcast(VT, Amt);
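// After the << 5, bit 7 of each byte of Amt holds bit 2 of the original shift
// amount; each SignBitSelect stage below keys off that top bit, and the
// following Amt + Amt step moves the next lower amount bit into position.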
23634 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23635 // r = VSELECT(r, shift(r, 4), a);
23637 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23638 R = SignBitSelect(VT, Amt, M, R);
23641 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23643 // r = VSELECT(r, shift(r, 2), a);
23644 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23645 R = SignBitSelect(VT, Amt, M, R);
23648 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23650 // return VSELECT(r, shift(r, 1), a);
23651 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23652 R = SignBitSelect(VT, Amt, M, R);
23656 if (Op->getOpcode() == ISD::SRA) {
23657 // For SRA we need to unpack each byte to the higher byte of an i16 vector
23658 // so we can correctly sign extend. We don't care what happens to the lower byte.
23660 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23661 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23662 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23663 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23664 ALo = DAG.getBitcast(ExtVT, ALo);
23665 AHi = DAG.getBitcast(ExtVT, AHi);
23666 RLo = DAG.getBitcast(ExtVT, RLo);
23667 RHi = DAG.getBitcast(ExtVT, RHi);
23669 // r = VSELECT(r, shift(r, 4), a);
23670 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23671 DAG.getConstant(4, dl, ExtVT));
23672 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23673 DAG.getConstant(4, dl, ExtVT));
23674 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23675 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23678 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23679 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23681 // r = VSELECT(r, shift(r, 2), a);
23682 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23683 DAG.getConstant(2, dl, ExtVT));
23684 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23685 DAG.getConstant(2, dl, ExtVT));
23686 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23687 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23690 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23691 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23693 // r = VSELECT(r, shift(r, 1), a);
23694 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23695 DAG.getConstant(1, dl, ExtVT));
23696 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23697 DAG.getConstant(1, dl, ExtVT));
23698 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23699 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23701 // Logical shift the result back to the lower byte, leaving a zero upper
23703 // byte, meaning that we can safely pack with PACKUSWB.
23705 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23707 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23708 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23712 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23713 MVT ExtVT = MVT::v8i32;
23714 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23715 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23716 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23717 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23718 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
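// Note the operand order: the amounts land in the low i16 of each i32 lane
// (UNPCKL/H(Amt, Z)) while the values land in the high i16 (UNPCKL/H(Z, R)),
// so the i32 shift mimics the i16 one and the SRL by 16 below moves the
// result back to the low half ready for PACKUS.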
23719 ALo = DAG.getBitcast(ExtVT, ALo);
23720 AHi = DAG.getBitcast(ExtVT, AHi);
23721 RLo = DAG.getBitcast(ExtVT, RLo);
23722 RHi = DAG.getBitcast(ExtVT, RHi);
23723 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23724 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23725 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23726 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23727 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23730 if (VT == MVT::v8i16) {
23731 unsigned ShiftOpcode = Op->getOpcode();
23733 // If we have a constant shift amount, the non-SSE41 path is best as
23734 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
23735 bool UseSSE41 = Subtarget.hasSSE41() &&
23736 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23738 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23739 // On SSE41 targets we make use of the fact that VSELECT lowers
23740 // to PBLENDVB which selects bytes based just on the sign bit.
23742 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23743 V0 = DAG.getBitcast(ExtVT, V0);
23744 V1 = DAG.getBitcast(ExtVT, V1);
23745 Sel = DAG.getBitcast(ExtVT, Sel);
23746 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23748 // On pre-SSE41 targets we splat the sign bit - a negative value will
23749 // set all bits of the lanes to true and VSELECT uses that in
23750 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23752 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23753 return DAG.getSelect(dl, VT, C, V0, V1);
23756 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23758 // On SSE41 targets we need to replicate the shift mask in both
23759 // bytes for PBLENDVB.
23762 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23763 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23765 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23768 // r = VSELECT(r, shift(r, 8), a);
23769 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23770 R = SignBitSelect(Amt, M, R);
23773 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23775 // r = VSELECT(r, shift(r, 4), a);
23776 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23777 R = SignBitSelect(Amt, M, R);
23780 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23782 // r = VSELECT(r, shift(r, 2), a);
23783 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23784 R = SignBitSelect(Amt, M, R);
23787 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23789 // return VSELECT(r, shift(r, 1), a);
23790 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23791 R = SignBitSelect(Amt, M, R);
23795 // Decompose 256-bit shifts into smaller 128-bit shifts.
23796 if (VT.is256BitVector())
23797 return Lower256IntArith(Op, DAG);
23802 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23803 SelectionDAG &DAG) {
23804 MVT VT = Op.getSimpleValueType();
23805 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23808 SDValue R = Op.getOperand(0);
23809 SDValue Amt = Op.getOperand(1);
23810 unsigned Opcode = Op.getOpcode();
23811 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23813 if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
23814 // Attempt to rotate by immediate.
23816 SmallVector<APInt, 16> EltBits;
23817 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23818 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23819 return EltBits[0] == V;
23821 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23822 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23823 return DAG.getNode(Op, DL, VT, R,
23824 DAG.getConstant(RotateAmt, DL, MVT::i8));
23828 // Else, fall-back on VPROLV/VPRORV.
23832 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23834 // XOP has 128-bit vector variable + immediate rotates.
23835 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23836 if (Subtarget.hasXOP()) {
23837 // Split 256-bit integers.
23838 if (VT.is256BitVector())
23839 return Lower256IntArith(Op, DAG);
23840 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23842 // Attempt to rotate by immediate.
23843 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23844 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23845 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23846 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23847 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23848 DAG.getConstant(RotateAmt, DL, MVT::i8));
23852 // Use general rotate by variable (per-element).
23856 // Split 256-bit integers on pre-AVX2 targets.
23857 if (VT.is256BitVector() && !Subtarget.hasAVX2())
23858 return Lower256IntArith(Op, DAG);
23860 assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
23861 ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
23862 Subtarget.hasAVX2())) &&
23863 "Only vXi32/vXi16/vXi8 vector rotates supported");
23865 // Rotate by a uniform constant - expand back to shifts.
23866 // TODO - legalizers should be able to handle this.
23867 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23868 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23869 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23870 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23871 if (RotateAmt == 0)
23874 SDValue AmtR = DAG.getConstant(EltSizeInBits - RotateAmt, DL, VT);
23875 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23876 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23877 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23881 // Rotate by splat - expand back to shifts.
23882 // TODO - legalizers should be able to handle this.
23883 if ((EltSizeInBits >= 16 || Subtarget.hasBWI()) &&
23884 IsSplatValue(VT, Amt, DL, DAG, Subtarget, Opcode)) {
23885 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23886 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23887 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23888 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23889 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23892 // v16i8/v32i8: Split rotation into rot4/rot2/rot1 stages and select by
23894 if (EltSizeInBits == 8) {
23895 if (Subtarget.hasBWI()) {
23896 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23897 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23898 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23899 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23900 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23903 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23905 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23906 if (Subtarget.hasSSE41()) {
23907 // On SSE41 targets we make use of the fact that VSELECT lowers
23908 // to PBLENDVB which selects bytes based just on the sign bit.
23909 V0 = DAG.getBitcast(VT, V0);
23910 V1 = DAG.getBitcast(VT, V1);
23911 Sel = DAG.getBitcast(VT, Sel);
23912 return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
23914 // On pre-SSE41 targets we test for the sign bit by comparing to
23915 // zero - a negative value will set all bits of the lanes to true
23916 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23917 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, DL);
23918 SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel);
23919 return DAG.getSelect(DL, SelVT, C, V0, V1);
23922 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23923 // We can safely do this using i16 shifts as we're only interested in
23924 // the 3 lower bits of each byte.
23925 Amt = DAG.getBitcast(ExtVT, Amt);
23926 Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT));
23927 Amt = DAG.getBitcast(VT, Amt);
23929 // r = VSELECT(r, rot(r, 4), a);
23933 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(4, DL, VT)),
23934 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(4, DL, VT)));
23935 R = SignBitSelect(VT, Amt, M, R);
23938 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23940 // r = VSELECT(r, rot(r, 2), a);
23943 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(2, DL, VT)),
23944 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(6, DL, VT)));
23945 R = SignBitSelect(VT, Amt, M, R);
23948 Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt);
23950 // return VSELECT(r, rot(r, 1), a);
23953 DAG.getNode(ISD::SHL, DL, VT, R, DAG.getConstant(1, DL, VT)),
23954 DAG.getNode(ISD::SRL, DL, VT, R, DAG.getConstant(7, DL, VT)));
23955 return SignBitSelect(VT, Amt, M, R);
23958 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23959 bool LegalVarShifts = SupportedVectorVarShift(VT, Subtarget, ISD::SHL) &&
23960 SupportedVectorVarShift(VT, Subtarget, ISD::SRL);
23962 // Best to fall back for all supported variable shifts.
23963 // AVX2 - best to fall back for non-constants as well.
23964 // TODO - legalizers should be able to handle this.
23965 if (LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) {
23966 SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT);
23967 AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt);
23968 SDValue SHL = DAG.getNode(ISD::SHL, DL, VT, R, Amt);
23969 SDValue SRL = DAG.getNode(ISD::SRL, DL, VT, R, AmtR);
23970 return DAG.getNode(ISD::OR, DL, VT, SHL, SRL);
23973 // As with shifts, convert the rotation amount to a multiplication factor.
23974 SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG);
23975 assert(Scale && "Failed to convert ROTL amount to scale");
23977 // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results.
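// For 16-bit lanes, x * 2^r is x << r in the low half and x >> (16 - r) in
// the unsigned high half, so OR'ing MUL and MULHU reassembles rotl(x, r).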
23978 if (EltSizeInBits == 16) {
23979 SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale);
23980 SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale);
23981 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23984 // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32
23985 // to v2i64 results at a time. The upper 32-bits contain the wrapped bits
23986 // that can then be OR'd with the lower 32-bits.
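// E.g. rotl(x, 7): the 64-bit product x * 2^7 holds x << 7 in its low 32 bits
// and the 7 bits rotated out of the top in its upper 32 bits, so OR'ing the
// two halves of each product reassembles the rotated value.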
23987 assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected");
23988 static const int OddMask[] = {1, -1, 3, -1};
23989 SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask);
23990 SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask);
23992 SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
23993 DAG.getBitcast(MVT::v2i64, R),
23994 DAG.getBitcast(MVT::v2i64, Scale));
23995 SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64,
23996 DAG.getBitcast(MVT::v2i64, R13),
23997 DAG.getBitcast(MVT::v2i64, Scale13));
23998 Res02 = DAG.getBitcast(VT, Res02);
23999 Res13 = DAG.getBitcast(VT, Res13);
24001 return DAG.getNode(ISD::OR, DL, VT,
24002 DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}),
24003 DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7}));
24006 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
24007 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
24008 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
24009 // looks for this combo and may remove the "setcc" instruction if the "setcc"
24010 // has only one use.
24011 SDNode *N = Op.getNode();
24012 SDValue LHS = N->getOperand(0);
24013 SDValue RHS = N->getOperand(1);
24014 unsigned BaseOp = 0;
24015 X86::CondCode Cond;
24017 switch (Op.getOpcode()) {
24018 default: llvm_unreachable("Unknown ovf instruction!");
24020 // An add of one will be selected as an INC. Note that INC doesn't
24021 // set CF, so we can't do this for UADDO.
24022 if (isOneConstant(RHS)) {
24023 BaseOp = X86ISD::INC;
24024 Cond = X86::COND_O;
24027 BaseOp = X86ISD::ADD;
24028 Cond = X86::COND_O;
24031 BaseOp = X86ISD::ADD;
24032 Cond = X86::COND_B;
24035 // A subtract of one will be selected as a DEC. Note that DEC doesn't
24036 // set CF, so we can't do this for USUBO.
24037 if (isOneConstant(RHS)) {
24038 BaseOp = X86ISD::DEC;
24039 Cond = X86::COND_O;
24042 BaseOp = X86ISD::SUB;
24043 Cond = X86::COND_O;
24046 BaseOp = X86ISD::SUB;
24047 Cond = X86::COND_B;
24050 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
24051 Cond = X86::COND_O;
24053 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
24054 if (N->getValueType(0) == MVT::i8) {
24055 BaseOp = X86ISD::UMUL8;
24056 Cond = X86::COND_O;
24059 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
24061 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
24063 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
24065 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24069 // Also sets EFLAGS.
24070 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
24071 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
24073 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
24075 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24078 /// Returns true if the operand type is exactly twice the native width, and
24079 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
24080 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
24081 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
24082 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
24083 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
24086 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
24087 else if (OpWidth == 128)
24088 return Subtarget.hasCmpxchg16b();
24093 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
24094 return needsCmpXchgNb(SI->getValueOperand()->getType());
24097 // Note: this turns large loads into lock cmpxchg8b/16b.
24098 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
24099 TargetLowering::AtomicExpansionKind
24100 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
24101 auto PTy = cast<PointerType>(LI->getPointerOperandType());
24102 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
24103 : AtomicExpansionKind::None;
24106 TargetLowering::AtomicExpansionKind
24107 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
24108 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24109 Type *MemType = AI->getType();
24111 // If the operand is too big, we must see if cmpxchg8/16b is available
24112 // and default to library calls otherwise.
24113 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
24114 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
24115 : AtomicExpansionKind::None;
24118 AtomicRMWInst::BinOp Op = AI->getOperation();
24121 llvm_unreachable("Unknown atomic operation");
24122 case AtomicRMWInst::Xchg:
24123 case AtomicRMWInst::Add:
24124 case AtomicRMWInst::Sub:
24125 // It's better to use xadd, xsub or xchg for these in all cases.
24126 return AtomicExpansionKind::None;
24127 case AtomicRMWInst::Or:
24128 case AtomicRMWInst::And:
24129 case AtomicRMWInst::Xor:
24130 // If the atomicrmw's result isn't actually used, we can just add a "lock"
24131 // prefix to a normal instruction for these operations.
24132 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
24133 : AtomicExpansionKind::None;
24134 case AtomicRMWInst::Nand:
24135 case AtomicRMWInst::Max:
24136 case AtomicRMWInst::Min:
24137 case AtomicRMWInst::UMax:
24138 case AtomicRMWInst::UMin:
24139 // These always require a non-trivial set of data operations on x86. We must
24140 // use a cmpxchg loop.
24141 return AtomicExpansionKind::CmpXChg;
24146 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
24147 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
24148 Type *MemType = AI->getType();
24149 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
24150 // there is no benefit in turning such RMWs into loads, and it is actually
24151 // harmful as it introduces a mfence.
24152 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
24155 auto Builder = IRBuilder<>(AI);
24156 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
24157 auto SSID = AI->getSyncScopeID();
24158 // We must restrict the ordering to avoid generating loads with Release or
24159 // ReleaseAcquire orderings.
24160 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
24161 auto Ptr = AI->getPointerOperand();
24163 // Before the load we need a fence. Here is an example lifted from
24164 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
24165 // is required:
24166 // Thread 0:
24167 // x.store(1, relaxed);
24168 // r1 = y.fetch_add(0, release);
24169 // Thread 1:
24170 // y.fetch_add(42, acquire);
24171 // r2 = x.load(relaxed);
24172 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
24173 // lowered to just a load without a fence. A mfence flushes the store buffer,
24174 // making the optimization clearly correct.
24175 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
24176 // otherwise, we might be able to be more aggressive on relaxed idempotent
24177 // rmw. In practice, they do not look useful, so we don't try to be
24178 // especially clever.
24179 if (SSID == SyncScope::SingleThread)
24180 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
24181 // the IR level, so we must wrap it in an intrinsic.
24184 if (!Subtarget.hasMFence())
24185 // FIXME: it might make sense to use a locked operation here but on a
24186 // different cache-line to prevent cache-line bouncing. In practice it
24187 // is probably a small win, and x86 processors without mfence are rare
24188 // enough that we do not bother.
24192 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
24193 Builder.CreateCall(MFence, {});
24195 // Finally we can emit the atomic load.
24196 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
24197 AI->getType()->getPrimitiveSizeInBits());
24198 Loaded->setAtomic(Order, SSID);
24199 AI->replaceAllUsesWith(Loaded);
24200 AI->eraseFromParent();
24204 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
24205 SelectionDAG &DAG) {
24207 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
24208 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
24209 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
24210 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
24212 // The only fence that needs an instruction is a sequentially-consistent
24213 // cross-thread fence.
24214 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
24215 FenceSSID == SyncScope::System) {
24216 if (Subtarget.hasMFence())
24217 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
24219 SDValue Chain = Op.getOperand(0);
24220 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
24222 DAG.getRegister(X86::ESP, MVT::i32), // Base
24223 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
24224 DAG.getRegister(0, MVT::i32), // Index
24225 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
24226 DAG.getRegister(0, MVT::i32), // Segment.
24230 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
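// i.e. a `lock or` of zero into the slot at (%esp) - a locked no-op store
// that acts as a full memory barrier on targets without MFENCE.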
24231 return SDValue(Res, 0);
24234 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
24235 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
24238 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
24239 SelectionDAG &DAG) {
24240 MVT T = Op.getSimpleValueType();
24244 switch(T.SimpleTy) {
24245 default: llvm_unreachable("Invalid value type!");
24246 case MVT::i8: Reg = X86::AL; size = 1; break;
24247 case MVT::i16: Reg = X86::AX; size = 2; break;
24248 case MVT::i32: Reg = X86::EAX; size = 4; break;
24250 assert(Subtarget.is64Bit() && "Node not type legal!");
24251 Reg = X86::RAX; size = 8;
24254 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
24255 Op.getOperand(2), SDValue());
24256 SDValue Ops[] = { cpIn.getValue(0),
24259 DAG.getTargetConstant(size, DL, MVT::i8),
24260 cpIn.getValue(1) };
24261 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24262 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
24263 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
24267 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
24268 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
24269 MVT::i32, cpOut.getValue(2));
24270 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
24272 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
24273 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
24274 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
24278 // Create MOVMSKB, taking into account whether we need to split for AVX1.
24279 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
24280 const X86Subtarget &Subtarget) {
24281 MVT InVT = V.getSimpleValueType();
24283 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
24285 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
24286 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
24287 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
24288 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
24289 DAG.getConstant(16, DL, MVT::i8));
24290 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
24293 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
24296 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
24297 SelectionDAG &DAG) {
24298 SDValue Src = Op.getOperand(0);
24299 MVT SrcVT = Src.getSimpleValueType();
24300 MVT DstVT = Op.getSimpleValueType();
24302 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
24303 // half to v32i1 and concatenating the result.
24304 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
24305 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
24306 assert(Subtarget.hasBWI() && "Expected BWI target");
24308 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24309 DAG.getIntPtrConstant(0, dl));
24310 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24311 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24312 DAG.getIntPtrConstant(1, dl));
24313 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24314 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24317 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
24318 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
24319 DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
24322 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
24323 EVT CastVT = MVT::getVectorVT(DstVT.getVectorElementType(),
24324 DstVT.getVectorNumElements() / 2);
24325 Lo = DAG.getBitcast(CastVT, Lo);
24326 Hi = DAG.getBitcast(CastVT, Hi);
24327 return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
24330 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
24331 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
24332 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
24333 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
24335 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
24336 V = getPMOVMSKB(DL, V, DAG, Subtarget);
24337 return DAG.getZExtOrTrunc(V, DL, DstVT);
24340 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
24341 SrcVT == MVT::i64) {
24342 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24343 if (DstVT != MVT::f64)
24344 // This conversion needs to be expanded.
24347 SmallVector<SDValue, 16> Elts;
24351 if (SrcVT.isVector()) {
24352 NumElts = SrcVT.getVectorNumElements();
24353 SVT = SrcVT.getVectorElementType();
24355 // Widen the vector in input in the case of MVT::v2i32.
24356 // Example: from MVT::v2i32 to MVT::v4i32.
24357 for (unsigned i = 0, e = NumElts; i != e; ++i)
24358 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
24359 DAG.getIntPtrConstant(i, dl)));
24361 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
24362 "Unexpected source type in LowerBITCAST");
24363 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24364 DAG.getIntPtrConstant(0, dl)));
24365 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24366 DAG.getIntPtrConstant(1, dl)));
24370 // Explicitly mark the extra elements as Undef.
24371 Elts.append(NumElts, DAG.getUNDEF(SVT));
24373 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24374 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
24375 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
24376 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
24377 DAG.getIntPtrConstant(0, dl));
24380 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
24381 Subtarget.hasMMX() && "Unexpected custom BITCAST");
24382 assert((DstVT == MVT::i64 ||
24383 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
24384 "Unexpected custom BITCAST");
24385 // i64 <=> MMX conversions are Legal.
24386 if (SrcVT==MVT::i64 && DstVT.isVector())
24388 if (DstVT==MVT::i64 && SrcVT.isVector())
24390 // MMX <=> MMX conversions are Legal.
24391 if (SrcVT.isVector() && DstVT.isVector())
24393 // All other conversions need to be expanded.
24397 /// Compute the horizontal sum of bytes in V for the elements of VT.
24399 /// Requires V to be a byte vector and VT to be an integer vector type with
24400 /// wider elements than V's type. The width of the elements of VT determines
24401 /// how many bytes of V are summed horizontally to produce each element of the result.
24403 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
24404 const X86Subtarget &Subtarget,
24405 SelectionDAG &DAG) {
24407 MVT ByteVecVT = V.getSimpleValueType();
24408 MVT EltVT = VT.getVectorElementType();
24409 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
24410 "Expected value to have byte element type.");
24411 assert(EltVT != MVT::i8 &&
24412 "Horizontal byte sum only makes sense for wider elements!");
24413 unsigned VecSize = VT.getSizeInBits();
24414 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
24416   // The PSADBW instruction horizontally adds all bytes and leaves the result in
24417   // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
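  // Editor's worked example, assuming an all-zero second operand: for the
  // v16i8 input {1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2}, PSADBW produces the v2i64
  // result {8, 16}; each 64-bit lane holds the sum of absolute differences of
  // its eight bytes against zero, i.e. the byte sum of that half.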
24418 if (EltVT == MVT::i64) {
24419 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24420 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24421 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
24422 return DAG.getBitcast(VT, V);
24425 if (EltVT == MVT::i32) {
24426 // We unpack the low half and high half into i32s interleaved with zeros so
24427 // that we can use PSADBW to horizontally sum them. The most useful part of
24428 // this is that it lines up the results of two PSADBW instructions to be
24429 // two v2i64 vectors which concatenated are the 4 population counts. We can
24430 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
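    // Editor's lane sketch: if the byte counts, viewed as v4i32, are
    // {A,B,C,D}, then UNPCKL/UNPCKH with zero produce {A,0,B,0} and {C,0,D,0};
    // each PSADBW sums the four count bytes in its 64-bit halves, giving the
    // v2i64 values {popA,popB} and {popC,popD}, and PACKUS of their i16 views
    // reassembles the v4i32 result {popA,popB,popC,popD}.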
24431 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
24432 SDValue V32 = DAG.getBitcast(VT, V);
24433 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
24434 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
24436 // Do the horizontal sums into two v2i64s.
24437 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24438 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24439 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24440 DAG.getBitcast(ByteVecVT, Low), Zeros);
24441 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24442 DAG.getBitcast(ByteVecVT, High), Zeros);
24444 // Merge them together.
24445 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
24446 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
24447 DAG.getBitcast(ShortVecVT, Low),
24448 DAG.getBitcast(ShortVecVT, High));
24450 return DAG.getBitcast(VT, V);
24453 // The only element type left is i16.
24454 assert(EltVT == MVT::i16 && "Unknown how to handle type");
24456 // To obtain pop count for each i16 element starting from the pop count for
24457 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
24458 // right by 8. It is important to shift as i16s as i8 vector shift isn't
24459 // directly supported.
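  // Editor's sketch of this step, assuming each i16 lane holds the byte counts
  // {lo, hi}: the i16 shift left by 8 moves lo above hi, the byte-wise add
  // forms lo+hi in the upper byte, and the final i16 shift right by 8 leaves
  // lo+hi as that lane's pop count.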
24460 SDValue ShifterV = DAG.getConstant(8, DL, VT);
24461 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24462 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
24463 DAG.getBitcast(ByteVecVT, V));
24464 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24467 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
24468 const X86Subtarget &Subtarget,
24469 SelectionDAG &DAG) {
24470 MVT VT = Op.getSimpleValueType();
24471 MVT EltVT = VT.getVectorElementType();
24472 unsigned VecSize = VT.getSizeInBits();
24474 // Implement a lookup table in register by using an algorithm based on:
24475 // http://wm.ite.pl/articles/sse-popcount.html
24477   // The general idea is that each byte's lower nibble in the input vector is an
24478   // index into an in-register pre-computed pop count table. We then split the
24479   // input vector into two new ones: (1) a vector with only the shifted-right
24480   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
24481   // the higher ones masked out) for each byte. PSHUFB is used separately with
24482   // both to index the in-register table. Next, both are added and the result is
24483   // an i8 vector where each element contains the pop count for its input byte.
24485 // To obtain the pop count for elements != i8, we follow up with the same
24486 // approach and use additional tricks as described below.
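  // Editor's worked example: for the input byte 0xB7, the high nibble 0xB and
  // the low nibble 0x7 both map to 3 in the table below, so the per-byte
  // result is 3 + 3 = 6, the pop count of 0xB7.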
24488 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
24489 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
24490 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
24491 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
24493 int NumByteElts = VecSize / 8;
24494 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
24495 SDValue In = DAG.getBitcast(ByteVecVT, Op);
24496 SmallVector<SDValue, 64> LUTVec;
24497 for (int i = 0; i < NumByteElts; ++i)
24498 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24499 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
24500 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
24503 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
24504 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
24507 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
24509   // The nibble vectors are used as the shuffle masks that index elements into
24510   // the LUT. After counting low and high nibbles, add the two results to obtain
24511   // the final pop count per i8 element.
24512 SDValue HighPopCnt =
24513 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24514 SDValue LowPopCnt =
24515 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24516 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
24518 if (EltVT == MVT::i8)
24521 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
24524 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
24525 const X86Subtarget &Subtarget,
24526 SelectionDAG &DAG) {
24527 MVT VT = Op.getSimpleValueType();
24528 assert(VT.is128BitVector() &&
24529 "Only 128-bit vector bitmath lowering supported.");
24531 int VecSize = VT.getSizeInBits();
24532 MVT EltVT = VT.getVectorElementType();
24533 int Len = EltVT.getSizeInBits();
24535 // This is the vectorized version of the "best" algorithm from
24536 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
24537 // with a minor tweak to use a series of adds + shifts instead of vector
24538 // multiplications. Implemented for all integer vector types. We only use
24539 // this when we don't have SSSE3 which allows a LUT-based lowering that is
24540 // much faster, even faster than using native popcnt instructions.
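  // Editor's scalar sketch of the three steps below, for one byte x:
  //   x = x - ((x >> 1) & 0x55);           // 2-bit field counts
  //   x = (x & 0x33) + ((x >> 2) & 0x33);  // 4-bit field counts
  //   x = (x + (x >> 4)) & 0x0F;           // final byte pop count
  // e.g. x = 0xB7 -> 0x66 -> 0x33 -> 0x06.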
24542 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
24543 MVT VT = V.getSimpleValueType();
24544 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
24545 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
24547 auto GetMask = [&](SDValue V, APInt Mask) {
24548 MVT VT = V.getSimpleValueType();
24549 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
24550 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
24553 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
24554 // x86, so set the SRL type to have elements at least i16 wide. This is
24555   // correct because all of our SRLs are immediately followed by a mask anyway
24556 // that handles any bits that sneak into the high bits of the byte elements.
24557 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
24561 // v = v - ((v >> 1) & 0x55555555...)
24563 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
24564 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
24565 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
24567 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
24568 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
24569 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
24570 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
24571 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
24573 // v = (v + (v >> 4)) & 0x0F0F0F0F...
24574 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
24575 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
24576 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
24578 // At this point, V contains the byte-wise population count, and we are
24579   // merely doing a horizontal sum if necessary to get the wider element count.
24581 if (EltVT == MVT::i8)
24584 return LowerHorizontalByteSum(
24585 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
24589 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
24590 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
24591 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24592 SelectionDAG &DAG) {
24593 MVT VT = Op.getSimpleValueType();
24594 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24595 "Unknown CTPOP type to handle");
24596 SDLoc DL(Op.getNode());
24597 SDValue Op0 = Op.getOperand(0);
24599 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24600 if (Subtarget.hasVPOPCNTDQ()) {
24601 unsigned NumElems = VT.getVectorNumElements();
24602 assert((VT.getVectorElementType() == MVT::i8 ||
24603 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24604 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
24605 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24606 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24607 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24608 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24612 if (!Subtarget.hasSSSE3()) {
24613 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24614 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24615 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24618 // Decompose 256-bit ops into smaller 128-bit ops.
24619 if (VT.is256BitVector() && !Subtarget.hasInt256())
24620 return Lower256IntUnary(Op, DAG);
24622 // Decompose 512-bit ops into smaller 256-bit ops.
24623 if (VT.is512BitVector() && !Subtarget.hasBWI())
24624 return Lower512IntUnary(Op, DAG);
24626 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
24629 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24630 SelectionDAG &DAG) {
24631 assert(Op.getSimpleValueType().isVector() &&
24632 "We only do custom lowering for vector population count.");
24633 return LowerVectorCTPOP(Op, Subtarget, DAG);
24636 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24637 MVT VT = Op.getSimpleValueType();
24638 SDValue In = Op.getOperand(0);
24641   // For scalars, it is still beneficial to transfer to/from the SIMD unit to
24642 // perform the BITREVERSE.
24643 if (!VT.isVector()) {
24644 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24645 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24646 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24647 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24648 DAG.getIntPtrConstant(0, DL));
24651 int NumElts = VT.getVectorNumElements();
24652 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24654 // Decompose 256-bit ops into smaller 128-bit ops.
24655 if (VT.is256BitVector())
24656 return Lower256IntUnary(Op, DAG);
24658 assert(VT.is128BitVector() &&
24659 "Only 128-bit vector bitreverse lowering supported.");
24661 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24662 // perform the BSWAP in the shuffle.
24663   // It is best to shuffle using the second operand, as this implicitly allows
24664 // memory folding for multiple vectors.
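  // Editor's note on the assumed VPPERM selector encoding: bits [4:0] pick one
  // of the 32 source bytes (16..31 address the second operand, used here so
  // the input can be folded from memory), and op bits [7:5] == 2 request the
  // bit-reversed byte, hence PermuteByte = SourceByte | (2 << 5) below.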
24665 SmallVector<SDValue, 16> MaskElts;
24666 for (int i = 0; i != NumElts; ++i) {
24667 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24668 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24669 int PermuteByte = SourceByte | (2 << 5);
24670 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24674 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24675 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24676 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24678 return DAG.getBitcast(VT, Res);
24681 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24682 SelectionDAG &DAG) {
24683 MVT VT = Op.getSimpleValueType();
24685 if (Subtarget.hasXOP() && !VT.is512BitVector())
24686 return LowerBITREVERSE_XOP(Op, DAG);
24688 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24690 SDValue In = Op.getOperand(0);
24693 unsigned NumElts = VT.getVectorNumElements();
24694 assert(VT.getScalarType() == MVT::i8 &&
24695 "Only byte vector BITREVERSE supported");
24697 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24698 if (VT.is256BitVector() && !Subtarget.hasInt256())
24699 return Lower256IntUnary(Op, DAG);
24701 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24702 // two nibbles and a PSHUFB lookup to find the bitreverse of each
24703 // 0-15 value (moved to the other nibble).
24704 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24705 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24706 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24708 const int LoLUT[16] = {
24709 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24710 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24711 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24712 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24713 const int HiLUT[16] = {
24714 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24715 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24716 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24717 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
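  // Editor's worked example: the input byte 0xB7 has low nibble 0x7 and high
  // nibble 0xB; LoLUT[0x7] = 0xE0 and HiLUT[0xB] = 0x0D, and their OR is 0xED,
  // which is 0xB7 with its bits reversed.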
24719 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24720 for (unsigned i = 0; i < NumElts; ++i) {
24721 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24722 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24725 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24726 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24727 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24728 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24729 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24732 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24733 const X86Subtarget &Subtarget,
24734 bool AllowIncDec = true) {
24735 unsigned NewOpc = 0;
24736 switch (N->getOpcode()) {
24737 case ISD::ATOMIC_LOAD_ADD:
24738 NewOpc = X86ISD::LADD;
24740 case ISD::ATOMIC_LOAD_SUB:
24741 NewOpc = X86ISD::LSUB;
24743 case ISD::ATOMIC_LOAD_OR:
24744 NewOpc = X86ISD::LOR;
24746 case ISD::ATOMIC_LOAD_XOR:
24747 NewOpc = X86ISD::LXOR;
24749 case ISD::ATOMIC_LOAD_AND:
24750 NewOpc = X86ISD::LAND;
24753 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24756 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24758 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24759 // Convert to inc/dec if they aren't slow or we are optimizing for size.
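    // For example, (atomic_load_add p, 1) or (atomic_load_sub p, -1) becomes
    // LINC (lock inc), and (atomic_load_sub p, 1) or (atomic_load_add p, -1)
    // becomes LDEC (lock dec), per the checks below.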
24760 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24761 DAG.getMachineFunction().getFunction().optForSize())) {
24762 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24763 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24764 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24765 DAG.getVTList(MVT::i32, MVT::Other),
24766 {N->getOperand(0), N->getOperand(1)},
24767 /*MemVT=*/N->getSimpleValueType(0), MMO);
24768 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24769 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24770 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24771 DAG.getVTList(MVT::i32, MVT::Other),
24772 {N->getOperand(0), N->getOperand(1)},
24773 /*MemVT=*/N->getSimpleValueType(0), MMO);
24777 return DAG.getMemIntrinsicNode(
24778 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24779 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24780 /*MemVT=*/N->getSimpleValueType(0), MMO);
24783 /// Lower atomic_load_ops into LOCK-prefixed operations.
24784 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24785 const X86Subtarget &Subtarget) {
24786 SDValue Chain = N->getOperand(0);
24787 SDValue LHS = N->getOperand(1);
24788 SDValue RHS = N->getOperand(2);
24789 unsigned Opc = N->getOpcode();
24790 MVT VT = N->getSimpleValueType(0);
24793 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24794 // can only be lowered when the result is unused. They should have already
24795 // been transformed into a cmpxchg loop in AtomicExpand.
24796 if (N->hasAnyUseOfValue(0)) {
24797 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24798 // select LXADD if LOCK_SUB can't be selected.
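    // Editor's example: (atomicrmw sub %p, %v) with a used result is rewritten
    // as (atomicrmw add %p, (0 - %v)) so it can match XADD.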
24799 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24800 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24801 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24802 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24803 RHS, AN->getMemOperand());
24805 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24806 "Used AtomicRMW ops other than Add should have been expanded!");
24810 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24811 // RAUW the chain, but don't worry about the result, as it's unused.
24812 assert(!N->hasAnyUseOfValue(0));
24813 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24817 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24818 SDNode *Node = Op.getNode();
24820 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24822 // Convert seq_cst store -> xchg
24823 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24824 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24825 // (The only way to get a 16-byte store is cmpxchg16b)
24826 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
24827 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24828 AtomicOrdering::SequentiallyConsistent ||
24829 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24830 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24831 cast<AtomicSDNode>(Node)->getMemoryVT(),
24832 Node->getOperand(0),
24833 Node->getOperand(1), Node->getOperand(2),
24834 cast<AtomicSDNode>(Node)->getMemOperand());
24835 return Swap.getValue(1);
24837 // Other atomic stores have a simple pattern.
24841 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24842 SDNode *N = Op.getNode();
24843 MVT VT = N->getSimpleValueType(0);
24845 // Let legalize expand this if it isn't a legal type yet.
24846 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24849 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24852 // Set the carry flag.
24853 SDValue Carry = Op.getOperand(2);
24854 EVT CarryVT = Carry.getValueType();
24855 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24856 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24857 Carry, DAG.getConstant(NegOne, DL, CarryVT));
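  // Editor's note: adding all-ones to the incoming carry overflows exactly
  // when the carry operand is nonzero, so the flags result of this ADD holds
  // the incoming carry in CF and can feed the ADC/SBB below.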
24859 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24860 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24861 Op.getOperand(1), Carry.getValue(1));
24863 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24864 if (N->getValueType(1) == MVT::i1)
24865 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24867 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24870 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24871 SelectionDAG &DAG) {
24872 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24874 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24875 // which returns the values as { float, float } (in XMM0) or
24876 // { double, double } (which is returned in XMM0, XMM1).
24878 SDValue Arg = Op.getOperand(0);
24879 EVT ArgVT = Arg.getValueType();
24880 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24882 TargetLowering::ArgListTy Args;
24883 TargetLowering::ArgListEntry Entry;
24887 Entry.IsSExt = false;
24888 Entry.IsZExt = false;
24889 Args.push_back(Entry);
24891 bool isF64 = ArgVT == MVT::f64;
24892 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24893 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24894 // the results are returned via SRet in memory.
24895 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24896 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24897 const char *LibcallName = TLI.getLibcallName(LC);
24899 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24901 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24902 : (Type *)VectorType::get(ArgTy, 4);
24904 TargetLowering::CallLoweringInfo CLI(DAG);
24905 CLI.setDebugLoc(dl)
24906 .setChain(DAG.getEntryNode())
24907 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24909 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24912 // Returned in xmm0 and xmm1.
24913 return CallResult.first;
24915   // Returned in bits 0:31 and 32:63 of xmm0.
24916 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24917 CallResult.first, DAG.getIntPtrConstant(0, dl));
24918 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24919 CallResult.first, DAG.getIntPtrConstant(1, dl));
24920 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24921 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24924 /// Widen a vector input to a vector of NVT. The
24925 /// input vector must have the same element type as NVT.
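/// For example (editor's illustration), widening a v2i32 input to v8i32
/// returns an INSERT_SUBVECTOR of the input at index 0 into an undef (or
/// all-zero, when FillWithZeroes is set) v8i32 vector, unless the input is a
/// constant build_vector, in which case the extra elements are appended
/// directly.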
24926 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24927 bool FillWithZeroes = false) {
24928 // Check if InOp already has the right width.
24929 MVT InVT = InOp.getSimpleValueType();
24933 if (InOp.isUndef())
24934 return DAG.getUNDEF(NVT);
24936 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24937 "input and widen element type must match");
24939 unsigned InNumElts = InVT.getVectorNumElements();
24940 unsigned WidenNumElts = NVT.getVectorNumElements();
24941 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24942 "Unexpected request for vector widening");
24945 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24946 InOp.getNumOperands() == 2) {
24947 SDValue N1 = InOp.getOperand(1);
24948 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24950 InOp = InOp.getOperand(0);
24951 InVT = InOp.getSimpleValueType();
24952 InNumElts = InVT.getVectorNumElements();
24955 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24956 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24957 SmallVector<SDValue, 16> Ops;
24958 for (unsigned i = 0; i < InNumElts; ++i)
24959 Ops.push_back(InOp.getOperand(i));
24961 EVT EltVT = InOp.getOperand(0).getValueType();
24963 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24964 DAG.getUNDEF(EltVT);
24965 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24966 Ops.push_back(FillVal);
24967 return DAG.getBuildVector(NVT, dl, Ops);
24969 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24971 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24972 InOp, DAG.getIntPtrConstant(0, dl));
24975 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24976 SelectionDAG &DAG) {
24977 assert(Subtarget.hasAVX512() &&
24978 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24980 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24981 SDValue Src = N->getValue();
24982 MVT VT = Src.getSimpleValueType();
24983 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24986 SDValue Scale = N->getScale();
24987 SDValue Index = N->getIndex();
24988 SDValue Mask = N->getMask();
24989 SDValue Chain = N->getChain();
24990 SDValue BasePtr = N->getBasePtr();
24992 if (VT == MVT::v2f32) {
24993 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24994 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24995 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24996 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24997 DAG.getUNDEF(MVT::v2f32));
24998 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24999 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25000 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25001 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25002 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25003 return SDValue(NewScatter.getNode(), 1);
25008 if (VT == MVT::v2i32) {
25009 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25010 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
25011 DAG.getUNDEF(MVT::v2i32));
25012 // If the index is v2i64 and we have VLX we can use xmm for data and index.
25013 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
25014 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
25015 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25016 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25017 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25018 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25019 return SDValue(NewScatter.getNode(), 1);
25021 // Custom widen all the operands to avoid promotion.
25022 EVT NewIndexVT = EVT::getVectorVT(
25023 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
25024 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25025 DAG.getUNDEF(Index.getValueType()));
25026 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25027 DAG.getConstant(0, dl, MVT::v2i1));
25028 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25029 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
25030 Ops, N->getMemOperand());
25033 MVT IndexVT = Index.getSimpleValueType();
25034 MVT MaskVT = Mask.getSimpleValueType();
25036 // If the index is v2i32, we're being called by type legalization and we
25037 // should just let the default handling take care of it.
25038 if (IndexVT == MVT::v2i32)
25041   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
25042   // need to widen until one is.
25043 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
25044 !Index.getSimpleValueType().is512BitVector()) {
25045 // Determine how much we need to widen by to get a 512-bit type.
25046 unsigned Factor = std::min(512/VT.getSizeInBits(),
25047 512/IndexVT.getSizeInBits());
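    // Editor's example: for v4f64 data with a v4i32 index, Factor is
    // min(512/256, 512/128) = 2, widening to v8f64 data, a v8i32 index and a
    // zero-padded v8i1 mask so the 512-bit form can be used.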
25048 unsigned NumElts = VT.getVectorNumElements() * Factor;
25050 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25051 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25052 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25054 Src = ExtendToType(Src, VT, DAG);
25055 Index = ExtendToType(Index, IndexVT, DAG);
25056 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25059 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
25060 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
25061 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
25062 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
25063 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
25064 return SDValue(NewScatter.getNode(), 1);
25067 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
25068 SelectionDAG &DAG) {
25070 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
25071 MVT VT = Op.getSimpleValueType();
25072 MVT ScalarVT = VT.getScalarType();
25073 SDValue Mask = N->getMask();
25076 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
25077 "Expanding masked load is supported on AVX-512 target only!");
25079 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
25080 "Expanding masked load is supported for 32 and 64-bit types only!");
25082 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25083 "Cannot lower masked load op.");
25085 assert((ScalarVT.getSizeInBits() >= 32 ||
25086 (Subtarget.hasBWI() &&
25087 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25088 "Unsupported masked load op.");
25090 // This operation is legal for targets with VLX, but without
25091   // VLX the vector should be widened to 512 bits.
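  // Editor's example: a masked load of v8i32 without VLX is widened to v16i32
  // with a zero-extended v16i1 mask, and the low 8 elements of the wide load
  // are extracted again below.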
25092 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
25093 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25094 SDValue Src0 = N->getSrc0();
25095 Src0 = ExtendToType(Src0, WideDataVT, DAG);
25097 // Mask element has to be i1.
25098 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25099 "Unexpected mask type");
25101 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25103 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25104 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
25105 N->getBasePtr(), Mask, Src0,
25106 N->getMemoryVT(), N->getMemOperand(),
25107 N->getExtensionType(),
25108 N->isExpandingLoad());
25110 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
25111 NewLoad.getValue(0),
25112 DAG.getIntPtrConstant(0, dl));
25113 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
25114 return DAG.getMergeValues(RetOps, dl);
25117 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
25118 SelectionDAG &DAG) {
25119 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
25120 SDValue DataToStore = N->getValue();
25121 MVT VT = DataToStore.getSimpleValueType();
25122 MVT ScalarVT = VT.getScalarType();
25123 SDValue Mask = N->getMask();
25126 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
25127 "Expanding masked load is supported on AVX-512 target only!");
25129 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
25130 "Expanding masked load is supported for 32 and 64-bit types only!");
25132 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25133 "Cannot lower masked store op.");
25135 assert((ScalarVT.getSizeInBits() >= 32 ||
25136 (Subtarget.hasBWI() &&
25137 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
25138 "Unsupported masked store op.");
25140 // This operation is legal for targets with VLX, but without
25141   // VLX the vector should be widened to 512 bits.
25142 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
25143 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
25145 // Mask element has to be i1.
25146 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
25147 "Unexpected mask type");
25149 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
25151 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
25152 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
25153 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
25154 Mask, N->getMemoryVT(), N->getMemOperand(),
25155 N->isTruncatingStore(), N->isCompressingStore());
25158 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
25159 SelectionDAG &DAG) {
25160 assert(Subtarget.hasAVX2() &&
25161 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
25163 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
25165 MVT VT = Op.getSimpleValueType();
25166 SDValue Index = N->getIndex();
25167 SDValue Mask = N->getMask();
25168 SDValue Src0 = N->getValue();
25169 MVT IndexVT = Index.getSimpleValueType();
25170 MVT MaskVT = Mask.getSimpleValueType();
25172 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
25174 // If the index is v2i32, we're being called by type legalization.
25175 if (IndexVT == MVT::v2i32)
25178   // If we don't have VLX and neither the passthru nor the index is 512 bits, we
25179 // need to widen until one is.
25181 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
25182 !IndexVT.is512BitVector()) {
25183 // Determine how much we need to widen by to get a 512-bit type.
25184 unsigned Factor = std::min(512/VT.getSizeInBits(),
25185 512/IndexVT.getSizeInBits());
25187 unsigned NumElts = VT.getVectorNumElements() * Factor;
25189 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
25190 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
25191 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
25193 Src0 = ExtendToType(Src0, VT, DAG);
25194 Index = ExtendToType(Index, IndexVT, DAG);
25195 Mask = ExtendToType(Mask, MaskVT, DAG, true);
25198 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
25200 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25201 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
25202 N->getMemOperand());
25203 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
25204 NewGather, DAG.getIntPtrConstant(0, dl));
25205 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
25208 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
25209 SelectionDAG &DAG) const {
25210 // TODO: Eventually, the lowering of these nodes should be informed by or
25211 // deferred to the GC strategy for the function in which they appear. For
25212 // now, however, they must be lowered to something. Since they are logically
25213 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25214   // require special handling for these nodes), lower them as literal NOOPs for
25215   // the time being.
25216 SmallVector<SDValue, 2> Ops;
25218 Ops.push_back(Op.getOperand(0));
25219 if (Op->getGluedNode())
25220 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25223 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25224 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25229 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
25230 SelectionDAG &DAG) const {
25231 // TODO: Eventually, the lowering of these nodes should be informed by or
25232 // deferred to the GC strategy for the function in which they appear. For
25233 // now, however, they must be lowered to something. Since they are logically
25234 // no-ops in the case of a null GC strategy (or a GC strategy which does not
25235   // require special handling for these nodes), lower them as literal NOOPs for
25236   // the time being.
25237 SmallVector<SDValue, 2> Ops;
25239 Ops.push_back(Op.getOperand(0));
25240 if (Op->getGluedNode())
25241 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
25244 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
25245 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
25250 /// Provide custom lowering hooks for some operations.
25251 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
25252 switch (Op.getOpcode()) {
25253 default: llvm_unreachable("Should not custom lower this!");
25254 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
25255 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
25256 return LowerCMP_SWAP(Op, Subtarget, DAG);
25257 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
25258 case ISD::ATOMIC_LOAD_ADD:
25259 case ISD::ATOMIC_LOAD_SUB:
25260 case ISD::ATOMIC_LOAD_OR:
25261 case ISD::ATOMIC_LOAD_XOR:
25262 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
25263 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
25264 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
25265 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
25266 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
25267 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
25268 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
25269 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
25270 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
25271 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
25272 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
25273 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
25274 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
25275 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
25276 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
25277 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
25278 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
25279 case ISD::SHL_PARTS:
25280 case ISD::SRA_PARTS:
25281 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
25282 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
25283 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
25284 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
25285 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
25286 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
25287 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
25288 case ISD::ZERO_EXTEND_VECTOR_INREG:
25289 case ISD::SIGN_EXTEND_VECTOR_INREG:
25290 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
25291 case ISD::FP_TO_SINT:
25292 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
25293 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
25294 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
25295 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
25297 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
25298 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
25299 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
25300 case ISD::SETCC: return LowerSETCC(Op, DAG);
25301 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
25302 case ISD::SELECT: return LowerSELECT(Op, DAG);
25303 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
25304 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
25305 case ISD::VASTART: return LowerVASTART(Op, DAG);
25306 case ISD::VAARG: return LowerVAARG(Op, DAG);
25307 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
25308 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
25309 case ISD::INTRINSIC_VOID:
25310 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
25311 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
25312 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
25313 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
25314 case ISD::FRAME_TO_ARGS_OFFSET:
25315 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
25316 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
25317 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
25318 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
25319 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
25320 case ISD::EH_SJLJ_SETUP_DISPATCH:
25321 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
25322 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
25323 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
25324 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
25326 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
25328 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
25329 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
25331 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
25332 case ISD::UMUL_LOHI:
25333 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
25335 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
25338 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
25344 case ISD::UMULO: return LowerXALUO(Op, DAG);
25345 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
25346 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
25347 case ISD::ADDCARRY:
25348 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
25350 case ISD::SUB: return LowerADD_SUB(Op, DAG);
25354 case ISD::UMIN: return LowerMINMAX(Op, DAG);
25355 case ISD::ABS: return LowerABS(Op, DAG);
25356 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
25357 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
25358 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
25359 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
25360 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
25361 case ISD::GC_TRANSITION_START:
25362 return LowerGC_TRANSITION_START(Op, DAG);
25363 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
25367 /// Places new result values for the node in Results (their number
25368 /// and types must exactly match those of the original return values of
25369 /// the node), or leaves Results empty, which indicates that the node is not
25370 /// to be custom lowered after all.
25371 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
25372 SmallVectorImpl<SDValue> &Results,
25373 SelectionDAG &DAG) const {
25374 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
25376 if (!Res.getNode())
25379 assert((N->getNumValues() <= Res->getNumValues()) &&
25380 "Lowering returned the wrong number of results!");
25382   // Place new result values based on the N result number.
25383   // In some cases (LowerSINT_TO_FP, for example) Res has more result values
25384   // than the original node; the chain (the last value) should be dropped.
25385 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
25386 Results.push_back(Res.getValue(I));
25389 /// Replace a node with an illegal result type with a new node built out of custom code.
25391 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
25392 SmallVectorImpl<SDValue>&Results,
25393 SelectionDAG &DAG) const {
25395 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25396 switch (N->getOpcode()) {
25398 llvm_unreachable("Do not know how to custom type legalize this operation!");
25399 case X86ISD::AVG: {
25400 // Legalize types for X86ISD::AVG by expanding vectors.
25401 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25403 auto InVT = N->getValueType(0);
25404 assert(InVT.getSizeInBits() < 128);
25405 assert(128 % InVT.getSizeInBits() == 0);
25406 unsigned NumConcat = 128 / InVT.getSizeInBits();
25408 EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
25409 InVT.getVectorElementType(),
25410 NumConcat * InVT.getVectorNumElements());
25412 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
25413 Ops[0] = N->getOperand(0);
25414 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25415 Ops[0] = N->getOperand(1);
25416 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25418 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
25419 if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
25420 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
25421 DAG.getIntPtrConstant(0, dl));
25422 Results.push_back(Res);
25426 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
25427     // the setcc result type is v2i1 because type legalization will end up with
25428 // a v4i1 setcc plus an extend.
25429 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
25430 if (N->getOperand(0).getValueType() != MVT::v2f32)
25432 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
25433 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25434 N->getOperand(0), UNDEF);
25435 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25436 N->getOperand(1), UNDEF);
25437 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
25439 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25440 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25441 DAG.getIntPtrConstant(0, dl));
25442 Results.push_back(Res);
25445 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
25446 case X86ISD::FMINC:
25448 case X86ISD::FMAXC:
25449 case X86ISD::FMAX: {
25450 EVT VT = N->getValueType(0);
25451 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
25452 SDValue UNDEF = DAG.getUNDEF(VT);
25453 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25454 N->getOperand(0), UNDEF);
25455 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25456 N->getOperand(1), UNDEF);
25457 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
25465 case ISD::UDIVREM: {
25466 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
25467 Results.push_back(V);
25470 case ISD::FP_TO_SINT:
25471 case ISD::FP_TO_UINT: {
25472 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
25473 EVT VT = N->getValueType(0);
25474 SDValue Src = N->getOperand(0);
25475 EVT SrcVT = Src.getValueType();
25477 if (VT == MVT::v2i32) {
25478 assert((IsSigned || Subtarget.hasAVX512()) &&
25479 "Can only handle signed conversion without AVX512");
25480 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25481 if (Src.getValueType() == MVT::v2f64) {
25482 MVT ResVT = MVT::v4i32;
25483 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
25484 if (!IsSigned && !Subtarget.hasVLX()) {
25485 // Widen to 512-bits.
25486 ResVT = MVT::v8i32;
25487 Opc = ISD::FP_TO_UINT;
25488 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
25489 DAG.getUNDEF(MVT::v8f64),
25490 Src, DAG.getIntPtrConstant(0, dl));
25492 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
25493 bool WidenType = getTypeAction(*DAG.getContext(),
25494 MVT::v2i32) == TypeWidenVector;
25495 ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
25496 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
25497 DAG.getIntPtrConstant(0, dl));
25498 Results.push_back(Res);
25501 if (SrcVT == MVT::v2f32) {
25502 SDValue Idx = DAG.getIntPtrConstant(0, dl);
25503 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25504 DAG.getUNDEF(MVT::v2f32));
25505 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
25506 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
25507 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25508 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
25509 Results.push_back(Res);
25513 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
25514 // so early out here.
25518 if (Subtarget.hasDQI() && VT == MVT::i64 &&
25519 (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
25520 assert(!Subtarget.is64Bit() && "i64 should be legal");
25521 unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
25522 // Using a 256-bit input here to guarantee 128-bit input for f32 case.
25523 // TODO: Use 128-bit vectors for f64 case?
25524 // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
25525 MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
25526 MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
25528 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
25529 SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
25530 DAG.getConstantFP(0.0, dl, VecInVT), Src,
25532 Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
25533 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
25534 Results.push_back(Res);
25538 std::pair<SDValue,SDValue> Vals =
25539 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
25540 SDValue FIST = Vals.first, StackSlot = Vals.second;
25541 if (FIST.getNode()) {
25542 // Return a load from the stack slot.
25543 if (StackSlot.getNode())
25545 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
25547 Results.push_back(FIST);
25551 case ISD::SINT_TO_FP: {
25552 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
25553 SDValue Src = N->getOperand(0);
25554 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
25556 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
25559 case ISD::UINT_TO_FP: {
25560 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25561 EVT VT = N->getValueType(0);
25562 if (VT != MVT::v2f32)
25564 SDValue Src = N->getOperand(0);
25565 EVT SrcVT = Src.getValueType();
25566 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
25567 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
25570 if (SrcVT != MVT::v2i32)
25572 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
25574 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
25575 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
25576 DAG.getBitcast(MVT::v2i64, VBias));
25577 Or = DAG.getBitcast(MVT::v2f64, Or);
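    // Editor's note on the constant above: 0x4330000000000000 is the bit
    // pattern of the double 2^52, so OR-ing a zero-extended u32 into its low
    // mantissa bits forms the exact double 2^52 + x, and subtracting 2^52
    // below recovers x as a double before the final round to f32.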
25578 // TODO: Are there any fast-math-flags to propagate here?
25579 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
25580 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
25583 case ISD::FP_ROUND: {
25584 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
25586 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
25587 Results.push_back(V);
25590 case ISD::FP_EXTEND: {
25591 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25592 // No other ValueType for FP_EXTEND should reach this point.
25593 assert(N->getValueType(0) == MVT::v2f32 &&
25594 "Do not know how to legalize this Node");
25597 case ISD::INTRINSIC_W_CHAIN: {
25598 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25600 default : llvm_unreachable("Do not know how to custom type "
25601 "legalize this intrinsic operation!");
25602 case Intrinsic::x86_rdtsc:
25603 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25605 case Intrinsic::x86_rdtscp:
25606 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25608 case Intrinsic::x86_rdpmc:
25609 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25611 case Intrinsic::x86_xgetbv:
25612 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25615 case ISD::INTRINSIC_WO_CHAIN: {
25616 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25617 Results.push_back(V);
25620 case ISD::READCYCLECOUNTER: {
25621 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25624 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25625 EVT T = N->getValueType(0);
25626 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25627 bool Regs64bit = T == MVT::i128;
25628 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
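    // Editor's note: CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX
    // (RDX:RAX) and the new value in ECX:EBX (RCX:RBX), return the old memory
    // value in EDX:EAX (RDX:RAX), and set ZF on success; the copies below set
    // up exactly that convention.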
25629 SDValue cpInL, cpInH;
25630 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25631 DAG.getConstant(0, dl, HalfT));
25632 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25633 DAG.getConstant(1, dl, HalfT));
25634 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25635 Regs64bit ? X86::RAX : X86::EAX,
25637 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25638 Regs64bit ? X86::RDX : X86::EDX,
25639 cpInH, cpInL.getValue(1));
25640 SDValue swapInL, swapInH;
25641 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25642 DAG.getConstant(0, dl, HalfT));
25643 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25644 DAG.getConstant(1, dl, HalfT));
25646 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25647 swapInH, cpInH.getValue(1));
25648     // If the current function needs the base pointer, RBX,
25649     // we shouldn't use cmpxchg directly. The lowering of that
25650     // instruction will clobber that register, and since RBX will
25651     // be a reserved register the register allocator will not make
25652     // sure its value is properly saved and restored around this
25653     // live range.
25654 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25656 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25657 unsigned BasePtr = TRI->getBaseRegister();
25658 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25659 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25660 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25661 // ISel prefers the LCMPXCHG64 variant.
25662 // If that assert breaks, that means it is not the case anymore,
25663 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25664 // not just EBX. This is a matter of accepting i64 input for that
25665       // pseudo, and restoring into the register of the right width
25666       // in the expand pseudo. Everything else should just work.
25667 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25668 "Saving only half of the RBX");
25669 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25670 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25671 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25672 Regs64bit ? X86::RBX : X86::EBX,
25673 HalfT, swapInH.getValue(1));
25674 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25676 /*Glue*/ RBXSave.getValue(2)};
25677 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25680 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25681 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25682 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25683 swapInH.getValue(1));
25684 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25685 swapInL.getValue(1)};
25686 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25688 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25689 Regs64bit ? X86::RAX : X86::EAX,
25690 HalfT, Result.getValue(1));
25691 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25692 Regs64bit ? X86::RDX : X86::EDX,
25693 HalfT, cpOutL.getValue(2));
25694 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25696 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25697 MVT::i32, cpOutH.getValue(2));
25698 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25699 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25701 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25702 Results.push_back(Success);
25703 Results.push_back(EFLAGS.getValue(1));
25706 case ISD::ATOMIC_SWAP:
25707 case ISD::ATOMIC_LOAD_ADD:
25708 case ISD::ATOMIC_LOAD_SUB:
25709 case ISD::ATOMIC_LOAD_AND:
25710 case ISD::ATOMIC_LOAD_OR:
25711 case ISD::ATOMIC_LOAD_XOR:
25712 case ISD::ATOMIC_LOAD_NAND:
25713 case ISD::ATOMIC_LOAD_MIN:
25714 case ISD::ATOMIC_LOAD_MAX:
25715 case ISD::ATOMIC_LOAD_UMIN:
25716 case ISD::ATOMIC_LOAD_UMAX:
25717 case ISD::ATOMIC_LOAD: {
25718 // Delegate to generic TypeLegalization. Situations we can really handle
25719 // should have already been dealt with by AtomicExpandPass.cpp.
25722 case ISD::BITCAST: {
25723 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25724 EVT DstVT = N->getValueType(0);
25725 EVT SrcVT = N->getOperand(0).getValueType();
25727     // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
25728 // we can split using the k-register rather than memory.
25729 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25730 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25732 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25733 Lo = DAG.getBitcast(MVT::i32, Lo);
25734 Hi = DAG.getBitcast(MVT::i32, Hi);
25735 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25736 Results.push_back(Res);
25740 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25741 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
25742 SrcVT.isVector() && isTypeLegal(SrcVT)) {
25744 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25745 MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
25746 Lo = DAG.getBitcast(CastVT, Lo);
25747 Hi = DAG.getBitcast(CastVT, Hi);
25748 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
25749 Results.push_back(Res);
25753 if (SrcVT != MVT::f64 ||
25754 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25757 unsigned NumElts = DstVT.getVectorNumElements();
25758 EVT SVT = DstVT.getVectorElementType();
25759 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25760 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25761 MVT::v2f64, N->getOperand(0));
25762 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25764 if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
25765 // If we are legalizing vectors by widening, we already have the desired
25766 // legal vector type, just return it.
25767 Results.push_back(ToVecInt);
25771 SmallVector<SDValue, 8> Elts;
25772 for (unsigned i = 0, e = NumElts; i != e; ++i)
25773 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25774 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25776 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25779 case ISD::MGATHER: {
25780 EVT VT = N->getValueType(0);
25781 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25782 auto *Gather = cast<MaskedGatherSDNode>(N);
25783 SDValue Index = Gather->getIndex();
25784 if (Index.getValueType() != MVT::v2i64)
25786 SDValue Mask = Gather->getMask();
25787 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25788 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25789 Gather->getValue(),
25790 DAG.getUNDEF(MVT::v2f32));
25791 if (!Subtarget.hasVLX()) {
25792 // We need to widen the mask, but the instruction will only use 2
25793 // of its elements. So we can use undef.
25794 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25795 DAG.getUNDEF(MVT::v2i1));
25796 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25798 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25799 Index, Gather->getScale() };
25800 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25801 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25802 Gather->getMemoryVT(), Gather->getMemOperand());
25803 Results.push_back(Res);
25804 Results.push_back(Res.getValue(2));
25807 if (VT == MVT::v2i32) {
25808 auto *Gather = cast<MaskedGatherSDNode>(N);
25809 SDValue Index = Gather->getIndex();
25810 SDValue Mask = Gather->getMask();
25811 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25812 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25813 Gather->getValue(),
25814 DAG.getUNDEF(MVT::v2i32));
25815 // If the index is v2i64 we can use it directly.
25816 if (Index.getValueType() == MVT::v2i64 &&
25817 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25818 if (!Subtarget.hasVLX()) {
25819 // We need to widen the mask, but the instruction will only use 2
25820 // of its elements. So we can use undef.
25821 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25822 DAG.getUNDEF(MVT::v2i1));
25823 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25825 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25826 Index, Gather->getScale() };
25827 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25828 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25829 Gather->getMemoryVT(), Gather->getMemOperand());
25830 SDValue Chain = Res.getValue(2);
25831 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25832 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25833 DAG.getIntPtrConstant(0, dl));
25834 Results.push_back(Res);
25835 Results.push_back(Chain);
25838 EVT IndexVT = Index.getValueType();
25839 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25840 IndexVT.getScalarType(), 4);
25841 // Otherwise we need to custom widen everything to avoid promotion.
25842 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25843 DAG.getUNDEF(IndexVT));
25844 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25845 DAG.getConstant(0, dl, MVT::v2i1));
25846 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25847 Index, Gather->getScale() };
25848 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25849 Gather->getMemoryVT(), dl, Ops,
25850 Gather->getMemOperand());
25851 SDValue Chain = Res.getValue(1);
25852 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25853 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25854 DAG.getIntPtrConstant(0, dl));
25855 Results.push_back(Res);
25856 Results.push_back(Chain);
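// Illustrative sketch: a v2i32 gather that cannot use the paths above is
// widened wholesale -- the index gets undef upper lanes, the mask gets zero
// upper lanes (so the extra lanes can never load or fault), the gather is
// emitted as v4i32, and the low v2i32 half is extracted back out when the
// surrounding code still expects the narrow type.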
25864 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25865 switch ((X86ISD::NodeType)Opcode) {
25866 case X86ISD::FIRST_NUMBER: break;
25867 case X86ISD::BSF: return "X86ISD::BSF";
25868 case X86ISD::BSR: return "X86ISD::BSR";
25869 case X86ISD::SHLD: return "X86ISD::SHLD";
25870 case X86ISD::SHRD: return "X86ISD::SHRD";
25871 case X86ISD::FAND: return "X86ISD::FAND";
25872 case X86ISD::FANDN: return "X86ISD::FANDN";
25873 case X86ISD::FOR: return "X86ISD::FOR";
25874 case X86ISD::FXOR: return "X86ISD::FXOR";
25875 case X86ISD::FILD: return "X86ISD::FILD";
25876 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25877 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25878 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25879 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25880 case X86ISD::FLD: return "X86ISD::FLD";
25881 case X86ISD::FST: return "X86ISD::FST";
25882 case X86ISD::CALL: return "X86ISD::CALL";
25883 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25884 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25885 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25886 case X86ISD::BT: return "X86ISD::BT";
25887 case X86ISD::CMP: return "X86ISD::CMP";
25888 case X86ISD::COMI: return "X86ISD::COMI";
25889 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25890 case X86ISD::CMPM: return "X86ISD::CMPM";
25891 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25892 case X86ISD::SETCC: return "X86ISD::SETCC";
25893 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25894 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25895 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25896 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25897 case X86ISD::CMOV: return "X86ISD::CMOV";
25898 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25899 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25900 case X86ISD::IRET: return "X86ISD::IRET";
25901 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25902 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25903 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25904 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25905 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25906 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25907 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25908 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25909 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25910 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25911 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25912 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25913 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25914 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25915 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25916 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25917 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25918 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25919 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25920 case X86ISD::HADD: return "X86ISD::HADD";
25921 case X86ISD::HSUB: return "X86ISD::HSUB";
25922 case X86ISD::FHADD: return "X86ISD::FHADD";
25923 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25924 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25925 case X86ISD::FMAX: return "X86ISD::FMAX";
25926 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25927 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25928 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25929 case X86ISD::FMIN: return "X86ISD::FMIN";
25930 case X86ISD::FMINS: return "X86ISD::FMINS";
25931 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25932 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25933 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25934 case X86ISD::FMINC: return "X86ISD::FMINC";
25935 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25936 case X86ISD::FRCP: return "X86ISD::FRCP";
25937 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25938 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25939 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25940 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25941 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25942 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25943 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25944 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25945 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25946 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25947 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25948 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25949 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25950 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25951 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25952 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25953 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25954 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25955 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25956 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25957 case X86ISD::LADD: return "X86ISD::LADD";
25958 case X86ISD::LSUB: return "X86ISD::LSUB";
25959 case X86ISD::LOR: return "X86ISD::LOR";
25960 case X86ISD::LXOR: return "X86ISD::LXOR";
25961 case X86ISD::LAND: return "X86ISD::LAND";
25962 case X86ISD::LINC: return "X86ISD::LINC";
25963 case X86ISD::LDEC: return "X86ISD::LDEC";
25964 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25965 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25966 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25967 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25968 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25969 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25970 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25971 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25972 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25973 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25974 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25975 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25976 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25977 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25978 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25979 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25980 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25981 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25982 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25983 case X86ISD::VSHL: return "X86ISD::VSHL";
25984 case X86ISD::VSRL: return "X86ISD::VSRL";
25985 case X86ISD::VSRA: return "X86ISD::VSRA";
25986 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25987 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25988 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25989 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25990 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25991 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25992 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25993 case X86ISD::CMPP: return "X86ISD::CMPP";
25994 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25995 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25996 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25997 case X86ISD::ADD: return "X86ISD::ADD";
25998 case X86ISD::SUB: return "X86ISD::SUB";
25999 case X86ISD::ADC: return "X86ISD::ADC";
26000 case X86ISD::SBB: return "X86ISD::SBB";
26001 case X86ISD::SMUL: return "X86ISD::SMUL";
26002 case X86ISD::UMUL: return "X86ISD::UMUL";
26003 case X86ISD::SMUL8: return "X86ISD::SMUL8";
26004 case X86ISD::UMUL8: return "X86ISD::UMUL8";
26005 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
26006 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
26007 case X86ISD::INC: return "X86ISD::INC";
26008 case X86ISD::DEC: return "X86ISD::DEC";
26009 case X86ISD::OR: return "X86ISD::OR";
26010 case X86ISD::XOR: return "X86ISD::XOR";
26011 case X86ISD::AND: return "X86ISD::AND";
26012 case X86ISD::BEXTR: return "X86ISD::BEXTR";
26013 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
26014 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
26015 case X86ISD::PTEST: return "X86ISD::PTEST";
26016 case X86ISD::TESTP: return "X86ISD::TESTP";
26017 case X86ISD::KORTEST: return "X86ISD::KORTEST";
26018 case X86ISD::KTEST: return "X86ISD::KTEST";
26019 case X86ISD::KADD: return "X86ISD::KADD";
26020 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
26021 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
26022 case X86ISD::PACKSS: return "X86ISD::PACKSS";
26023 case X86ISD::PACKUS: return "X86ISD::PACKUS";
26024 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
26025 case X86ISD::VALIGN: return "X86ISD::VALIGN";
26026 case X86ISD::VSHLD: return "X86ISD::VSHLD";
26027 case X86ISD::VSHRD: return "X86ISD::VSHRD";
26028 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
26029 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
26030 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
26031 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
26032 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
26033 case X86ISD::SHUFP: return "X86ISD::SHUFP";
26034 case X86ISD::SHUF128: return "X86ISD::SHUF128";
26035 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
26036 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
26037 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
26038 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
26039 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
26040 case X86ISD::MOVSD: return "X86ISD::MOVSD";
26041 case X86ISD::MOVSS: return "X86ISD::MOVSS";
26042 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
26043 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
26044 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
26045 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
26046 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
26047 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
26048 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
26049 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
26050 case X86ISD::VPERMV: return "X86ISD::VPERMV";
26051 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
26052 case X86ISD::VPERMI: return "X86ISD::VPERMI";
26053 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
26054 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
26055 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
26056 case X86ISD::VRANGE: return "X86ISD::VRANGE";
26057 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
26058 case X86ISD::VRANGES: return "X86ISD::VRANGES";
26059 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
26060 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
26061 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
26062 case X86ISD::PSADBW: return "X86ISD::PSADBW";
26063 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
26064 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
26065 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
26066 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
26067 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
26068 case X86ISD::MFENCE: return "X86ISD::MFENCE";
26069 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
26070 case X86ISD::SAHF: return "X86ISD::SAHF";
26071 case X86ISD::RDRAND: return "X86ISD::RDRAND";
26072 case X86ISD::RDSEED: return "X86ISD::RDSEED";
26073 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
26074 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
26075 case X86ISD::VPSHA: return "X86ISD::VPSHA";
26076 case X86ISD::VPSHL: return "X86ISD::VPSHL";
26077 case X86ISD::VPCOM: return "X86ISD::VPCOM";
26078 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
26079 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
26080 case X86ISD::FMSUB: return "X86ISD::FMSUB";
26081 case X86ISD::FNMADD: return "X86ISD::FNMADD";
26082 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
26083 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
26084 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
26085 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
26086 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
26087 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
26088 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
26089 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
26090 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
26091 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
26092 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
26093 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
26094 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
26095 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
26096 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
26097 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
26098 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
26099 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
26100 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
26101 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
26102 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
26103 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
26104 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
26105 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
26106 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
26107 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
26108 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
26109 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
26110 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
26111 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
26112 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
26113 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
26114 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
26115 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
26116 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
26117 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
26118 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
26119 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
26120 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
26121 case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
26122 case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
26123 case X86ISD::XTEST: return "X86ISD::XTEST";
26124 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
26125 case X86ISD::EXPAND: return "X86ISD::EXPAND";
26126 case X86ISD::SELECT: return "X86ISD::SELECT";
26127 case X86ISD::SELECTS: return "X86ISD::SELECTS";
26128 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
26129 case X86ISD::RCP14: return "X86ISD::RCP14";
26130 case X86ISD::RCP14S: return "X86ISD::RCP14S";
26131 case X86ISD::RCP28: return "X86ISD::RCP28";
26132 case X86ISD::RCP28S: return "X86ISD::RCP28S";
26133 case X86ISD::EXP2: return "X86ISD::EXP2";
26134 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
26135 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
26136 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
26137 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
26138 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
26139 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
26140 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
26141 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
26142 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
26143 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
26144 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
26145 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
26146 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
26147 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
26148 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
26149 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
26150 case X86ISD::SCALEF: return "X86ISD::SCALEF";
26151 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
26152 case X86ISD::ADDS: return "X86ISD::ADDS";
26153 case X86ISD::SUBS: return "X86ISD::SUBS";
26154 case X86ISD::AVG: return "X86ISD::AVG";
26155 case X86ISD::MULHRS: return "X86ISD::MULHRS";
26156 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
26157 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
26158 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
26159 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
26160 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
26161 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
26162 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
26163 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
26164 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
26165 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
26166 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
26167 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
26168 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
26169 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
26170 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
26171 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
26172 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
26173 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
26174 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
26175 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
26176 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
26177 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
26178 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
26179 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
26180 case X86ISD::LWPINS: return "X86ISD::LWPINS";
26181 case X86ISD::MGATHER: return "X86ISD::MGATHER";
26182 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
26183 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
26184 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
26185 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
26186 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
26187 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
26188 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
26189 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
26190 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
26191 case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
26192 case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
26193 case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
26194 case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
26199 /// Return true if the addressing mode represented by AM is legal for this
26200 /// target, for a load/store of the specified type.
26201 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
26202 const AddrMode &AM, Type *Ty,
26204 Instruction *I) const {
26205 // X86 supports extremely general addressing modes.
26206 CodeModel::Model M = getTargetMachine().getCodeModel();
26208 // X86 allows a sign-extended 32-bit immediate field as a displacement.
26209 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
26213 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
26215 // If a reference to this global requires an extra load, we can't fold it.
26216 if (isGlobalStubReference(GVFlags))
26219 // If BaseGV requires a register for the PIC base, we cannot also have a
26220 // BaseReg specified.
26221 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
26224 // If lower 4G is not available, then we must use rip-relative addressing.
26225 if ((M != CodeModel::Small || isPositionIndependent()) &&
26226 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
26230 switch (AM.Scale) {
26236 // These scales always work.
26241 // These scales are formed with basereg+scalereg. Only accept if there is
26246 default: // Other stuff never works.
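// Illustrative examples (a sketch, not exhaustive): the general x86 form is
//   [BaseReg + IndexReg * Scale + Disp32], e.g. lea (%rdi,%rsi,4), %rax.
// Scales of 1, 2, 4 and 8 encode directly in the SIB byte; 3, 5 and 9 are
// only expressible as IndexReg + IndexReg * {2,4,8}, which consumes the base
// register slot and is why they are rejected above when a base register is
// already present.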
26253 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
26254 unsigned Bits = Ty->getScalarSizeInBits();
26256 // 8-bit shifts are always expensive, and the versions with a scalar amount
26257 // aren't noticeably cheaper than the fully variable ones.
26261 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
26262 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
26263 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
26266 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
26267 // shifts just as cheap as scalar ones.
26268 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
26271 // AVX512BW has shifts such as vpsllvw.
26272 if (Subtarget.hasBWI() && Bits == 16)
26275 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
26276 // fully general vector.
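// Illustrative example (a sketch): on plain SSE2, (shl v8i16 %x, %splat) can
// use a single PSLLW with the shift amount in an XMM register, whereas a
// fully variable (shl v8i16 %x, %amts) has to be emulated lane by lane, so
// shifting by a uniform scalar amount is reported as cheap here.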
26280 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
26281 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26283 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
26284 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
26285 return NumBits1 > NumBits2;
26288 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
26289 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
26292 if (!isTypeLegal(EVT::getEVT(Ty1)))
26295 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
26297 // Assuming the caller doesn't have a zeroext or signext return parameter,
26298 // truncation all the way down to i1 is valid.
26302 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
26303 return isInt<32>(Imm);
26306 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
26307 // Can also use sub to handle negated immediates.
26308 return isInt<32>(Imm);
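// Illustrative example (a sketch): 'addq $0x7fffffff, %rax' encodes its
// constant as a sign-extended imm32 and is legal here, whereas adding
// 0x100000000 would first require materializing the constant with a movabsq
// into a scratch register.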
26311 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
26312 if (!VT1.isInteger() || !VT2.isInteger())
26314 unsigned NumBits1 = VT1.getSizeInBits();
26315 unsigned NumBits2 = VT2.getSizeInBits();
26316 return NumBits1 > NumBits2;
26319 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
26320 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26321 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
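// Illustrative example (a sketch): on x86-64, 'movl %edi, %eax' already
// clears bits 63:32 of %rax, so zero-extending a freshly produced 32-bit
// value to i64 costs no extra instruction.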
26324 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
26325 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
26326 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
26329 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
26330 EVT VT1 = Val.getValueType();
26331 if (isZExtFree(VT1, VT2))
26334 if (Val.getOpcode() != ISD::LOAD)
26337 if (!VT1.isSimple() || !VT1.isInteger() ||
26338 !VT2.isSimple() || !VT2.isInteger())
26341 switch (VT1.getSimpleVT().SimpleTy) {
26346 // X86 has 8, 16, and 32-bit zero-extending loads.
26353 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26354 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26356 // There is no extending load for vXi1.
26357 if (SrcVT.getScalarType() == MVT::i1)
26364 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26365 if (!Subtarget.hasAnyFMA())
26368 VT = VT.getScalarType();
26370 if (!VT.isSimple())
26373 switch (VT.getSimpleVT().SimpleTy) {
26384 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26385 // i16 instructions are longer (0x66 prefix) and potentially slower.
26386 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
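// Illustrative example (a sketch): a 16-bit 'addw $imm, %cx' needs the 0x66
// operand-size prefix and is longer to encode (and on some cores slower to
// decode) than the equivalent 32-bit 'addl', which is why narrowing i32
// operations to i16 is reported as unprofitable.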
26389 /// Targets can use this to indicate that they only support *some*
26390 /// VECTOR_SHUFFLE operations, those with specific masks.
26391 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26392 /// are assumed to be legal.
26393 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26394 if (!VT.isSimple())
26397 // Not for i1 vectors
26398 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26401 // Very little shuffling can be done for 64-bit vectors right now.
26402 if (VT.getSimpleVT().getSizeInBits() == 64)
26405 // We only care that the types being shuffled are legal. The lowering can
26406 // handle any possible shuffle mask that results.
26407 return isTypeLegal(VT.getSimpleVT());
26410 bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
26412 // Don't convert an 'and' into a shuffle that we don't directly support.
26413 // vpblendw and vpshufb for 256-bit vectors are not available on AVX1.
26414 if (!Subtarget.hasAVX2())
26415 if (VT == MVT::v32i8 || VT == MVT::v16i16)
26418 // Just delegate to the generic legality, clear masks aren't special.
26419 return isShuffleMaskLegal(Mask, VT);
26422 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26423 // If the subtarget is using retpolines, we need to not generate jump tables.
26424 if (Subtarget.useRetpoline())
26427 // Otherwise, fallback on the generic logic.
26428 return TargetLowering::areJTsAllowed(Fn);
26431 //===----------------------------------------------------------------------===//
26432 // X86 Scheduler Hooks
26433 //===----------------------------------------------------------------------===//
26435 /// Utility function to emit xbegin specifying the start of an RTM region.
26436 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26437 const TargetInstrInfo *TII) {
26438 DebugLoc DL = MI.getDebugLoc();
26440 const BasicBlock *BB = MBB->getBasicBlock();
26441 MachineFunction::iterator I = ++MBB->getIterator();
26443 // For the v = xbegin(), we generate
26452 // eax = # XABORT_DEF
26456 // v = phi(s0/mainBB, s1/fallBB)
26458 MachineBasicBlock *thisMBB = MBB;
26459 MachineFunction *MF = MBB->getParent();
26460 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26461 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26462 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26463 MF->insert(I, mainMBB);
26464 MF->insert(I, fallMBB);
26465 MF->insert(I, sinkMBB);
26467 // Transfer the remainder of BB and its successor edges to sinkMBB.
26468 sinkMBB->splice(sinkMBB->begin(), MBB,
26469 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26470 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26472 MachineRegisterInfo &MRI = MF->getRegInfo();
26473 unsigned DstReg = MI.getOperand(0).getReg();
26474 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26475 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26476 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26480 // # fallthrough to mainMBB
26481 // # on abort, jump to fallMBB
26482 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26483 thisMBB->addSuccessor(mainMBB);
26484 thisMBB->addSuccessor(fallMBB);
26487 // mainDstReg := -1
26488 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26489 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26490 mainMBB->addSuccessor(sinkMBB);
26493 // ; pseudo instruction to model the hardware's definition of EAX from XABORT
26494 // EAX := XABORT_DEF
26495 // fallDstReg := EAX
26496 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26497 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26499 fallMBB->addSuccessor(sinkMBB);
26502 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26503 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26504 .addReg(mainDstReg).addMBB(mainMBB)
26505 .addReg(fallDstReg).addMBB(fallMBB);
26507 MI.eraseFromParent();
26511 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26512 const X86Subtarget &Subtarget) {
26513 DebugLoc dl = MI.getDebugLoc();
26514 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26516 // insert input VAL into EAX
26517 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26518 .addReg(MI.getOperand(0).getReg());
26519 // insert zero to ECX
26520 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26522 // insert zero to EDX
26523 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26525 // insert WRPKRU instruction
26526 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26528 MI.eraseFromParent(); // The pseudo is gone now.
26532 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26533 const X86Subtarget &Subtarget) {
26534 DebugLoc dl = MI.getDebugLoc();
26535 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26537 // insert zero to ECX
26538 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26540 // insert RDPKRU instruction
26541 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26542 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26545 MI.eraseFromParent(); // The pseudo is gone now.
26549 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26550 const X86Subtarget &Subtarget,
26552 DebugLoc dl = MI.getDebugLoc();
26553 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26554 // Address into RAX/EAX, other two args into ECX, EDX.
26555 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26556 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26557 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26558 for (int i = 0; i < X86::AddrNumOperands; ++i)
26559 MIB.add(MI.getOperand(i));
26561 unsigned ValOps = X86::AddrNumOperands;
26562 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26563 .addReg(MI.getOperand(ValOps).getReg());
26564 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26565 .addReg(MI.getOperand(ValOps + 1).getReg());
26567 // The instruction doesn't actually take any operands though.
26568 BuildMI(*BB, MI, dl, TII->get(Opc));
26570 MI.eraseFromParent(); // The pseudo is gone now.
26574 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26575 const X86Subtarget &Subtarget) {
26576 DebugLoc dl = MI->getDebugLoc();
26577 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26578 // Address into RAX/EAX
26579 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26580 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26581 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26582 for (int i = 0; i < X86::AddrNumOperands; ++i)
26583 MIB.add(MI->getOperand(i));
26585 // The instruction doesn't actually take any operands though.
26586 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26588 MI->eraseFromParent(); // The pseudo is gone now.
26594 MachineBasicBlock *
26595 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26596 MachineBasicBlock *MBB) const {
26597 // Emit va_arg instruction on X86-64.
26599 // Operands to this pseudo-instruction:
26600 // 0 ) Output : destination address (reg)
26601 // 1-5) Input : va_list address (addr, i64mem)
26602 // 6 ) ArgSize : Size (in bytes) of vararg type
26603 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26604 // 8 ) Align : Alignment of type
26605 // 9 ) EFLAGS (implicit-def)
26607 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26608 static_assert(X86::AddrNumOperands == 5,
26609 "VAARG_64 assumes 5 address operands");
26611 unsigned DestReg = MI.getOperand(0).getReg();
26612 MachineOperand &Base = MI.getOperand(1);
26613 MachineOperand &Scale = MI.getOperand(2);
26614 MachineOperand &Index = MI.getOperand(3);
26615 MachineOperand &Disp = MI.getOperand(4);
26616 MachineOperand &Segment = MI.getOperand(5);
26617 unsigned ArgSize = MI.getOperand(6).getImm();
26618 unsigned ArgMode = MI.getOperand(7).getImm();
26619 unsigned Align = MI.getOperand(8).getImm();
26621 // Memory Reference
26622 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26623 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26624 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26626 // Machine Information
26627 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26628 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26629 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26630 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26631 DebugLoc DL = MI.getDebugLoc();
26633 // struct va_list {
//   i32 gp_offset
//   i32 fp_offset
26636 //   i64 overflow_area (address)
26637 // i64 reg_save_area (address)
26639 // sizeof(va_list) = 24
26640 // alignment(va_list) = 8
26642 unsigned TotalNumIntRegs = 6;
26643 unsigned TotalNumXMMRegs = 8;
26644 bool UseGPOffset = (ArgMode == 1);
26645 bool UseFPOffset = (ArgMode == 2);
26646 unsigned MaxOffset = TotalNumIntRegs * 8 +
26647 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
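// Worked example (a sketch): the 6 integer registers occupy bytes [0, 48) of
// the reg_save_area and the 8 XMM registers occupy bytes [48, 176), so
// MaxOffset is 48 when pulling a GP argument and 176 when pulling an FP
// argument.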
26649 // Align ArgSize to a multiple of 8.
26650 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26651 bool NeedsAlign = (Align > 8);
26653 MachineBasicBlock *thisMBB = MBB;
26654 MachineBasicBlock *overflowMBB;
26655 MachineBasicBlock *offsetMBB;
26656 MachineBasicBlock *endMBB;
26658 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26659 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26660 unsigned OffsetReg = 0;
26662 if (!UseGPOffset && !UseFPOffset) {
26663 // If we only pull from the overflow region, we don't create a branch.
26664 // We don't need to alter control flow.
26665 OffsetDestReg = 0; // unused
26666 OverflowDestReg = DestReg;
26668 offsetMBB = nullptr;
26669 overflowMBB = thisMBB;
26672 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26673 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26674 // If not, pull from overflow_area. (branch to overflowMBB)
26679 // offsetMBB overflowMBB
26684 // Registers for the PHI in endMBB
26685 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26686 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26688 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26689 MachineFunction *MF = MBB->getParent();
26690 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26691 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26692 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26694 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26696 // Insert the new basic blocks
26697 MF->insert(MBBIter, offsetMBB);
26698 MF->insert(MBBIter, overflowMBB);
26699 MF->insert(MBBIter, endMBB);
26701 // Transfer the remainder of MBB and its successor edges to endMBB.
26702 endMBB->splice(endMBB->begin(), thisMBB,
26703 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26704 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26706 // Make offsetMBB and overflowMBB successors of thisMBB
26707 thisMBB->addSuccessor(offsetMBB);
26708 thisMBB->addSuccessor(overflowMBB);
26710 // endMBB is a successor of both offsetMBB and overflowMBB
26711 offsetMBB->addSuccessor(endMBB);
26712 overflowMBB->addSuccessor(endMBB);
26714 // Load the offset value into a register
26715 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26716 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26720 .addDisp(Disp, UseFPOffset ? 4 : 0)
26722 .setMemRefs(MMOBegin, MMOEnd);
26724 // Check if there is enough room left to pull this argument.
26725 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26727 .addImm(MaxOffset + 8 - ArgSizeA8);
26729 // Branch to "overflowMBB" if offset >= max
26730 // Fall through to "offsetMBB" otherwise
26731 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26732 .addMBB(overflowMBB);
26735 // In offsetMBB, emit code to use the reg_save_area.
26737 assert(OffsetReg != 0);
26739 // Read the reg_save_area address.
26740 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26741 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26747 .setMemRefs(MMOBegin, MMOEnd);
26749 // Zero-extend the offset
26750 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26751 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26754 .addImm(X86::sub_32bit);
26756 // Add the offset to the reg_save_area to get the final address.
26757 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26758 .addReg(OffsetReg64)
26759 .addReg(RegSaveReg);
26761 // Compute the offset for the next argument
26762 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26763 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26765 .addImm(UseFPOffset ? 16 : 8);
26767 // Store it back into the va_list.
26768 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26772 .addDisp(Disp, UseFPOffset ? 4 : 0)
26774 .addReg(NextOffsetReg)
26775 .setMemRefs(MMOBegin, MMOEnd);
26778 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26783 // Emit code to use overflow area
26786 // Load the overflow_area address into a register.
26787 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26788 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26794 .setMemRefs(MMOBegin, MMOEnd);
26796 // If we need to align it, do so. Otherwise, just copy the address
26797 // to OverflowDestReg.
26799 // Align the overflow address
26800 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26801 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26803 // aligned_addr = (addr + (align-1)) & ~(align-1)
26804 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26805 .addReg(OverflowAddrReg)
26808 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26810 .addImm(~(uint64_t)(Align-1));
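// Worked example (a sketch): with Align == 16 and an overflow address ending
// in 0x38, the ADD yields ...0x47 and the AND with ~15 yields ...0x40, i.e.
// the next 16-byte-aligned slot at or above the original address.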
26812 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26813 .addReg(OverflowAddrReg);
26816 // Compute the next overflow address after this argument.
26817 // (the overflow address should be kept 8-byte aligned)
26818 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26819 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26820 .addReg(OverflowDestReg)
26821 .addImm(ArgSizeA8);
26823 // Store the new overflow address.
26824 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26830 .addReg(NextAddrReg)
26831 .setMemRefs(MMOBegin, MMOEnd);
26833 // If we branched, emit the PHI to the front of endMBB.
26835 BuildMI(*endMBB, endMBB->begin(), DL,
26836 TII->get(X86::PHI), DestReg)
26837 .addReg(OffsetDestReg).addMBB(offsetMBB)
26838 .addReg(OverflowDestReg).addMBB(overflowMBB);
26841 // Erase the pseudo instruction
26842 MI.eraseFromParent();
26847 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26848 MachineInstr &MI, MachineBasicBlock *MBB) const {
26849 // Emit code to save XMM registers to the stack. The ABI says that the
26850 // number of registers to save is given in %al, so it's theoretically
26851 // possible to do an indirect jump trick to avoid saving all of them,
26852 // however this code takes a simpler approach and just executes all
26853 // of the stores if %al is non-zero. It's less code, and it's probably
26854 // easier on the hardware branch predictor, and stores aren't all that
26855 // expensive anyway.
26857 // Create the new basic blocks. One block contains all the XMM stores,
26858 // and one block is the final destination regardless of whether any
26859 // stores were performed.
26860 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26861 MachineFunction *F = MBB->getParent();
26862 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26863 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26864 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26865 F->insert(MBBIter, XMMSaveMBB);
26866 F->insert(MBBIter, EndMBB);
26868 // Transfer the remainder of MBB and its successor edges to EndMBB.
26869 EndMBB->splice(EndMBB->begin(), MBB,
26870 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26871 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26873 // The original block will now fall through to the XMM save block.
26874 MBB->addSuccessor(XMMSaveMBB);
26875 // The XMMSaveMBB will fall through to the end block.
26876 XMMSaveMBB->addSuccessor(EndMBB);
26878 // Now add the instructions.
26879 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26880 DebugLoc DL = MI.getDebugLoc();
26882 unsigned CountReg = MI.getOperand(0).getReg();
26883 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26884 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26886 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26887 // If %al is 0, branch around the XMM save block.
26888 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26889 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26890 MBB->addSuccessor(EndMBB);
26893 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26894 // that was just emitted, but clearly shouldn't be "saved".
26895 assert((MI.getNumOperands() <= 3 ||
26896 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26897 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26898 "Expected last argument to be EFLAGS");
26899 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26900 // In the XMM save block, save all the XMM argument registers.
26901 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26902 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26903 MachineMemOperand *MMO = F->getMachineMemOperand(
26904 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26905 MachineMemOperand::MOStore,
26906 /*Size=*/16, /*Align=*/16);
26907 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26908 .addFrameIndex(RegSaveFrameIndex)
26909 .addImm(/*Scale=*/1)
26910 .addReg(/*IndexReg=*/0)
26911 .addImm(/*Disp=*/Offset)
26912 .addReg(/*Segment=*/0)
26913 .addReg(MI.getOperand(i).getReg())
26914 .addMemOperand(MMO);
26917 MI.eraseFromParent(); // The pseudo instruction is gone now.
26922 // The EFLAGS operand of SelectItr might be missing a kill marker
26923 // because there were multiple uses of EFLAGS, and ISel didn't know
26924 // which to mark. Figure out whether SelectItr should have had a
26925 // kill marker, and set it if it should. Returns the correct kill marker value.
26927 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26928 MachineBasicBlock* BB,
26929 const TargetRegisterInfo* TRI) {
26930 // Scan forward through BB for a use/def of EFLAGS.
26931 MachineBasicBlock::iterator miI(std::next(SelectItr));
26932 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26933 const MachineInstr& mi = *miI;
26934 if (mi.readsRegister(X86::EFLAGS))
26936 if (mi.definesRegister(X86::EFLAGS))
26937 break; // Should have kill-flag - update below.
26940 // If we hit the end of the block, check whether EFLAGS is live into a
26942 if (miI == BB->end()) {
26943 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26944 sEnd = BB->succ_end();
26945 sItr != sEnd; ++sItr) {
26946 MachineBasicBlock* succ = *sItr;
26947 if (succ->isLiveIn(X86::EFLAGS))
26952 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26953 // out. SelectMI should have a kill flag on EFLAGS.
26954 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26958 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26959 // together with other CMOV pseudo-opcodes into a single basic-block with
26960 // a conditional jump around it.
26961 static bool isCMOVPseudo(MachineInstr &MI) {
26962 switch (MI.getOpcode()) {
26963 case X86::CMOV_FR32:
26964 case X86::CMOV_FR64:
26965 case X86::CMOV_GR8:
26966 case X86::CMOV_GR16:
26967 case X86::CMOV_GR32:
26968 case X86::CMOV_RFP32:
26969 case X86::CMOV_RFP64:
26970 case X86::CMOV_RFP80:
26971 case X86::CMOV_V2F64:
26972 case X86::CMOV_V2I64:
26973 case X86::CMOV_V4F32:
26974 case X86::CMOV_V4F64:
26975 case X86::CMOV_V4I64:
26976 case X86::CMOV_V16F32:
26977 case X86::CMOV_V8F32:
26978 case X86::CMOV_V8F64:
26979 case X86::CMOV_V8I64:
26980 case X86::CMOV_V8I1:
26981 case X86::CMOV_V16I1:
26982 case X86::CMOV_V32I1:
26983 case X86::CMOV_V64I1:
26991 // Helper function which inserts PHI functions into SinkMBB:
26992 //  %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26993 // where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
26994 // in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
26995 // the last PHI function inserted.
26996 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26997 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26998 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26999 MachineBasicBlock *SinkMBB) {
27000 MachineFunction *MF = TrueMBB->getParent();
27001 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
27002 DebugLoc DL = MIItBegin->getDebugLoc();
27004 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
27005 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27007 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
27009 // As we are creating the PHIs, we have to be careful if there is more than
27010 // one. Later CMOVs may reference the results of earlier CMOVs, but later
27011 // PHIs have to reference the individual true/false inputs from earlier PHIs.
27012 // That also means that PHI construction must work forward from earlier to
27013 // later, and that the code must maintain a mapping from each earlier PHI's
27014 // destination register to the pair of registers that went into that PHI.
27015 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
27016 MachineInstrBuilder MIB;
27018 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
27019 unsigned DestReg = MIIt->getOperand(0).getReg();
27020 unsigned Op1Reg = MIIt->getOperand(1).getReg();
27021 unsigned Op2Reg = MIIt->getOperand(2).getReg();
27023 // If the CMOV we are generating has the opposite condition to the
27024 // jump we generated, then we have to swap the operands for the
27025 // PHI that is going to be generated.
27026 if (MIIt->getOperand(3).getImm() == OppCC)
27027 std::swap(Op1Reg, Op2Reg);
27029 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
27030 Op1Reg = RegRewriteTable[Op1Reg].first;
27032 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
27033 Op2Reg = RegRewriteTable[Op2Reg].second;
27035 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
27041 // Add this PHI to the rewrite table.
27042 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
27048 // Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
27049 MachineBasicBlock *
27050 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
27051 MachineInstr &SecondCascadedCMOV,
27052 MachineBasicBlock *ThisMBB) const {
27053 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27054 DebugLoc DL = FirstCMOV.getDebugLoc();
27056 // We lower cascaded CMOVs such as
27058 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
27060 // to two successive branches.
27062 // Without this, we would add a PHI between the two jumps, which ends up
27063 // creating a few copies all around. For instance, for
27065 // (sitofp (zext (fcmp une)))
27067 // we would generate:
27069 // ucomiss %xmm1, %xmm0
27070 // movss <1.0f>, %xmm0
27071 // movaps %xmm0, %xmm1
27073 // xorps %xmm1, %xmm1
27076 // movaps %xmm1, %xmm0
27080 // because this custom-inserter would have generated:
27092 // A: X = ...; Y = ...
27094 // C: Z = PHI [X, A], [Y, B]
27096 // E: PHI [X, C], [Z, D]
27098 // If we lower both CMOVs in a single step, we can instead generate:
27110 // A: X = ...; Y = ...
27112 // E: PHI [X, A], [X, C], [Y, D]
27114 // Which, in our sitofp/fcmp example, gives us something like:
27116 // ucomiss %xmm1, %xmm0
27117 // movss <1.0f>, %xmm0
27120 // xorps %xmm0, %xmm0
27125 // We lower cascaded CMOV into two successive branches to the same block.
27126 // EFLAGS is used by both, so mark it as live in the second.
27127 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27128 MachineFunction *F = ThisMBB->getParent();
27129 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27130 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
27131 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27133 MachineFunction::iterator It = ++ThisMBB->getIterator();
27134 F->insert(It, FirstInsertedMBB);
27135 F->insert(It, SecondInsertedMBB);
27136 F->insert(It, SinkMBB);
27138 // For a cascaded CMOV, we lower it to two successive branches to
27139 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
27140 // the FirstInsertedMBB.
27141 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
27143 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27144 // live into the sink and copy blocks.
27145 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27146 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
27147 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
27148 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
27149 SinkMBB->addLiveIn(X86::EFLAGS);
27152 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27153 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27154 std::next(MachineBasicBlock::iterator(FirstCMOV)),
27156 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27158 // Fallthrough block for ThisMBB.
27159 ThisMBB->addSuccessor(FirstInsertedMBB);
27160 // The true block target of the first branch is always SinkMBB.
27161 ThisMBB->addSuccessor(SinkMBB);
27162 // Fallthrough block for FirstInsertedMBB.
27163 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
27164 // The true block for the branch of FirstInsertedMBB.
27165 FirstInsertedMBB->addSuccessor(SinkMBB);
27166 // This is fallthrough.
27167 SecondInsertedMBB->addSuccessor(SinkMBB);
27169 // Create the conditional branch instructions.
27170 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
27171 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
27172 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27174 X86::CondCode SecondCC =
27175 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
27176 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
27177 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
27180 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
27181 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
27182 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
27183 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
27184 MachineInstrBuilder MIB =
27185 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
27187 .addMBB(SecondInsertedMBB)
27191 // SecondInsertedMBB provides the same incoming value as
27192 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
27193 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
27194 // Copy the PHI result to the register defined by the second CMOV.
27195 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
27196 TII->get(TargetOpcode::COPY),
27197 SecondCascadedCMOV.getOperand(0).getReg())
27198 .addReg(FirstCMOV.getOperand(0).getReg());
27200 // Now remove the CMOVs.
27201 FirstCMOV.eraseFromParent();
27202 SecondCascadedCMOV.eraseFromParent();
27207 MachineBasicBlock *
27208 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
27209 MachineBasicBlock *ThisMBB) const {
27210 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27211 DebugLoc DL = MI.getDebugLoc();
27213 // To "insert" a SELECT_CC instruction, we actually have to insert the
27214 // diamond control-flow pattern. The incoming instruction knows the
27215 // destination vreg to set, the condition code register to branch on, the
27216 // true/false values to select between and a branch opcode to use.
27221 // cmpTY ccX, r1, r2
27223 // fallthrough --> FalseMBB
27225 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
27226 // as described above, by inserting a BB, and then making a PHI at the join
27227 // point to select the true and false operands of the CMOV in the PHI.
27229 // The code also handles two different cases of multiple CMOV opcodes
27233 // In the first case, there are multiple CMOVs in a row, all of which are based on
27234 // the same condition setting (or the exact opposite condition setting).
27235 // In this case we can lower all the CMOVs using a single inserted BB, and
27236 // then make a number of PHIs at the join point to model the CMOVs. The only
27237 // trickiness here is that in a case like:
27239 // t2 = CMOV cond1 t1, f1
27240 // t3 = CMOV cond1 t2, f2
27242 // when rewriting this into PHIs, we have to perform some renaming on the
27243 // temps since you cannot have a PHI operand refer to a PHI result earlier
27244 // in the same block. The "simple" but wrong lowering would be:
27246 // t2 = PHI t1(BB1), f1(BB2)
27247 // t3 = PHI t2(BB1), f2(BB2)
27249 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
27250 // renaming is to note that on the path through BB1, t2 is really just a
27251 // copy of t1, and do that renaming, properly generating:
27253 // t2 = PHI t1(BB1), f1(BB2)
27254 // t3 = PHI t1(BB1), f2(BB2)
27257 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
27258 // function - EmitLoweredCascadedSelect.
27260 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
27261 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
27262 MachineInstr *LastCMOV = &MI;
27263 MachineBasicBlock::iterator NextMIIt =
27264 std::next(MachineBasicBlock::iterator(MI));
27266 // Check for case 1, where there are multiple CMOVs with the same condition
27267 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
27268 // number of jumps the most.
27270 if (isCMOVPseudo(MI)) {
27271 // See if we have a string of CMOVS with the same condition.
27272 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27273 (NextMIIt->getOperand(3).getImm() == CC ||
27274 NextMIIt->getOperand(3).getImm() == OppCC)) {
27275 LastCMOV = &*NextMIIt;
27280 // This checks for case 2, but we only do so if we didn't already find
27281 // case 1, as indicated by LastCMOV == &MI.
27282 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27283 NextMIIt->getOpcode() == MI.getOpcode() &&
27284 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27285 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27286 NextMIIt->getOperand(1).isKill()) {
27287 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27290 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27291 MachineFunction *F = ThisMBB->getParent();
27292 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27293 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27295 MachineFunction::iterator It = ++ThisMBB->getIterator();
27296 F->insert(It, FalseMBB);
27297 F->insert(It, SinkMBB);
27299 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27300 // live into the sink and copy blocks.
27301 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27302 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27303 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27304 FalseMBB->addLiveIn(X86::EFLAGS);
27305 SinkMBB->addLiveIn(X86::EFLAGS);
27308 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27309 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27310 std::next(MachineBasicBlock::iterator(LastCMOV)),
27312 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27314 // Fallthrough block for ThisMBB.
27315 ThisMBB->addSuccessor(FalseMBB);
27317 // The true block target of the first (or only) branch is always SinkMBB.
27317 ThisMBB->addSuccessor(SinkMBB);
27318 // Fallthrough block for FalseMBB.
27319 FalseMBB->addSuccessor(SinkMBB);
27321 // Create the conditional branch instruction.
27322 unsigned Opc = X86::GetCondBranchFromCond(CC);
27323 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27326 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27328 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27329 MachineBasicBlock::iterator MIItEnd =
27330 std::next(MachineBasicBlock::iterator(LastCMOV));
27331 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27333 // Now remove the CMOV(s).
27334 ThisMBB->erase(MIItBegin, MIItEnd);
27339 MachineBasicBlock *
27340 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27341 MachineBasicBlock *BB) const {
27342 // Combine the following atomic floating-point modification pattern:
27343 // a.store(reg OP a.load(acquire), release)
27344 // Transform it into:
27345 // OPss (%gpr), %xmm
27346 // movss %xmm, (%gpr)
27347 // Or sd equivalent for 64-bit operations.
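// As a rough IR-level sketch of that pattern (illustrative, not a test case):
//   %old = load atomic float, float* %p acquire, align 4
//   %new = fadd float %old, %val
//   store atomic float %new, float* %p release, align 4
// is what ultimately reaches here as a RELEASE_FADD32mr pseudo.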
27349 switch (MI.getOpcode()) {
27350 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
27351 case X86::RELEASE_FADD32mr:
27352 FOp = X86::ADDSSrm;
27353 MOp = X86::MOVSSmr;
27355 case X86::RELEASE_FADD64mr:
27356 FOp = X86::ADDSDrm;
27357 MOp = X86::MOVSDmr;
27360 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27361 DebugLoc DL = MI.getDebugLoc();
27362 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27363 unsigned ValOpIdx = X86::AddrNumOperands;
27364 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27365 MachineInstrBuilder MIB =
27366 BuildMI(*BB, MI, DL, TII->get(FOp),
27367 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27369 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27370 MachineOperand &Operand = MI.getOperand(i);
27371 // Clear any kill flags on register operands as we'll create a second
27372 // instruction using the same address operands.
27373 if (Operand.isReg())
27374 Operand.setIsKill(false);
27377 MachineInstr *FOpMI = MIB;
27378 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27379 for (int i = 0; i < X86::AddrNumOperands; ++i)
27380 MIB.add(MI.getOperand(i));
27381 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27382 MI.eraseFromParent(); // The pseudo instruction is gone now.
27386 MachineBasicBlock *
27387 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27388 MachineBasicBlock *BB) const {
27389 MachineFunction *MF = BB->getParent();
27390 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27391 DebugLoc DL = MI.getDebugLoc();
27392 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27394 assert(MF->shouldSplitStack());
27396 const bool Is64Bit = Subtarget.is64Bit();
27397 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27399 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27400 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
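// These offsets name the stack-limit slot that the segmented-stack ABI keeps
// in the thread control block (assumed here: %fs:0x70 for LP64, %gs:0x40 for
// the 64-bit ILP32 targets, %gs:0x30 for 32-bit), matching what libgcc's
// __morestack machinery expects.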
27403 // ... [Till the alloca]
27404 // If stacklet is not large enough, jump to mallocMBB
27407 // Allocate by subtracting from RSP
27408 // Jump to continueMBB
27411 // Allocate by call to runtime
27415 // [rest of original BB]
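// The stacklet check emitted below is roughly (illustrative AT&T syntax):
//   mov  %rsp, %tmp
//   sub  %size, %tmp          ; tmp = SP - size
//   cmp  %tmp, %fs:0x70       ; compare against the stack limit in TLS
//   jg   mallocMBB            ; limit > tmp, not enough room in this stacklet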
27418 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27419 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27420 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27422 MachineRegisterInfo &MRI = MF->getRegInfo();
27423 const TargetRegisterClass *AddrRegClass =
27424 getRegClassFor(getPointerTy(MF->getDataLayout()));
27426 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27427 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27428 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27429 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27430 sizeVReg = MI.getOperand(1).getReg(),
27432 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27434 MachineFunction::iterator MBBIter = ++BB->getIterator();
27436 MF->insert(MBBIter, bumpMBB);
27437 MF->insert(MBBIter, mallocMBB);
27438 MF->insert(MBBIter, continueMBB);
27440 continueMBB->splice(continueMBB->begin(), BB,
27441 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27442 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27444 // Add code to the main basic block to check if the stack limit has been hit,
27445 // and if so, jump to mallocMBB otherwise to bumpMBB.
27446 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27447 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27448 .addReg(tmpSPVReg).addReg(sizeVReg);
27449 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27450 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27451 .addReg(SPLimitVReg);
27452 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27454 // bumpMBB simply decreases the stack pointer, since we know the current
27455 // stacklet has enough space.
27456 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27457 .addReg(SPLimitVReg);
27458 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27459 .addReg(SPLimitVReg);
27460 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27462 // Calls into a routine in libgcc to allocate more space from the heap.
27463 const uint32_t *RegMask =
27464 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
27466 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
27468 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27469 .addExternalSymbol("__morestack_allocate_stack_space")
27470 .addRegMask(RegMask)
27471 .addReg(X86::RDI, RegState::Implicit)
27472 .addReg(X86::RAX, RegState::ImplicitDefine);
27473 } else if (Is64Bit) {
27474 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
27476 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27477 .addExternalSymbol("__morestack_allocate_stack_space")
27478 .addRegMask(RegMask)
27479 .addReg(X86::EDI, RegState::Implicit)
27480 .addReg(X86::EAX, RegState::ImplicitDefine);
27482 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27484 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27485 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27486 .addExternalSymbol("__morestack_allocate_stack_space")
27487 .addRegMask(RegMask)
27488 .addReg(X86::EAX, RegState::ImplicitDefine);
27492 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27495 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27496 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27497 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27499 // Set up the CFG correctly.
27500 BB->addSuccessor(bumpMBB);
27501 BB->addSuccessor(mallocMBB);
27502 mallocMBB->addSuccessor(continueMBB);
27503 bumpMBB->addSuccessor(continueMBB);
27505 // Take care of the PHI nodes.
27506 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27507 MI.getOperand(0).getReg())
27508 .addReg(mallocPtrVReg)
27510 .addReg(bumpSPPtrVReg)
27513 // Delete the original pseudo instruction.
27514 MI.eraseFromParent();
27517 return continueMBB;
27520 MachineBasicBlock *
27521 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27522 MachineBasicBlock *BB) const {
27523 MachineFunction *MF = BB->getParent();
27524 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27525 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27526 DebugLoc DL = MI.getDebugLoc();
27528 assert(!isAsynchronousEHPersonality(
27529 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27530 "SEH does not use catchret!");
27532 // Only 32-bit EH needs to worry about manually restoring stack pointers.
27533 if (!Subtarget.is32Bit())
27536 // C++ EH creates a new target block to hold the restore code, and wires up
27537 // the new block to the return destination with a normal JMP_4.
27538 MachineBasicBlock *RestoreMBB =
27539 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27540 assert(BB->succ_size() == 1);
27541 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27542 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27543 BB->addSuccessor(RestoreMBB);
27544 MI.getOperand(0).setMBB(RestoreMBB);
27546 auto RestoreMBBI = RestoreMBB->begin();
27547 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27548 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27552 MachineBasicBlock *
27553 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27554 MachineBasicBlock *BB) const {
27555 MachineFunction *MF = BB->getParent();
27556 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27557 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27558 // Only 32-bit SEH requires special handling for catchpad.
27559 if (IsSEH && Subtarget.is32Bit()) {
27560 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27561 DebugLoc DL = MI.getDebugLoc();
27562 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27564 MI.eraseFromParent();
27568 MachineBasicBlock *
27569 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27570 MachineBasicBlock *BB) const {
27571 // So, here we replace TLSADDR with the sequence:
27572 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27573 // We need this because TLSADDR is lowered into calls
27574 // inside MC; therefore, without the two markers, shrink-wrapping
27575 // may push the prologue/epilogue past them.
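// The emitted sequence therefore looks roughly like (illustrative):
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLS_addr32/64 ...         ; expanded to the actual call during MC lowering
//   ADJCALLSTACKUP 0, 0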
27576 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27577 DebugLoc DL = MI.getDebugLoc();
27578 MachineFunction &MF = *BB->getParent();
27580 // Emit CALLSEQ_START right before the instruction.
27581 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27582 MachineInstrBuilder CallseqStart =
27583 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27584 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27586 // Emit CALLSEQ_END right after the instruction.
27587 // We don't call erase from parent because we want to keep the
27588 // original instruction around.
27589 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27590 MachineInstrBuilder CallseqEnd =
27591 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27592 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27597 MachineBasicBlock *
27598 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27599 MachineBasicBlock *BB) const {
27600 // This is pretty easy. We're taking the value that we received from
27601 // our load from the relocation, sticking it in either RDI (x86-64)
27602 // or EAX and doing an indirect call. The return value will then
27603 // be in the normal return register.
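// For the 64-bit Darwin case this boils down to something like (illustrative):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the result left in %rax; the 32-bit paths do the same through %eax.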
27604 MachineFunction *F = BB->getParent();
27605 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27606 DebugLoc DL = MI.getDebugLoc();
27608 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27609 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27611 // Get a register mask for the lowered call.
27612 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27613 // proper register mask.
27614 const uint32_t *RegMask =
27615 Subtarget.is64Bit() ?
27616 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27617 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27618 if (Subtarget.is64Bit()) {
27619 MachineInstrBuilder MIB =
27620 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27624 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27625 MI.getOperand(3).getTargetFlags())
27627 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27628 addDirectMem(MIB, X86::RDI);
27629 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27630 } else if (!isPositionIndependent()) {
27631 MachineInstrBuilder MIB =
27632 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27636 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27637 MI.getOperand(3).getTargetFlags())
27639 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27640 addDirectMem(MIB, X86::EAX);
27641 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27643 MachineInstrBuilder MIB =
27644 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27645 .addReg(TII->getGlobalBaseReg(F))
27648 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27649 MI.getOperand(3).getTargetFlags())
27651 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27652 addDirectMem(MIB, X86::EAX);
27653 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27656 MI.eraseFromParent(); // The pseudo instruction is gone now.
27660 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27662 case X86::RETPOLINE_CALL32:
27663 return X86::CALLpcrel32;
27664 case X86::RETPOLINE_CALL64:
27665 return X86::CALL64pcrel32;
27666 case X86::RETPOLINE_TCRETURN32:
27667 return X86::TCRETURNdi;
27668 case X86::RETPOLINE_TCRETURN64:
27669 return X86::TCRETURNdi64;
27671 llvm_unreachable("not retpoline opcode");
27674 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27676 if (Subtarget.useRetpolineExternalThunk()) {
27677 // When using an external thunk for retpolines, we pick names that match the
27678 // names GCC happens to use as well. This helps simplify the implementation
27679 // of the thunks for kernels where they have no easy ability to create
27680 // aliases and are doing non-trivial configuration of the thunk's body. For
27681 // example, the Linux kernel will do boot-time hot patching of the thunk
27682 // bodies and cannot easily export aliases of these to loaded modules.
27684 // Note that at any point in the future, we may need to change the semantics
27685 // of how we implement retpolines and at that time will likely change the
27686 // name of the called thunk. Essentially, there is no hard guarantee that
27687 // LLVM will generate calls to specific thunks; we merely make a best-effort
27688 // attempt to help out kernels and other systems where duplicating the
27689 // thunks is costly.
27692 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27693 return "__x86_indirect_thunk_eax";
27695 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27696 return "__x86_indirect_thunk_ecx";
27698 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27699 return "__x86_indirect_thunk_edx";
27701 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27702 return "__x86_indirect_thunk_edi";
27704 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27705 return "__x86_indirect_thunk_r11";
27707 llvm_unreachable("unexpected reg for retpoline");
27710 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27713 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27714 return "__llvm_retpoline_eax";
27716 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27717 return "__llvm_retpoline_ecx";
27719 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27720 return "__llvm_retpoline_edx";
27722 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27723 return "__llvm_retpoline_edi";
27725 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27726 return "__llvm_retpoline_r11";
27728 llvm_unreachable("unexpected reg for retpoline");
27731 MachineBasicBlock *
27732 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27733 MachineBasicBlock *BB) const {
27734 // Copy the virtual register into the R11 physical register and
27735 // call the retpoline thunk.
27736 DebugLoc DL = MI.getDebugLoc();
27737 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27738 unsigned CalleeVReg = MI.getOperand(0).getReg();
27739 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27741 // Find an available scratch register to hold the callee. On 64-bit, we can
27742 // just use R11, but we scan for uses anyway to ensure we don't generate
27743 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27744 // already a register use operand to the call to hold the callee. If none
27745 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27746 // register and ESI is the base pointer to realigned stack frames with VLAs.
27747 SmallVector<unsigned, 3> AvailableRegs;
27748 if (Subtarget.is64Bit())
27749 AvailableRegs.push_back(X86::R11);
27751 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27753 // Zero out any registers that are already used.
27754 for (const auto &MO : MI.operands()) {
27755 if (MO.isReg() && MO.isUse())
27756 for (unsigned &Reg : AvailableRegs)
27757 if (Reg == MO.getReg())
27761 // Choose the first remaining non-zero available register.
27762 unsigned AvailableReg = 0;
27763 for (unsigned MaybeReg : AvailableRegs) {
27765 AvailableReg = MaybeReg;
27770 report_fatal_error("calling convention incompatible with retpoline, no "
27771 "available registers");
27773 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27775 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27776 .addReg(CalleeVReg);
27777 MI.getOperand(0).ChangeToES(Symbol);
27778 MI.setDesc(TII->get(Opc));
27779 MachineInstrBuilder(*BB->getParent(), &MI)
27780 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27784 /// SetJmp implies future control flow change upon calling the corresponding
27785 /// function.
27786 /// Instead of using the 'return' instruction, the long jump fixes the stack and
27787 /// performs an indirect branch. To do so it uses the registers that were stored
27788 /// in the jump buffer (when calling SetJmp).
27789 /// In case the shadow stack is enabled we need to fix it as well, because some
27790 /// return addresses will be skipped.
27791 /// The function will save the SSP for future fixing in the function
27792 /// emitLongJmpShadowStackFix.
27793 /// \sa emitLongJmpShadowStackFix
27794 /// \param [in] MI The temporary Machine Instruction for the builtin.
27795 /// \param [in] MBB The Machine Basic Block that will be modified.
27796 void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI,
27797 MachineBasicBlock *MBB) const {
27798 DebugLoc DL = MI.getDebugLoc();
27799 MachineFunction *MF = MBB->getParent();
27800 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27801 MachineRegisterInfo &MRI = MF->getRegInfo();
27802 MachineInstrBuilder MIB;
27804 // Memory Reference.
27805 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27806 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27808 // Initialize a register with zero.
27809 MVT PVT = getPointerTy(MF->getDataLayout());
27810 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27811 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
27812 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
27813 BuildMI(*MBB, MI, DL, TII->get(XorRROpc))
27815 .addReg(ZReg, RegState::Undef)
27816 .addReg(ZReg, RegState::Undef);
27818 // Read the current SSP Register value to the zeroed register.
27819 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
27820 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
27821 BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
27823 // Write the SSP register value to offset 3 in input memory buffer.
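// For reference, the pointer-sized slot layout that these SjLj helpers agree
// on is: slot 0 = frame pointer, slot 1 = resume label, slot 2 = stack
// pointer, slot 3 = shadow stack pointer (only used when CET is enabled).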
27824 unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27825 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc));
27826 const int64_t SSPOffset = 3 * PVT.getStoreSize();
27827 const unsigned MemOpndSlot = 1;
27828 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27829 if (i == X86::AddrDisp)
27830 MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset);
27832 MIB.add(MI.getOperand(MemOpndSlot + i));
27834 MIB.addReg(SSPCopyReg);
27835 MIB.setMemRefs(MMOBegin, MMOEnd);
27838 MachineBasicBlock *
27839 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27840 MachineBasicBlock *MBB) const {
27841 DebugLoc DL = MI.getDebugLoc();
27842 MachineFunction *MF = MBB->getParent();
27843 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27844 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27845 MachineRegisterInfo &MRI = MF->getRegInfo();
27847 const BasicBlock *BB = MBB->getBasicBlock();
27848 MachineFunction::iterator I = ++MBB->getIterator();
27850 // Memory Reference
27851 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27852 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27855 unsigned MemOpndSlot = 0;
27857 unsigned CurOp = 0;
27859 DstReg = MI.getOperand(CurOp++).getReg();
27860 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27861 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27863 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27864 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27866 MemOpndSlot = CurOp;
27868 MVT PVT = getPointerTy(MF->getDataLayout());
27869 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27870 "Invalid Pointer Size!");
27872 // For v = setjmp(buf), we generate
27875 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27876 // SjLjSetup restoreMBB
27882 // v = phi(main, restore)
27885 // if base pointer being used, load it from frame
27888 MachineBasicBlock *thisMBB = MBB;
27889 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27890 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27891 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27892 MF->insert(I, mainMBB);
27893 MF->insert(I, sinkMBB);
27894 MF->push_back(restoreMBB);
27895 restoreMBB->setHasAddressTaken();
27897 MachineInstrBuilder MIB;
27899 // Transfer the remainder of BB and its successor edges to sinkMBB.
27900 sinkMBB->splice(sinkMBB->begin(), MBB,
27901 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27902 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27905 unsigned PtrStoreOpc = 0;
27906 unsigned LabelReg = 0;
27907 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27908 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27909 !isPositionIndependent();
27911 // Prepare IP either in reg or imm.
27912 if (!UseImmLabel) {
27913 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27914 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27915 LabelReg = MRI.createVirtualRegister(PtrRC);
27916 if (Subtarget.is64Bit()) {
27917 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27921 .addMBB(restoreMBB)
27924 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27925 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27926 .addReg(XII->getGlobalBaseReg(MF))
27929 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27933 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27935 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27936 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27937 if (i == X86::AddrDisp)
27938 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27940 MIB.add(MI.getOperand(MemOpndSlot + i));
27943 MIB.addReg(LabelReg);
27945 MIB.addMBB(restoreMBB);
27946 MIB.setMemRefs(MMOBegin, MMOEnd);
27948 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
27949 emitSetJmpShadowStackFix(MI, thisMBB);
27953 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27954 .addMBB(restoreMBB);
27956 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27957 MIB.addRegMask(RegInfo->getNoPreservedMask());
27958 thisMBB->addSuccessor(mainMBB);
27959 thisMBB->addSuccessor(restoreMBB);
27963 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27964 mainMBB->addSuccessor(sinkMBB);
27967 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27968 TII->get(X86::PHI), DstReg)
27969 .addReg(mainDstReg).addMBB(mainMBB)
27970 .addReg(restoreDstReg).addMBB(restoreMBB);
27973 if (RegInfo->hasBasePointer(*MF)) {
27974 const bool Uses64BitFramePtr =
27975 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27976 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27977 X86FI->setRestoreBasePointer(MF);
27978 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27979 unsigned BasePtr = RegInfo->getBaseRegister();
27980 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27981 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27982 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27983 .setMIFlag(MachineInstr::FrameSetup);
27985 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27986 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27987 restoreMBB->addSuccessor(sinkMBB);
27989 MI.eraseFromParent();
27993 /// Fix the shadow stack using the previously saved SSP pointer.
27994 /// \sa emitSetJmpShadowStackFix
27995 /// \param [in] MI The temporary Machine Instruction for the builtin.
27996 /// \param [in] MBB The Machine Basic Block that will be modified.
27997 /// \return The sink MBB that will perform the future indirect branch.
27998 MachineBasicBlock *
27999 X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
28000 MachineBasicBlock *MBB) const {
28001 DebugLoc DL = MI.getDebugLoc();
28002 MachineFunction *MF = MBB->getParent();
28003 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28004 MachineRegisterInfo &MRI = MF->getRegInfo();
28006 // Memory Reference
28007 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28008 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28010 MVT PVT = getPointerTy(MF->getDataLayout());
28011 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
28014 // xor vreg1, vreg1
28016 // test vreg1, vreg1
28017 // je sinkMBB # Jump if Shadow Stack is not supported
28019 // mov buf+24/12(%rip), vreg2
28020 // sub vreg1, vreg2
28021 // jbe sinkMBB # No need to fix the Shadow Stack
28024 // incssp vreg2 # fix the SSP according to the lower 8 bits
28027 // fixShadowLoopPrepareMBB:
28030 // fixShadowLoopMBB:
28033 // jne fixShadowLoopMBB # Iterate until you finish fixing
28034 // # the Shadow Stack
28037 MachineFunction::iterator I = ++MBB->getIterator();
28038 const BasicBlock *BB = MBB->getBasicBlock();
28040 MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB);
28041 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
28042 MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB);
28043 MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB);
28044 MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB);
28045 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
28046 MF->insert(I, checkSspMBB);
28047 MF->insert(I, fallMBB);
28048 MF->insert(I, fixShadowMBB);
28049 MF->insert(I, fixShadowLoopPrepareMBB);
28050 MF->insert(I, fixShadowLoopMBB);
28051 MF->insert(I, sinkMBB);
28053 // Transfer the remainder of BB and its successor edges to sinkMBB.
28054 sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI),
28056 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
28058 MBB->addSuccessor(checkSspMBB);
28060 // Initialize a register with zero.
28061 unsigned ZReg = MRI.createVirtualRegister(PtrRC);
28062 unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
28063 BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
28065 .addReg(ZReg, RegState::Undef)
28066 .addReg(ZReg, RegState::Undef);
28068 // Read the current SSP Register value to the zeroed register.
28069 unsigned SSPCopyReg = MRI.createVirtualRegister(PtrRC);
28070 unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD;
28071 BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg);
28073 // Check whether the result of the SSP register is zero and jump directly
28074 // to the sink in that case (the shadow stack is not supported).
28075 unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr;
28076 BuildMI(checkSspMBB, DL, TII->get(TestRROpc))
28077 .addReg(SSPCopyReg)
28078 .addReg(SSPCopyReg);
28079 BuildMI(checkSspMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28080 checkSspMBB->addSuccessor(sinkMBB);
28081 checkSspMBB->addSuccessor(fallMBB);
28083 // Reload the previously saved SSP register value.
28084 unsigned PrevSSPReg = MRI.createVirtualRegister(PtrRC);
28085 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28086 const int64_t SPPOffset = 3 * PVT.getStoreSize();
28087 MachineInstrBuilder MIB =
28088 BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg);
28089 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28090 if (i == X86::AddrDisp)
28091 MIB.addDisp(MI.getOperand(i), SPPOffset);
28093 MIB.add(MI.getOperand(i));
28095 MIB.setMemRefs(MMOBegin, MMOEnd);
28097 // Subtract the current SSP from the previous SSP.
28098 unsigned SspSubReg = MRI.createVirtualRegister(PtrRC);
28099 unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr;
28100 BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg)
28101 .addReg(PrevSSPReg)
28102 .addReg(SSPCopyReg);
28104 // Jump to sink in case PrevSSPReg <= SSPCopyReg.
28105 BuildMI(fallMBB, DL, TII->get(X86::JBE_1)).addMBB(sinkMBB);
28106 fallMBB->addSuccessor(sinkMBB);
28107 fallMBB->addSuccessor(fixShadowMBB);
28109 // Shift right by 2/3 for 32/64 because incssp multiplies the argument by 4/8.
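// Worked example (x86-64, illustrative): if the saved SSP is 0x2400 bytes
// above the current one, the delta is 0x2400 >> 3 = 0x480 entries. The first
// incssp below consumes the low 8 bits (0x80 = 128 entries); the remaining
// 0x480 >> 8 = 4 chunks of 256 entries are handled by the fixup loop, which
// runs 4 * 2 = 8 iterations of "incssp 128".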
28110 unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri;
28111 unsigned Offset = (PVT == MVT::i64) ? 3 : 2;
28112 unsigned SspFirstShrReg = MRI.createVirtualRegister(PtrRC);
28113 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg)
28117 // Increase the SSP; incssp only consumes the lower 8 bits of the delta.
28118 unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD;
28119 BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg);
28121 // Reset the lower 8 bits.
28122 unsigned SspSecondShrReg = MRI.createVirtualRegister(PtrRC);
28123 BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg)
28124 .addReg(SspFirstShrReg)
28127 // Jump if the result of the shift is zero.
28128 BuildMI(fixShadowMBB, DL, TII->get(X86::JE_1)).addMBB(sinkMBB);
28129 fixShadowMBB->addSuccessor(sinkMBB);
28130 fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB);
28132 // Do a single shift left.
28133 unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1;
28134 unsigned SspAfterShlReg = MRI.createVirtualRegister(PtrRC);
28135 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg)
28136 .addReg(SspSecondShrReg);
28138 // Save the value 128 to a register (will be used next with incssp).
28139 unsigned Value128InReg = MRI.createVirtualRegister(PtrRC);
28140 unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri;
28141 BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg)
28143 fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB);
28145 // Since incssp only looks at the lower 8 bits, we might need to do several
28146 // iterations of incssp until we finish fixing the shadow stack.
28147 unsigned DecReg = MRI.createVirtualRegister(PtrRC);
28148 unsigned CounterReg = MRI.createVirtualRegister(PtrRC);
28149 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg)
28150 .addReg(SspAfterShlReg)
28151 .addMBB(fixShadowLoopPrepareMBB)
28153 .addMBB(fixShadowLoopMBB);
28155 // Every iteration we increase the SSP by 128.
28156 BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg);
28158 // Every iteration we decrement the counter by 1.
28159 unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r;
28160 BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg);
28162 // Jump if the counter is not zero yet.
28163 BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JNE_1)).addMBB(fixShadowLoopMBB);
28164 fixShadowLoopMBB->addSuccessor(sinkMBB);
28165 fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB);
28170 MachineBasicBlock *
28171 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
28172 MachineBasicBlock *MBB) const {
28173 DebugLoc DL = MI.getDebugLoc();
28174 MachineFunction *MF = MBB->getParent();
28175 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28176 MachineRegisterInfo &MRI = MF->getRegInfo();
28178 // Memory Reference
28179 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
28180 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
28182 MVT PVT = getPointerTy(MF->getDataLayout());
28183 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
28184 "Invalid Pointer Size!");
28186 const TargetRegisterClass *RC =
28187 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28188 unsigned Tmp = MRI.createVirtualRegister(RC);
28189 // Since FP is only updated here but NOT referenced, it's treated as GPR.
28190 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
28191 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
28192 unsigned SP = RegInfo->getStackRegister();
28194 MachineInstrBuilder MIB;
28196 const int64_t LabelOffset = 1 * PVT.getStoreSize();
28197 const int64_t SPOffset = 2 * PVT.getStoreSize();
28199 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
28200 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
28202 MachineBasicBlock *thisMBB = MBB;
28204 // When CET and the Shadow Stack are enabled, we need to fix the Shadow Stack.
28205 if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) {
28206 thisMBB = emitLongJmpShadowStackFix(MI, thisMBB);
28210 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP);
28211 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
28212 MIB.add(MI.getOperand(i));
28213 MIB.setMemRefs(MMOBegin, MMOEnd);
28216 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
28217 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28218 if (i == X86::AddrDisp)
28219 MIB.addDisp(MI.getOperand(i), LabelOffset);
28221 MIB.add(MI.getOperand(i));
28223 MIB.setMemRefs(MMOBegin, MMOEnd);
28226 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP);
28227 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
28228 if (i == X86::AddrDisp)
28229 MIB.addDisp(MI.getOperand(i), SPOffset);
28231 MIB.add(MI.getOperand(i));
28233 MIB.setMemRefs(MMOBegin, MMOEnd);
28236 BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
28238 MI.eraseFromParent();
28242 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
28243 MachineBasicBlock *MBB,
28244 MachineBasicBlock *DispatchBB,
28246 DebugLoc DL = MI.getDebugLoc();
28247 MachineFunction *MF = MBB->getParent();
28248 MachineRegisterInfo *MRI = &MF->getRegInfo();
28249 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28251 MVT PVT = getPointerTy(MF->getDataLayout());
28252 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
28257 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
28258 !isPositionIndependent();
28261 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
28263 const TargetRegisterClass *TRC =
28264 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
28265 VR = MRI->createVirtualRegister(TRC);
28266 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
28268 if (Subtarget.is64Bit())
28269 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
28273 .addMBB(DispatchBB)
28276 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
28277 .addReg(0) /* TII->getGlobalBaseReg(MF) */
28280 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
28284 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
28285 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
28287 MIB.addMBB(DispatchBB);
28292 MachineBasicBlock *
28293 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
28294 MachineBasicBlock *BB) const {
28295 DebugLoc DL = MI.getDebugLoc();
28296 MachineFunction *MF = BB->getParent();
28297 MachineFrameInfo &MFI = MF->getFrameInfo();
28298 MachineRegisterInfo *MRI = &MF->getRegInfo();
28299 const X86InstrInfo *TII = Subtarget.getInstrInfo();
28300 int FI = MFI.getFunctionContextIndex();
28302 // Get a mapping of the call site numbers to all of the landing pads they're
28303 // associated with.
28304 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
28305 unsigned MaxCSNum = 0;
28306 for (auto &MBB : *MF) {
28307 if (!MBB.isEHPad())
28310 MCSymbol *Sym = nullptr;
28311 for (const auto &MI : MBB) {
28312 if (MI.isDebugInstr())
28315 assert(MI.isEHLabel() && "expected EH_LABEL");
28316 Sym = MI.getOperand(0).getMCSymbol();
28320 if (!MF->hasCallSiteLandingPad(Sym))
28323 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
28324 CallSiteNumToLPad[CSI].push_back(&MBB);
28325 MaxCSNum = std::max(MaxCSNum, CSI);
28329 // Get an ordered list of the machine basic blocks for the jump table.
28330 std::vector<MachineBasicBlock *> LPadList;
28331 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
28332 LPadList.reserve(CallSiteNumToLPad.size());
28334 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
28335 for (auto &LP : CallSiteNumToLPad[CSI]) {
28336 LPadList.push_back(LP);
28337 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
28341 assert(!LPadList.empty() &&
28342 "No landing pad destinations for the dispatch jump table!");
28344 // Create the MBBs for the dispatch code.
28346 // Shove the dispatch's address into the return slot in the function context.
28347 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
28348 DispatchBB->setIsEHPad(true);
28350 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
28351 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
28352 DispatchBB->addSuccessor(TrapBB);
28354 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
28355 DispatchBB->addSuccessor(DispContBB);
28358 MF->push_back(DispatchBB);
28359 MF->push_back(DispContBB);
28360 MF->push_back(TrapBB);
28362 // Insert code into the entry block that creates and registers the function
28363 // context.
28364 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
28366 // Create the jump table and associated information
28367 unsigned JTE = getJumpTableEncoding();
28368 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
28369 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
28371 const X86RegisterInfo &RI = TII->getRegisterInfo();
28372 // Add a register mask with no preserved registers. This results in all
28373 // registers being marked as clobbered.
28374 if (RI.hasBasePointer(*MF)) {
28375 const bool FPIs64Bit =
28376 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
28377 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
28378 MFI->setRestoreBasePointer(MF);
28380 unsigned FP = RI.getFrameRegister(*MF);
28381 unsigned BP = RI.getBaseRegister();
28382 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
28383 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
28384 MFI->getRestoreBasePointerOffset())
28385 .addRegMask(RI.getNoPreservedMask());
28387 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
28388 .addRegMask(RI.getNoPreservedMask());
28391 // IReg is used as an index in a memory operand and therefore can't be SP
28392 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
28393 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
28394 Subtarget.is64Bit() ? 8 : 4);
28395 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
28397 .addImm(LPadList.size());
28398 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
28400 if (Subtarget.is64Bit()) {
28401 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28402 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
28404 // leaq .LJTI0_0(%rip), BReg
28405 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
28409 .addJumpTableIndex(MJTI)
28411 // movzx IReg64, IReg
28412 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
28415 .addImm(X86::sub_32bit);
28418 case MachineJumpTableInfo::EK_BlockAddress:
28419 // jmpq *(BReg,IReg64,8)
28420 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
28427 case MachineJumpTableInfo::EK_LabelDifference32: {
28428 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
28429 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
28430 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
28432 // movl (BReg,IReg64,4), OReg
28433 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
28439 // movsx OReg64, OReg
28440 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
28441 // addq BReg, OReg64, TReg
28442 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
28446 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
28450 llvm_unreachable("Unexpected jump table encoding");
28453 // jmpl *.LJTI0_0(,IReg,4)
28454 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
28458 .addJumpTableIndex(MJTI)
28462 // Add the jump table entries as successors to the MBB.
28463 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
28464 for (auto &LP : LPadList)
28465 if (SeenMBBs.insert(LP).second)
28466 DispContBB->addSuccessor(LP);
28468 // N.B. the order the invoke BBs are processed in doesn't matter here.
28469 SmallVector<MachineBasicBlock *, 64> MBBLPads;
28470 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
28471 for (MachineBasicBlock *MBB : InvokeBBs) {
28472 // Remove the landing pad successor from the invoke block and replace it
28473 // with the new dispatch block.
28474 // Keep a copy of Successors since it's modified inside the loop.
28475 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
28477 // FIXME: Avoid quadratic complexity.
28478 for (auto MBBS : Successors) {
28479 if (MBBS->isEHPad()) {
28480 MBB->removeSuccessor(MBBS);
28481 MBBLPads.push_back(MBBS);
28485 MBB->addSuccessor(DispatchBB);
28487 // Find the invoke call and mark all of the callee-saved registers as
28488 // 'implicit defined' so that they're spilled. This prevents code from
28489 // moving instructions to before the EH block, where they will never be
28490 // executed.
28491 for (auto &II : reverse(*MBB)) {
28495 DenseMap<unsigned, bool> DefRegs;
28496 for (auto &MOp : II.operands())
28498 DefRegs[MOp.getReg()] = true;
28500 MachineInstrBuilder MIB(*MF, &II);
28501 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
28502 unsigned Reg = SavedRegs[RI];
28504 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
28511 // Mark all former landing pads as non-landing pads. The dispatch is the only
28512 // landing pad now.
28513 for (auto &LP : MBBLPads)
28514 LP->setIsEHPad(false);
28516 // The instruction is gone now.
28517 MI.eraseFromParent();
28521 MachineBasicBlock *
28522 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28523 MachineBasicBlock *BB) const {
28524 MachineFunction *MF = BB->getParent();
28525 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28526 DebugLoc DL = MI.getDebugLoc();
28528 switch (MI.getOpcode()) {
28529 default: llvm_unreachable("Unexpected instr type to insert");
28530 case X86::TLS_addr32:
28531 case X86::TLS_addr64:
28532 case X86::TLS_base_addr32:
28533 case X86::TLS_base_addr64:
28534 return EmitLoweredTLSAddr(MI, BB);
28535 case X86::RETPOLINE_CALL32:
28536 case X86::RETPOLINE_CALL64:
28537 case X86::RETPOLINE_TCRETURN32:
28538 case X86::RETPOLINE_TCRETURN64:
28539 return EmitLoweredRetpoline(MI, BB);
28540 case X86::CATCHRET:
28541 return EmitLoweredCatchRet(MI, BB);
28542 case X86::CATCHPAD:
28543 return EmitLoweredCatchPad(MI, BB);
28544 case X86::SEG_ALLOCA_32:
28545 case X86::SEG_ALLOCA_64:
28546 return EmitLoweredSegAlloca(MI, BB);
28547 case X86::TLSCall_32:
28548 case X86::TLSCall_64:
28549 return EmitLoweredTLSCall(MI, BB);
28550 case X86::CMOV_FR32:
28551 case X86::CMOV_FR64:
28552 case X86::CMOV_FR128:
28553 case X86::CMOV_GR8:
28554 case X86::CMOV_GR16:
28555 case X86::CMOV_GR32:
28556 case X86::CMOV_RFP32:
28557 case X86::CMOV_RFP64:
28558 case X86::CMOV_RFP80:
28559 case X86::CMOV_V2F64:
28560 case X86::CMOV_V2I64:
28561 case X86::CMOV_V4F32:
28562 case X86::CMOV_V4F64:
28563 case X86::CMOV_V4I64:
28564 case X86::CMOV_V16F32:
28565 case X86::CMOV_V8F32:
28566 case X86::CMOV_V8F64:
28567 case X86::CMOV_V8I64:
28568 case X86::CMOV_V8I1:
28569 case X86::CMOV_V16I1:
28570 case X86::CMOV_V32I1:
28571 case X86::CMOV_V64I1:
28572 return EmitLoweredSelect(MI, BB);
28574 case X86::RDFLAGS32:
28575 case X86::RDFLAGS64: {
28577 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28578 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28579 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28580 // Permit reads of the EFLAGS and DF registers without them being defined.
28581 // This intrinsic exists to read external processor state in flags, such as
28582 // the trap flag, interrupt flag, and direction flag, none of which are
28583 // modeled by the backend.
28584 assert(Push->getOperand(2).getReg() == X86::EFLAGS &&
28585 "Unexpected register in operand!");
28586 Push->getOperand(2).setIsUndef();
28587 assert(Push->getOperand(3).getReg() == X86::DF &&
28588 "Unexpected register in operand!");
28589 Push->getOperand(3).setIsUndef();
28590 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28592 MI.eraseFromParent(); // The pseudo is gone now.
28596 case X86::WRFLAGS32:
28597 case X86::WRFLAGS64: {
28599 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28601 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28602 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28603 BuildMI(*BB, MI, DL, TII->get(PopF));
28605 MI.eraseFromParent(); // The pseudo is gone now.
28609 case X86::RELEASE_FADD32mr:
28610 case X86::RELEASE_FADD64mr:
28611 return EmitLoweredAtomicFP(MI, BB);
28613 case X86::FP32_TO_INT16_IN_MEM:
28614 case X86::FP32_TO_INT32_IN_MEM:
28615 case X86::FP32_TO_INT64_IN_MEM:
28616 case X86::FP64_TO_INT16_IN_MEM:
28617 case X86::FP64_TO_INT32_IN_MEM:
28618 case X86::FP64_TO_INT64_IN_MEM:
28619 case X86::FP80_TO_INT16_IN_MEM:
28620 case X86::FP80_TO_INT32_IN_MEM:
28621 case X86::FP80_TO_INT64_IN_MEM: {
28622 // Change the floating point control register to use "round towards zero"
28623 // mode when truncating to an integer value.
28624 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28625 addFrameReference(BuildMI(*BB, MI, DL,
28626 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28628 // Load the old value of the high byte of the control word...
28630 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28631 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28634 // Set the high part to be round to zero...
28635 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28638 // Reload the modified control word now...
28639 addFrameReference(BuildMI(*BB, MI, DL,
28640 TII->get(X86::FLDCW16m)), CWFrameIdx);
28642 // Restore the memory image of control word to original value
28643 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28646 // Get the X86 opcode to use.
28648 switch (MI.getOpcode()) {
28649 default: llvm_unreachable("illegal opcode!");
28650 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28651 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28652 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28653 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28654 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28655 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28656 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28657 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28658 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
28661 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28662 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28663 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28665 // Reload the original control word now.
28666 addFrameReference(BuildMI(*BB, MI, DL,
28667 TII->get(X86::FLDCW16m)), CWFrameIdx);
28669 MI.eraseFromParent(); // The pseudo instruction is gone now.
28672 // Thread synchronization.
28674 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28675 case X86::MONITORX:
28676 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28680 return emitClzero(&MI, BB, Subtarget);
28684 return emitWRPKRU(MI, BB, Subtarget);
28686 return emitRDPKRU(MI, BB, Subtarget);
28689 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28691 case X86::VASTART_SAVE_XMM_REGS:
28692 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28694 case X86::VAARG_64:
28695 return EmitVAARG64WithCustomInserter(MI, BB);
28697 case X86::EH_SjLj_SetJmp32:
28698 case X86::EH_SjLj_SetJmp64:
28699 return emitEHSjLjSetJmp(MI, BB);
28701 case X86::EH_SjLj_LongJmp32:
28702 case X86::EH_SjLj_LongJmp64:
28703 return emitEHSjLjLongJmp(MI, BB);
28705 case X86::Int_eh_sjlj_setup_dispatch:
28706 return EmitSjLjDispatchBlock(MI, BB);
28708 case TargetOpcode::STATEPOINT:
28709 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28710 // this point in the process. We diverge later.
28711 return emitPatchPoint(MI, BB);
28713 case TargetOpcode::STACKMAP:
28714 case TargetOpcode::PATCHPOINT:
28715 return emitPatchPoint(MI, BB);
28717 case TargetOpcode::PATCHABLE_EVENT_CALL:
28718 return emitXRayCustomEvent(MI, BB);
28720 case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
28721 return emitXRayTypedEvent(MI, BB);
28723 case X86::LCMPXCHG8B: {
28724 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28725 // In addition to the 4 E[ABCD] registers implied by the encoding, CMPXCHG8B
28726 // requires a memory operand. If the current architecture happens to be i686
28727 // and the current function needs a base pointer - which is ESI on i686 -
28728 // the register allocator would not be able to allocate registers for an
28729 // address of the form X(%reg, %reg, Y): there would never be enough
28730 // unreserved registers during regalloc (without the base pointer the only
28731 // option would be X(%edi, %esi, Y)).
28732 // We give the register allocator a hand by precomputing the address in a
28733 // new vreg using LEA.
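// Illustrative shape of the rewrite (register names made up):
//   leal disp(%esi,%ecx,4), %tmp      ; precompute the address once
//   lock cmpxchg8b (%tmp)             ; memory operand no longer needs an index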
28735 // If it is not i686 or there is no base pointer - nothing to do here.
28736 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28739 // Even though this code does not necessarily need the base pointer to
28740 // be ESI, we check for that. The reason: if this assert fails, some
28741 // changes have happened in the compiler's base pointer handling that most
28742 // probably have to be addressed here as well.
28743 assert(TRI->getBaseRegister() == X86::ESI &&
28744 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28745 "base pointer in mind");
28747 MachineRegisterInfo &MRI = MF->getRegInfo();
28748 MVT SPTy = getPointerTy(MF->getDataLayout());
28749 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28750 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28752 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28753 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28754 // does not use an index register.
28755 if (AM.IndexReg == X86::NoRegister)
28758 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28759 // four operand definitions that are E[ABCD] registers. We skip them and
28760 // then insert the LEA.
28761 MachineBasicBlock::iterator MBBI(MI);
28762 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28763 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28766 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
28768 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28772 case X86::LCMPXCHG16B:
28774 case X86::LCMPXCHG8B_SAVE_EBX:
28775 case X86::LCMPXCHG16B_SAVE_RBX: {
28777 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28778 if (!BB->isLiveIn(BasePtr))
28779 BB->addLiveIn(BasePtr);
28785 //===----------------------------------------------------------------------===//
28786 // X86 Optimization Hooks
28787 //===----------------------------------------------------------------------===//
28790 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28791 const APInt &Demanded,
28792 TargetLoweringOpt &TLO) const {
28793 // Only optimize ANDs here; the goal is to avoid shrinking a constant to a
28794 // value that can no longer be matched by movzx.
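// Worked example (illustrative): for (and X, 0x3FF) with only the low 10 bits
// demanded, the shrunk width 10 is rounded up to 16, so the constant becomes
// 0xFFFF. Bits 10-15 are not demanded, so this is safe, and the 'and' can now
// be selected as a movzwl rather than needing an andl with an immediate.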
28795 if (Op.getOpcode() != ISD::AND)
28798 EVT VT = Op.getValueType();
28804 unsigned Size = VT.getSizeInBits();
28806 // Make sure the RHS really is a constant.
28807 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28811 const APInt &Mask = C->getAPIntValue();
28813 // Clear all non-demanded bits initially.
28814 APInt ShrunkMask = Mask & Demanded;
28816 // Find the width of the shrunk mask.
28817 unsigned Width = ShrunkMask.getActiveBits();
28819 // If the mask is all 0s there's nothing to do here.
28823 // Find the next power of 2 width, rounding up to a byte.
28824 Width = PowerOf2Ceil(std::max(Width, 8U));
28825 // Truncate the width to size to handle illegal types.
28826 Width = std::min(Width, Size);
28828 // Calculate a possible zero extend mask for this constant.
28829 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28831 // If we aren't changing the mask, just return true to keep it and prevent
28832 // the caller from optimizing.
28833 if (ZeroExtendMask == Mask)
28836 // Make sure the new mask can be represented by a combination of mask bits
28837 // and non-demanded bits.
28838 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28841 // Replace the constant with the zero extend mask.
28843 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28844 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28845 return TLO.CombineTo(Op, NewOp);
void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                      KnownBits &Known,
                                                      const APInt &DemandedElts,
                                                      const SelectionDAG &DAG,
                                                      unsigned Depth) const {
  unsigned BitWidth = Known.getBitWidth();
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert((Opc >= ISD::BUILTIN_OP_END ||
          Opc == ISD::INTRINSIC_WO_CHAIN ||
          Opc == ISD::INTRINSIC_W_CHAIN ||
          Opc == ISD::INTRINSIC_VOID) &&
         "Should use MaskedValueIsZero if you don't know whether Op"
         " is a target node!");

  Known.resetAll();
  switch (Opc) {
  default: break;
  case X86ISD::SETCC:
    Known.Zero.setBitsFrom(1);
    break;
  case X86ISD::MOVMSK: {
    unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
    Known.Zero.setBitsFrom(NumLoBits);
    break;
  }
  case X86ISD::PEXTRB:
  case X86ISD::PEXTRW: {
    SDValue Src = Op.getOperand(0);
    EVT SrcVT = Src.getValueType();
    APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
                                            Op.getConstantOperandVal(1));
    DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
    Known = Known.zextOrTrunc(BitWidth);
    Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
    break;
  }
  case X86ISD::VSHLI:
  case X86ISD::VSRLI: {
    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
        Known.setAllZero();
        break;
      }

      DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
      unsigned ShAmt = ShiftImm->getZExtValue();
      if (Opc == X86ISD::VSHLI) {
        Known.Zero <<= ShAmt;
        Known.One <<= ShAmt;
        // Low bits are known zero.
        Known.Zero.setLowBits(ShAmt);
      } else {
        Known.Zero.lshrInPlace(ShAmt);
        Known.One.lshrInPlace(ShAmt);
        // High bits are known zero.
        Known.Zero.setHighBits(ShAmt);
      }
    }
    break;
  }
  case X86ISD::PACKUS: {
    // PACKUS is just a truncation if the upper half is zero.
    // TODO: Add DemandedElts support.
    KnownBits Known2;
    DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
    DAG.computeKnownBits(Op.getOperand(1), Known2, Depth + 1);
    Known.One &= Known2.One;
    Known.Zero &= Known2.Zero;
    if (Known.countMinLeadingZeros() < BitWidth)
      Known.resetAll();
    Known = Known.trunc(BitWidth);
    break;
  }
  case X86ISD::VZEXT: {
    // TODO: Add DemandedElts support.
    SDValue N0 = Op.getOperand(0);
    unsigned NumElts = VT.getVectorNumElements();

    EVT SrcVT = N0.getValueType();
    unsigned InNumElts = SrcVT.getVectorNumElements();
    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
    assert(InNumElts >= NumElts && "Illegal VZEXT input");

    Known = KnownBits(InBitWidth);
    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
    DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
    Known = Known.zext(BitWidth);
    Known.Zero.setBitsFrom(InBitWidth);
    break;
  }
  case X86ISD::CMOV: {
    DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
    // If we don't know any bits, early out.
    if (Known.isUnknown())
      break;
    KnownBits Known2;
    DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);

    // Only known if known in both the LHS and RHS.
    Known.One &= Known2.One;
    Known.Zero &= Known2.Zero;
    break;
  }
  case X86ISD::UDIVREM8_ZEXT_HREG:
    // TODO: Support more than just the zero extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is zero extended.
    Known.Zero.setBitsFrom(8);
    break;
  }

  // Handle target shuffles.
  // TODO - use resolveTargetShuffleInputs once we can limit recursive depth.
  if (isTargetShuffle(Opc)) {
    bool IsUnary;
    SmallVector<int, 64> Mask;
    SmallVector<SDValue, 2> Ops;
    if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask,
                             IsUnary)) {
      unsigned NumOps = Ops.size();
      unsigned NumElts = VT.getVectorNumElements();
      if (Mask.size() == NumElts) {
        SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0));
        Known.Zero.setAllBits(); Known.One.setAllBits();
        for (unsigned i = 0; i != NumElts; ++i) {
          if (!DemandedElts[i])
            continue;
          int M = Mask[i];
          if (M == SM_SentinelUndef) {
            // For UNDEF elements, we don't know anything about the common state
            // of the shuffle result.
            Known.resetAll();
            break;
          } else if (M == SM_SentinelZero) {
            Known.One.clearAllBits();
            continue;
          }
          assert(0 <= M && (unsigned)M < (NumOps * NumElts) &&
                 "Shuffle index out of range");

          unsigned OpIdx = (unsigned)M / NumElts;
          unsigned EltIdx = (unsigned)M % NumElts;
          if (Ops[OpIdx].getValueType() != VT) {
            // TODO - handle target shuffle ops with different value types.
            Known.resetAll();
            break;
          }
          DemandedOps[OpIdx].setBit(EltIdx);
        }
        // Known bits are the values that are shared by every demanded element.
        for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) {
          if (!DemandedOps[i])
            continue;
          KnownBits Known2;
          DAG.computeKnownBits(Ops[i], Known2, DemandedOps[i], Depth + 1);
          Known.One &= Known2.One;
          Known.Zero &= Known2.Zero;
        }
      }
    }
  }
}
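// Note that for the target shuffle case above the result's known bits are the
// intersection of the known bits of every demanded source element: an
// SM_SentinelZero element only clears Known.One (the element is known zero),
// while an SM_SentinelUndef element forces a conservative "nothing known"
// result.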
unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
    unsigned Depth) const {
  unsigned VTBits = Op.getScalarValueSizeInBits();
  unsigned Opcode = Op.getOpcode();
  switch (Opcode) {
  case X86ISD::SETCC_CARRY:
    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
    return VTBits;

  case X86ISD::VSEXT: {
    // TODO: Add DemandedElts support.
    SDValue Src = Op.getOperand(0);
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    Tmp += VTBits - Src.getScalarValueSizeInBits();
    return Tmp;
  }

  case X86ISD::VTRUNC: {
    // TODO: Add DemandedElts support.
    SDValue Src = Op.getOperand(0);
    unsigned NumSrcBits = Src.getScalarValueSizeInBits();
    assert(VTBits < NumSrcBits && "Illegal truncation input type");
    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
    if (Tmp > (NumSrcBits - VTBits))
      return Tmp - (NumSrcBits - VTBits);
    return 1;
  }

  case X86ISD::PACKSS: {
    // PACKSS is just a truncation if the sign bits extend to the packed size.
    // TODO: Add DemandedElts support.
    unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
    unsigned Tmp = std::min(Tmp0, Tmp1);
    if (Tmp > (SrcBits - VTBits))
      return Tmp - (SrcBits - VTBits);
    return 1;
  }

  case X86ISD::VSHLI: {
    SDValue Src = Op.getOperand(0);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    if (ShiftVal.uge(VTBits))
      return VTBits; // Shifted all bits out --> zero.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    if (ShiftVal.uge(Tmp))
      return 1; // Shifted all sign bits out --> unknown.
    return Tmp - ShiftVal.getZExtValue();
  }

  case X86ISD::VSRAI: {
    SDValue Src = Op.getOperand(0);
    APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
    if (ShiftVal.uge(VTBits - 1))
      return VTBits; // Sign splat.
    unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
    ShiftVal += Tmp;
    return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
  }

  case X86ISD::PCMPGT:
  case X86ISD::PCMPEQ:
  case X86ISD::CMPP:
  case X86ISD::VPCOM:
  case X86ISD::VPCOMU:
    // Vector compares return zero/all-bits result values.
    return VTBits;

  case X86ISD::CMOV: {
    unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
    if (Tmp0 == 1) return 1; // Early out.
    unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
    return std::min(Tmp0, Tmp1);
  }
  case X86ISD::SDIVREM8_SEXT_HREG:
    // TODO: Support more than just the sign extended bits?
    if (Op.getResNo() != 1)
      break;
    // The remainder is sign extended.
    return VTBits - 7;
  }

  // Fallback case.
  return 1;
}
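// For example, in the VSRAI case above an arithmetic shift right by 5 of a
// v4i32 source with 3 known sign bits yields min(3 + 5, 32) = 8 known sign
// bits, while a shift amount of 31 or more produces a full sign splat.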
SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
    return N->getOperand(0);
  return N;
}
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
                                       const GlobalValue* &GA,
                                       int64_t &Offset) const {
  if (N->getOpcode() == X86ISD::Wrapper) {
    if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
      GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
      Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
      return true;
    }
  }
  return TargetLowering::isGAPlusOffset(N, GA, Offset);
}
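// Note that only X86ISD::Wrapper is matched here; RIP-relative addresses
// (X86ISD::WrapperRIP) fall through to the generic TargetLowering
// implementation.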
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                    bool AllowFloatDomain, bool AllowIntDomain,
                                    SDValue &V1, const SDLoc &DL,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSize = MaskVT.getScalarSizeInBits();

  // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
  if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
    unsigned MaxScale = 64 / MaskEltSize;
    for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
      bool Match = true;
      unsigned NumDstElts = NumMaskElts / Scale;
      for (unsigned i = 0; i != NumDstElts && Match; ++i) {
        Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
        Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
      }
      if (Match) {
        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
        MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
                                            MVT::getIntegerVT(MaskEltSize);
        SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);

        if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
          Shuffle = unsigned(X86ISD::VZEXT);
        } else
          Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);

        DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
        DstVT = MVT::getVectorVT(DstVT, NumDstElts);
        return true;
      }
    }
  }

  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
      isUndefOrEqual(Mask[0], 0) &&
      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
    Shuffle = X86ISD::VZEXT_MOVL;
    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
    return true;
  }

  // Check if we have SSE3 which will let us use MOVDDUP etc. The
  // instructions are no slower than UNPCKLPD but have the option to
  // fold the input operand into even an unaligned memory load.
  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
    if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v2f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
  }

  if (MaskVT.is256BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v4f64;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v8f32;
      return true;
    }
  }

  if (MaskVT.is512BitVector() && AllowFloatDomain) {
    assert(Subtarget.hasAVX512() &&
           "AVX512 required for 512-bit vector shuffles");
    if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
      Shuffle = X86ISD::MOVDDUP;
      SrcVT = DstVT = MVT::v8f64;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
      Shuffle = X86ISD::MOVSLDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
    if (isTargetShuffleEquivalent(
            Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
      Shuffle = X86ISD::MOVSHDUP;
      SrcVT = DstVT = MVT::v16f32;
      return true;
    }
  }

  // Attempt to match against broadcast-from-vector.
  if (Subtarget.hasAVX2()) {
    SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
    if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
      SrcVT = DstVT = MaskVT;
      Shuffle = X86ISD::VBROADCAST;
      return true;
    }
  }

  return false;
}
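// For example, a v4i32 mask {0, SM_SentinelZero, SM_SentinelUndef,
// SM_SentinelUndef} is matched by the first VZEXT_MOVL check above: element 0
// is passed through and the remaining elements are zeroed or ignored.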
// Attempt to match a combined shuffle mask against supported unary immediate
// permute instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           bool AllowFloatDomain,
                                           bool AllowIntDomain,
                                           const X86Subtarget &Subtarget,
                                           unsigned &Shuffle, MVT &ShuffleVT,
                                           unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned InputSizeInBits = MaskVT.getSizeInBits();
  unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
  MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);

  bool ContainsZeros =
      llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });

  // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
  if (!ContainsZeros && MaskScalarSizeInBits == 64) {
    // Check for lane crossing permutes.
    if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
      // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
      if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
        Shuffle = X86ISD::VPERMI;
        ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
        PermuteImm = getV4X86ShuffleImm(Mask);
        return true;
      }
      if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
        SmallVector<int, 4> RepeatedMask;
        if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
          Shuffle = X86ISD::VPERMI;
          ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
          PermuteImm = getV4X86ShuffleImm(RepeatedMask);
          return true;
        }
      }
    } else if (AllowFloatDomain && Subtarget.hasAVX()) {
      // VPERMILPD can permute with a non-repeating shuffle.
      Shuffle = X86ISD::VPERMILPI;
      ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
      PermuteImm = 0;
      for (int i = 0, e = Mask.size(); i != e; ++i) {
        int M = Mask[i];
        if (M == SM_SentinelUndef)
          continue;
        assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
        PermuteImm |= (M & 1) << i;
      }
      return true;
    }
  }

  // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
  // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
  // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
  if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
      !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      // Narrow the repeated mask to create 32-bit element permutes.
      SmallVector<int, 4> WordMask = RepeatedMask;
      if (MaskScalarSizeInBits == 64)
        scaleShuffleMask<int>(2, RepeatedMask, WordMask);

      Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
      ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
      ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
      PermuteImm = getV4X86ShuffleImm(WordMask);
      return true;
    }
  }

  // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
  if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
    SmallVector<int, 4> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
      ArrayRef<int> LoMask(Mask.data() + 0, 4);
      ArrayRef<int> HiMask(Mask.data() + 4, 4);

      // PSHUFLW: permute lower 4 elements only.
      if (isUndefOrInRange(LoMask, 0, 4) &&
          isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
        Shuffle = X86ISD::PSHUFLW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(LoMask);
        return true;
      }

      // PSHUFHW: permute upper 4 elements only.
      if (isUndefOrInRange(HiMask, 4, 8) &&
          isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
        // Offset the HiMask so that we can create the shuffle immediate.
        int OffsetHiMask[4];
        for (int i = 0; i != 4; ++i)
          OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);

        Shuffle = X86ISD::PSHUFHW;
        ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
        PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
        return true;
      }
    }
  }

  // Attempt to match against byte/bit shifts.
  // FIXME: Add 512-bit support.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                             MaskScalarSizeInBits, Mask,
                                             0, Zeroable, Subtarget);
    if (0 < ShiftAmt) {
      PermuteImm = (unsigned)ShiftAmt;
      return true;
    }
  }

  return false;
}
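// For example, a repeated v8i16 mask {0, 1, 2, 3, 5, 4, 7, 6} keeps the low
// four words in place and swaps adjacent high words, so it is matched above as
// PSHUFHW with OffsetHiMask {1, 0, 3, 2} and PermuteImm 0xB1.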
// Attempt to match a combined unary shuffle mask against supported binary
// shuffle instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
                                     bool AllowFloatDomain, bool AllowIntDomain,
                                     SDValue &V1, SDValue &V2, const SDLoc &DL,
                                     SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
                                     bool IsUnary) {
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  if (MaskVT.is128BitVector()) {
    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
      V2 = V1;
      V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
      Shuffle = X86ISD::MOVLHPS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
      V2 = V1;
      Shuffle = X86ISD::MOVHLPS;
      SrcVT = DstVT = MVT::v4f32;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      std::swap(V1, V2);
      Shuffle = X86ISD::MOVSD;
      SrcVT = DstVT = MaskVT;
      return true;
    }
    if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
        (AllowFloatDomain || !Subtarget.hasSSE41())) {
      Shuffle = X86ISD::MOVSS;
      SrcVT = DstVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
  // TODO: add support for 256/512-bit types.
  if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
    if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
                                   Subtarget)) {
      DstVT = MaskVT;
      return true;
    }
  }

  // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
      (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
      (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
      (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
                                    DAG, Subtarget)) {
      SrcVT = DstVT = MaskVT;
      if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
        SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
      return true;
    }
  }

  return false;
}
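// For example, a 128-bit float-domain mask {0, 0} duplicates the low half and
// is matched above as MOVLHPS, {1, 1} becomes MOVHLPS, and {0, 3} with SSE2
// swaps the operands and selects MOVSD.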
static bool matchBinaryPermuteVectorShuffle(
    MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
    bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
    const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
    unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
  unsigned NumMaskElts = Mask.size();
  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();

  // Attempt to match against PALIGNR byte rotate.
  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
    int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
    if (0 < ByteRotation) {
      Shuffle = X86ISD::PALIGNR;
      ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
      PermuteImm = ByteRotation;
      return true;
    }
  }

  // Attempt to combine to X86ISD::BLENDI.
  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
    uint64_t BlendMask = 0;
    bool ForceV1Zero = false, ForceV2Zero = false;
    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
                                  BlendMask)) {
      if (MaskVT == MVT::v16i16) {
        // We can only use v16i16 PBLENDW if the lanes are repeated.
        SmallVector<int, 8> RepeatedMask;
        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
                                        RepeatedMask)) {
          assert(RepeatedMask.size() == 8 &&
                 "Repeated mask size doesn't match!");
          PermuteImm = 0;
          for (int i = 0; i < 8; ++i)
            if (RepeatedMask[i] >= 8)
              PermuteImm |= 1 << i;
          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
          Shuffle = X86ISD::BLENDI;
          ShuffleVT = MaskVT;
          return true;
        }
      } else {
        // Determine a type compatible with X86ISD::BLENDI.
        ShuffleVT = MaskVT;
        if (Subtarget.hasAVX2()) {
          if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v8i32;
          else if (ShuffleVT == MVT::v2i64)
            ShuffleVT = MVT::v4i32;
        } else {
          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
            ShuffleVT = MVT::v8i16;
          else if (ShuffleVT == MVT::v4i64)
            ShuffleVT = MVT::v4f64;
          else if (ShuffleVT == MVT::v8i32)
            ShuffleVT = MVT::v8f32;
        }

        if (!ShuffleVT.isFloatingPoint()) {
          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
          BlendMask =
              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
        }

        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
        PermuteImm = (unsigned)BlendMask;
        Shuffle = X86ISD::BLENDI;
        return true;
      }
    }
  }

  // Attempt to combine to INSERTPS.
  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
      MaskVT.is128BitVector()) {
    if (Zeroable.getBoolValue() &&
        matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
      Shuffle = X86ISD::INSERTPS;
      ShuffleVT = MVT::v4f32;
      return true;
    }
  }

  // Attempt to combine to SHUFPD.
  if (AllowFloatDomain && EltSizeInBits == 64 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
      Shuffle = X86ISD::SHUFP;
      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
      return true;
    }
  }

  // Attempt to combine to SHUFPS.
  if (AllowFloatDomain && EltSizeInBits == 32 &&
      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
    SmallVector<int, 4> RepeatedMask;
    if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
      // Match each half of the repeated mask, to determine if it's just
      // referencing one of the vectors, is zeroable or entirely undef.
      auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
        int M0 = RepeatedMask[Offset];
        int M1 = RepeatedMask[Offset + 1];

        if (isUndefInRange(RepeatedMask, Offset, 2)) {
          return DAG.getUNDEF(MaskVT);
        } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : 0);
          S1 = (SM_SentinelUndef == M1 ? -1 : 1);
          return getZeroVector(MaskVT, Subtarget, DAG, DL);
        } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V1;
        } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
          S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
          S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
          return V2;
        }

        return SDValue();
      };

      int ShufMask[4] = {-1, -1, -1, -1};
      SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
      SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);

      if (Lo && Hi) {
        V1 = Lo;
        V2 = Hi;
        Shuffle = X86ISD::SHUFP;
        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
        PermuteImm = getV4X86ShuffleImm(ShufMask);
        return true;
      }
    }
  }

  return false;
}
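// For example, a repeated 32-bit mask {1, 3, 5, 7} takes its low half from V1
// and its high half from V2, so the SHUFPS matching above produces
// ShufMask {1, 3, 1, 3} and PermuteImm 0xDD.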
/// Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
/// This is the leaf of the recursive combine below. When we have found some
/// chain of single-use x86 shuffle instructions and accumulated the combined
/// shuffle mask represented by them, this will try to pattern match that mask
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                                      ArrayRef<int> BaseMask, int Depth,
                                      bool HasVariableMask, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
  assert((Inputs.size() == 1 || Inputs.size() == 2) &&
         "Unexpected number of shuffle inputs!");

  // Find the inputs that enter the chain. Note that multiple uses are OK
  // here, we're not going to remove the operands we find.
  bool UnaryShuffle = (Inputs.size() == 1);
  SDValue V1 = peekThroughBitcasts(Inputs[0]);
  SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
                             : peekThroughBitcasts(Inputs[1]));

  MVT VT1 = V1.getSimpleValueType();
  MVT VT2 = V2.getSimpleValueType();
  MVT RootVT = Root.getSimpleValueType();
  assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
         VT2.getSizeInBits() == RootVT.getSizeInBits() &&
         "Vector size mismatch");

  SDLoc DL(Root);
  SDValue Res;

  unsigned NumBaseMaskElts = BaseMask.size();
  if (NumBaseMaskElts == 1) {
    assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
    return DAG.getBitcast(RootVT, V1);
  }

  unsigned RootSizeInBits = RootVT.getSizeInBits();
  unsigned NumRootElts = RootVT.getVectorNumElements();
  unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
  bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
                     (RootVT.isFloatingPoint() && Depth >= 2) ||
                     (RootVT.is256BitVector() && !Subtarget.hasAVX2());

  // Don't combine if we are an AVX512/EVEX target and the mask element size
  // is different from the root element size - this would prevent writemasks
  // from being reused.
  // TODO - this currently prevents all lane shuffles from occurring.
  // TODO - check for writemasks usage instead of always preventing combining.
  // TODO - attempt to narrow Mask back to writemask size.
  bool IsEVEXShuffle =
      RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);

  // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.

  // Handle 128-bit lane shuffles of 256-bit vectors.
  // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
  // we need to use the zeroing feature.
  // TODO - this should support binary shuffles.
  if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
      !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
      !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
    if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
      return SDValue(); // Nothing to do!
    MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
    unsigned PermMask = 0;
    PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
    PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);

    Res = DAG.getBitcast(ShuffleVT, V1);
    Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
                      DAG.getUNDEF(ShuffleVT),
                      DAG.getConstant(PermMask, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

  // For masks that have been widened to 128-bit elements or more,
  // narrow back down to 64-bit elements.
  SmallVector<int, 64> Mask;
  if (BaseMaskEltSizeInBits > 64) {
    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
    int MaskScale = BaseMaskEltSizeInBits / 64;
    scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
  } else {
    Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
  }

  unsigned NumMaskElts = Mask.size();
  unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;

  // Determine the effective mask value type.
  FloatDomain &= (32 <= MaskEltSizeInBits);
  MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
                           : MVT::getIntegerVT(MaskEltSizeInBits);
  MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);

  // Only allow legal mask types.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
    return SDValue();

  // Attempt to match the mask against known shuffle patterns.
  MVT ShuffleSrcVT, ShuffleVT;
  unsigned Shuffle, PermuteImm;

  // Which shuffle domains are permitted?
  // Permit domain crossing at higher combine depths.
  bool AllowFloatDomain = FloatDomain || (Depth > 3);
  bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
                        (!MaskVT.is256BitVector() || Subtarget.hasAVX2());

  // Determine zeroable mask elements.
  APInt Zeroable(NumMaskElts, 0);
  for (unsigned i = 0; i != NumMaskElts; ++i)
    if (isUndefOrZero(Mask[i]))
      Zeroable.setBit(i);

  if (UnaryShuffle) {
    // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
    // directly if we don't shuffle the lower element and we shuffle the upper
    // (zero) elements within themselves.
    if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
        (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
      unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
      ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
      if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
          isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
        return DAG.getBitcast(RootVT, V1);
      }
    }

    SDValue NewV1 = V1; // Save operand in case early exit happens.
    if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
                                NewV1, DL, DAG, Subtarget, Shuffle,
                                ShuffleSrcVT, ShuffleVT) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
      Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
      return DAG.getBitcast(RootVT, Res);
    }

    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
                                       AllowIntDomain, Subtarget, Shuffle,
                                       ShuffleVT, PermuteImm) &&
        (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
      if (Depth == 1 && Root.getOpcode() == Shuffle)
        return SDValue(); // Nothing to do!
      Res = DAG.getBitcast(ShuffleVT, V1);
      Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
                        DAG.getConstant(PermuteImm, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }
  }

  SDValue NewV1 = V1; // Save operands in case early exit happens.
  SDValue NewV2 = V2;
  if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
                               NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
                               ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return SDValue(); // Nothing to do!
    NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
    NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
    return DAG.getBitcast(RootVT, Res);
  }

  NewV1 = V1; // Save operands in case early exit happens.
  NewV2 = V2;
  if (matchBinaryPermuteVectorShuffle(
          MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
          NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
      (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
    if (Depth == 1 && Root.getOpcode() == Shuffle)
      return SDValue(); // Nothing to do!
    NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
    NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
    Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
                      DAG.getConstant(PermuteImm, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

  // Typically from here on, we need an integer version of MaskVT.
  MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
  IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);

  // Annoyingly, SSE4A instructions don't map into the above match helpers.
  if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
    uint64_t BitLen, BitIdx;
    if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
                                  Zeroable)) {
      if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
        return SDValue(); // Nothing to do!
      V1 = DAG.getBitcast(IntMaskVT, V1);
      Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
                        DAG.getConstant(BitLen, DL, MVT::i8),
                        DAG.getConstant(BitIdx, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }

    if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
      if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
        return SDValue(); // Nothing to do!
      V1 = DAG.getBitcast(IntMaskVT, V1);
      V2 = DAG.getBitcast(IntMaskVT, V2);
      Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
                        DAG.getConstant(BitLen, DL, MVT::i8),
                        DAG.getConstant(BitIdx, DL, MVT::i8));
      return DAG.getBitcast(RootVT, Res);
    }
  }

  // Don't try to re-form single instruction chains under any circumstances now
  // that we've done encoding canonicalization for them.
  if (Depth < 2)
    return SDValue();

  // Depth threshold above which we can efficiently use variable mask shuffles.
  int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
  bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;

  bool MaskContainsZeros =
      any_of(Mask, [](int M) { return M == SM_SentinelZero; });

  if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
    // If we have a single input lane-crossing shuffle then lower to VPERMV.
    if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX2() &&
          (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
      Res = DAG.getBitcast(MaskVT, V1);
      Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
      return DAG.getBitcast(RootVT, Res);
    }

    // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
    // vector as the second source.
    if (UnaryShuffle && AllowVariableMask &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      // Adjust shuffle mask - replace SM_SentinelZero with second source index.
      for (unsigned i = 0; i != NumMaskElts; ++i)
        if (Mask[i] == SM_SentinelZero)
          Mask[i] = NumMaskElts + i;

      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
      Res = DAG.getBitcast(MaskVT, V1);
      SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
      return DAG.getBitcast(RootVT, Res);
    }

    // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
    if (AllowVariableMask && !MaskContainsZeros &&
        ((Subtarget.hasAVX512() &&
          (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
           MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
         (Subtarget.hasVLX() &&
          (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
           MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
         (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
         (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
         (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
         (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
      SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
      V1 = DAG.getBitcast(MaskVT, V1);
      V2 = DAG.getBitcast(MaskVT, V2);
      Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
      return DAG.getBitcast(RootVT, Res);
    }
    return SDValue();
  }

  // See if we can combine a single input shuffle with zeros to a bit-mask,
  // which is much simpler than any shuffle.
  if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
      isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
      DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
    APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
    APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
    APInt UndefElts(NumMaskElts, 0);
    SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
    for (unsigned i = 0; i != NumMaskElts; ++i) {
      int M = Mask[i];
      if (M == SM_SentinelUndef) {
        UndefElts.setBit(i);
        continue;
      }
      if (M == SM_SentinelZero)
        continue;
      EltBits[i] = AllOnes;
    }
    SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
    Res = DAG.getBitcast(MaskVT, V1);
    unsigned AndOpcode =
        FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
    Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
    return DAG.getBitcast(RootVT, Res);
  }

  // If we have a single input shuffle with different shuffle patterns in the
  // 128-bit lanes, use the variable mask to VPERMILPS.
  // TODO Combine other mask types at higher depths.
  if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
      ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
       (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
    SmallVector<SDValue, 16> VPermIdx;
    for (int M : Mask) {
      SDValue Idx =
          M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
      VPermIdx.push_back(Idx);
    }
    SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
    Res = DAG.getBitcast(MaskVT, V1);
    Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
    return DAG.getBitcast(RootVT, Res);
  }

  // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
  // to VPERMIL2PD/VPERMIL2PS.
  if (AllowVariableMask && Subtarget.hasXOP() &&
      (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
       MaskVT == MVT::v8f32)) {
    // VPERMIL2 Operation.
    // Bits[3] - Match Bit.
    // Bits[2:1] - (Per Lane) PD Shuffle Mask.
    // Bits[2:0] - (Per Lane) PS Shuffle Mask.
    unsigned NumLanes = MaskVT.getSizeInBits() / 128;
    unsigned NumEltsPerLane = NumMaskElts / NumLanes;
    SmallVector<int, 8> VPerm2Idx;
    unsigned M2ZImm = 0;
    for (int M : Mask) {
      if (M == SM_SentinelUndef) {
        VPerm2Idx.push_back(-1);
        continue;
      }
      if (M == SM_SentinelZero) {
        M2ZImm = 2;
        VPerm2Idx.push_back(8);
        continue;
      }
      int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
      Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
      VPerm2Idx.push_back(Index);
    }
    V1 = DAG.getBitcast(MaskVT, V1);
    V2 = DAG.getBitcast(MaskVT, V2);
    SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
    Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
                      DAG.getConstant(M2ZImm, DL, MVT::i8));
    return DAG.getBitcast(RootVT, Res);
  }

  // If we have 3 or more shuffle instructions or a chain involving a variable
  // mask, we can replace them with a single PSHUFB instruction profitably.
  // Intel's manuals suggest only using PSHUFB if doing so replaces 5
  // instructions, but in practice PSHUFB tends to be *very* fast so we're
  // more aggressive.
  if (UnaryShuffle && AllowVariableMask &&
      ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
       (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
    SmallVector<SDValue, 16> PSHUFBMask;
    int NumBytes = RootVT.getSizeInBits() / 8;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      assert((M / 16) == (i / 16) && "Lane crossing detected");
      PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
    Res = DAG.getBitcast(ByteVT, V1);
    SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
    Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
    return DAG.getBitcast(RootVT, Res);
  }

  // With XOP, if we have a 128-bit binary input shuffle we can always combine
  // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
  // slower than PSHUFB on targets that support both.
  if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
    // VPPERM Mask Operation
    // Bits[4:0] - Byte Index (0 - 31)
    // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
    SmallVector<SDValue, 16> VPPERMMask;
    int NumBytes = 16;
    int Ratio = NumBytes / NumMaskElts;
    for (int i = 0; i < NumBytes; ++i) {
      int M = Mask[i / Ratio];
      if (M == SM_SentinelUndef) {
        VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
        continue;
      }
      if (M == SM_SentinelZero) {
        VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
        continue;
      }
      M = Ratio * M + i % Ratio;
      VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
    }
    MVT ByteVT = MVT::v16i8;
    V1 = DAG.getBitcast(ByteVT, V1);
    V2 = DAG.getBitcast(ByteVT, V2);
    SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
    Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
    return DAG.getBitcast(RootVT, Res);
  }

  // Failed to find any combines.
  return SDValue();
}
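// Note that the matchers above are tried roughly in order of increasing cost:
// fixed shuffles and immediate permutes first, then the variable-mask forms
// (VPERMV/VPERMV3, VPERMILPV, VPERMIL2, PSHUFB and VPPERM), which are only
// allowed once the combine depth or an existing variable mask justifies them.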
// Attempt to constant fold all of the constant source ops.
// Returns the folded constant if the entire shuffle folds to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
                                           ArrayRef<int> Mask, SDValue Root,
                                           bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
  MVT VT = Root.getSimpleValueType();

  unsigned SizeInBits = VT.getSizeInBits();
  unsigned NumMaskElts = Mask.size();
  unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
  unsigned NumOps = Ops.size();

  // Extract constant bits from each source op.
  bool OneUseConstantOp = false;
  SmallVector<APInt, 16> UndefEltsOps(NumOps);
  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
  for (unsigned i = 0; i != NumOps; ++i) {
    SDValue SrcOp = Ops[i];
    OneUseConstantOp |= SrcOp.hasOneUse();
    if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
                                       RawBitsOps[i]))
      return SDValue();
  }

  // Only fold if at least one of the constants is only used once or
  // the combined shuffle has included a variable mask shuffle, this
  // is to avoid constant pool bloat.
  if (!OneUseConstantOp && !HasVariableMask)
    return SDValue();

  // Shuffle the constant bits according to the mask.
  APInt UndefElts(NumMaskElts, 0);
  APInt ZeroElts(NumMaskElts, 0);
  APInt ConstantElts(NumMaskElts, 0);
  SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                        APInt::getNullValue(MaskSizeInBits));
  for (unsigned i = 0; i != NumMaskElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef) {
      UndefElts.setBit(i);
      continue;
    } else if (M == SM_SentinelZero) {
      ZeroElts.setBit(i);
      continue;
    }
    assert(0 <= M && M < (int)(NumMaskElts * NumOps));

    unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
    unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;

    auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
    if (SrcUndefElts[SrcMaskIdx]) {
      UndefElts.setBit(i);
      continue;
    }

    auto &SrcEltBits = RawBitsOps[SrcOpIdx];
    APInt &Bits = SrcEltBits[SrcMaskIdx];
    if (!Bits) {
      ZeroElts.setBit(i);
      continue;
    }

    ConstantElts.setBit(i);
    ConstantBitData[i] = Bits;
  }
  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());

  // Create the constant data.
  MVT MaskSVT;
  if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
    MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
  else
    MaskSVT = MVT::getIntegerVT(MaskSizeInBits);

  MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);

  SDLoc DL(Root);
  SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
  return DAG.getBitcast(VT, CstOp);
}
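// For example, if every source op is a build vector of constants, the shuffle
// is replaced outright by a new constant vector built from ConstantBitData,
// with undef lanes tracked in UndefElts and zero lanes left as zero bits.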
/// Fully generic combining of x86 shuffle instructions.
///
/// This should be the last combine run over the x86 shuffle instructions. Once
/// they have been fully optimized, this will recursively consider all chains
/// of single-use shuffle instructions, build a generic model of the cumulative
/// shuffle operation, and check for simpler instructions which implement this
/// operation. We use this primarily for two purposes:
///
/// 1) Collapse generic shuffles to specialized single instructions when
///    equivalent. In most cases, this is just an encoding size win, but
///    sometimes we will collapse multiple generic shuffles into a single
///    special-purpose shuffle.
/// 2) Look for sequences of shuffle instructions with 3 or more total
///    instructions, and replace them with the slightly more expensive SSSE3
///    PSHUFB instruction if available. We do this as the last combining step
///    to ensure we avoid using PSHUFB if we can implement the shuffle with
///    a suitable short sequence of other instructions. The PSHUFB will either
///    use a register or have to read from memory and so is slightly (but only
///    slightly) more expensive than the other shuffle instructions.
///
/// Because this is inherently a quadratic operation (for each shuffle in
/// a chain, we recurse up the chain), the depth is limited to 8 instructions.
/// This should never be an issue in practice as the shuffle lowering doesn't
/// produce sequences of more than 8 instructions.
///
/// FIXME: We will currently miss some cases where the redundant shuffling
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
static SDValue combineX86ShufflesRecursively(
    ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
    ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
    bool HasVariableMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) {
  // Bound the depth of our recursive combine because this is ultimately
  // quadratic in nature.
  const unsigned MaxRecursionDepth = 8;
  if (Depth > MaxRecursionDepth)
    return SDValue();

  // Directly rip through bitcasts to find the underlying operand.
  SDValue Op = SrcOps[SrcOpIndex];
  Op = peekThroughOneUseBitcasts(Op);

  MVT VT = Op.getSimpleValueType();
  if (!VT.isVector())
    return SDValue(); // Bail if we hit a non-vector.

  assert(Root.getSimpleValueType().isVector() &&
         "Shuffles operate on vector types!");
  assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
         "Can only combine shuffles of the same vector register size.");

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
    return SDValue();

  assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
  SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
  SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());

  // Add the inputs to the Ops list, avoiding duplicates.
  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());

  int InputIdx0 = -1, InputIdx1 = -1;
  for (int i = 0, e = Ops.size(); i < e; ++i) {
    SDValue BC = peekThroughBitcasts(Ops[i]);
    if (Input0 && BC == peekThroughBitcasts(Input0))
      InputIdx0 = i;
    if (Input1 && BC == peekThroughBitcasts(Input1))
      InputIdx1 = i;
  }

  if (Input0 && InputIdx0 < 0) {
    InputIdx0 = SrcOpIndex;
    Ops[SrcOpIndex] = Input0;
  }
  if (Input1 && InputIdx1 < 0) {
    InputIdx1 = Ops.size();
    Ops.push_back(Input1);
  }

  assert(((RootMask.size() > OpMask.size() &&
           RootMask.size() % OpMask.size() == 0) ||
          (OpMask.size() > RootMask.size() &&
           OpMask.size() % RootMask.size() == 0) ||
          OpMask.size() == RootMask.size()) &&
         "The smaller number of elements must divide the larger.");

  // This function can be performance-critical, so we rely on the power-of-2
  // knowledge that we have about the mask sizes to replace div/rem ops with
  // bit-masks and shifts.
  assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
  unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());

  unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
  unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
  unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
  assert((RootRatio == 1 || OpRatio == 1) &&
         "Must not have a ratio for both incoming and op masks!");

  assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
  assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
  unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
  unsigned OpRatioLog2 = countTrailingZeros(OpRatio);

  SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);

  // Merge this shuffle operation's mask into our accumulated mask. Note that
  // this shuffle's mask will be the first applied to the input, followed by the
  // root mask to get us all the way to the root value arrangement. The reason
  // for this order is that we are recursing up the operation chain.
  for (unsigned i = 0; i < MaskWidth; ++i) {
    unsigned RootIdx = i >> RootRatioLog2;
    if (RootMask[RootIdx] < 0) {
      // This is a zero or undef lane, we're done.
      Mask[i] = RootMask[RootIdx];
      continue;
    }

    unsigned RootMaskedIdx =
        RootRatio == 1
            ? RootMask[RootIdx]
            : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

    // Just insert the scaled root mask value if it references an input other
    // than the SrcOp we're currently inserting.
    if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
        (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
      Mask[i] = RootMaskedIdx;
      continue;
    }

    RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
    unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
    if (OpMask[OpIdx] < 0) {
      // The incoming lanes are zero or undef, it doesn't matter which ones we
      // are actually going to pick up (and mask out).
      Mask[i] = OpMask[OpIdx];
      continue;
    }

    // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
    unsigned OpMaskedIdx =
        OpRatio == 1
            ? OpMask[OpIdx]
            : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));

    OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
    if (OpMask[OpIdx] < (int)OpMask.size()) {
      assert(0 <= InputIdx0 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx0 * MaskWidth;
    } else {
      assert(0 <= InputIdx1 && "Unknown target shuffle input");
      OpMaskedIdx += InputIdx1 * MaskWidth;
    }

    Mask[i] = OpMaskedIdx;
  }

  // Handle the all undef/zero cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
    return DAG.getUNDEF(Root.getValueType());

  // TODO - should we handle the mixed zero/undef case as well? Just returning
  // a zero mask will lose information on undef elements possibly reducing
  // future combine possibilities.
  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
                         SDLoc(Root));

  // Remove unused shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);
  assert(!Ops.empty() && "Shuffle with no inputs detected");

  HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should only be combined if it either has a
  // single use (i.e. current Op) or all its users have already been combined.
  // Don't recurse if we already have more source ops than we can combine in
  // the remaining recursion depth.
  if (Ops.size() < (MaxRecursionDepth - Depth)) {
    for (int i = 0, e = Ops.size(); i < e; ++i)
      if (Ops[i].getNode()->hasOneUse() ||
          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
        if (SDValue Res = combineX86ShufflesRecursively(
                Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
                DAG, Subtarget))
          return Res;
  }

  // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
    return Cst;

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() > 2)
    return SDValue();

  // Minor canonicalization of the accumulated shuffle mask to make it easier
  // to match below. All this does is detect masks with sequential pairs of
  // elements, and shrink them to the half-width mask. It does this in a loop
  // so it will reduce the size of the mask to the minimal width mask which
  // performs an equivalent shuffle.
  SmallVector<int, 64> WidenedMask;
  while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
    Mask = std::move(WidenedMask);
  }

  // Canonicalization of binary shuffle masks to improve pattern matching by
  // commuting the inputs.
  if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
    ShuffleVectorSDNode::commuteMask(Mask);
    std::swap(Ops[0], Ops[1]);
  }

  // Finally, try to combine into a single shuffle instruction.
  return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
                                Subtarget);
}
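// For example, composing a 2-element root mask {1, 0} (swap 64-bit halves)
// with a 4-element op mask {2, 3, 0, 1} widens the root mask by RootRatio = 2
// and yields the identity mask {0, 1, 2, 3}: the two half-swaps cancel out.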
/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool IsUnary;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}
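// Note that the PSHUFHW result above is normalized into the 0-3 range so
// callers can treat all three PSHUF variants as 4-element dword-style masks.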
/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          LLVM_FALLTHROUGH;
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}
30535 /// Search for a combinable shuffle across a chain ending in pshuflw or
30538 /// We walk up the chain, skipping shuffles of the other half and looking
30539 /// through shuffles which switch halves trying to find a shuffle of the same
30540 /// pair of dwords.
30541 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
30543 TargetLowering::DAGCombinerInfo &DCI) {
30545 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
30546 "Called with something other than an x86 128-bit half shuffle!");
30548 unsigned CombineOpcode = N.getOpcode();
30550 // Walk up a single-use chain looking for a combinable shuffle.
30551 SDValue V = N.getOperand(0);
30552 for (; V.hasOneUse(); V = V.getOperand(0)) {
30553 switch (V.getOpcode()) {
30555 return false; // Nothing combined!
30558 // Skip bitcasts as we always know the type for the target specific
30562 case X86ISD::PSHUFLW:
30563 case X86ISD::PSHUFHW:
30564 if (V.getOpcode() == CombineOpcode)
30567 // Other-half shuffles are no-ops.
30570 // Break out of the loop if we break out of the switch.
30574 if (!V.hasOneUse())
30575 // We fell out of the loop without finding a viable combining instruction.
30578 // Combine away the bottom node as its shuffle will be accumulated into
30579 // a preceding shuffle.
30580 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30582 // Record the old value.
30585 // Merge this node's mask and our incoming mask (adjusted to account for all
30586 // the pshufd instructions encountered).
30587 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30588 for (int &M : Mask)
30590 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
30591 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30593 // Check that the shuffles didn't cancel each other out. If not, we need to
30594 // combine to the new one.
30596 // Replace the combinable shuffle with the combined one, updating all users
30597 // so that we re-evaluate the chain here.
30598 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
30603 /// Try to combine x86 target specific shuffles.
30604 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
30605 TargetLowering::DAGCombinerInfo &DCI,
30606 const X86Subtarget &Subtarget) {
30608 MVT VT = N.getSimpleValueType();
30609 SmallVector<int, 4> Mask;
30610 unsigned Opcode = N.getOpcode();
30612 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
30613 // single instruction.
30614 if (VT.getScalarSizeInBits() == 64 &&
30615 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
30616 Opcode == X86ISD::UNPCKL)) {
30617 auto BC0 = peekThroughBitcasts(N.getOperand(0));
30618 auto BC1 = peekThroughBitcasts(N.getOperand(1));
30619 EVT VT0 = BC0.getValueType();
30620 EVT VT1 = BC1.getValueType();
30621 unsigned Opcode0 = BC0.getOpcode();
30622 unsigned Opcode1 = BC1.getOpcode();
30623 if (Opcode0 == Opcode1 && VT0 == VT1 &&
30624 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
30625 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
30626 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
30628 if (Opcode == X86ISD::MOVSD) {
30629 Lo = BC1.getOperand(0);
30630 Hi = BC0.getOperand(1);
30632 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30633 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30635 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
30636 DCI.AddToWorklist(Horiz.getNode());
30637 return DAG.getBitcast(VT, Horiz);
30642 case X86ISD::VBROADCAST: {
30643 // If broadcasting from another shuffle, attempt to simplify it.
30644 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
30645 SDValue Src = N.getOperand(0);
30646 SDValue BC = peekThroughBitcasts(Src);
30647 EVT SrcVT = Src.getValueType();
30648 EVT BCVT = BC.getValueType();
30649 if (isTargetShuffle(BC.getOpcode()) &&
30650 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
30651 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
30652 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
30654 for (unsigned i = 0; i != Scale; ++i)
30655 DemandedMask[i] = i;
30656 if (SDValue Res = combineX86ShufflesRecursively(
30657 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
30658 /*HasVarMask*/ false, DAG, Subtarget))
30659 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
30660 DAG.getBitcast(SrcVT, Res));
30664 case X86ISD::PSHUFD:
30665 case X86ISD::PSHUFLW:
30666 case X86ISD::PSHUFHW:
30667 Mask = getPSHUFShuffleMask(N);
30668 assert(Mask.size() == 4);
30670 case X86ISD::UNPCKL: {
30671 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
30672 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
30673 // moves upper half elements into the lower half part. For example:
30675 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
30677 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
30679 // will be combined to:
30681 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
30683 // This is only done for 128-bit vectors. From SSE4.1 onward this combine may
30684 // not arise because more capable shuffle instructions are selected instead.
30685 if (!VT.is128BitVector())
30688 auto Op0 = N.getOperand(0);
30689 auto Op1 = N.getOperand(1);
30690 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
30691 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
30693 unsigned NumElts = VT.getVectorNumElements();
30694 SmallVector<int, 8> ExpectedMask(NumElts, -1);
30695 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
30698 auto ShufOp = Op1.getOperand(0);
30699 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
30700 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
30704 case X86ISD::BLENDI: {
30705 SDValue V0 = N->getOperand(0);
30706 SDValue V1 = N->getOperand(1);
30707 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
30708 "Unexpected input vector types");
30710 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
30711 // operands and changing the mask to 1. This saves us a bunch of
30712 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
30713 // x86InstrInfo knows how to commute this back after instruction selection
30714 // if it would help register allocation.
30716 // TODO: If optimizing for size or a processor that doesn't suffer from
30717 // partial register update stalls, this should be transformed into a MOVSD
30718 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
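// Concretely (illustrative only): BLENDI(V0, V1, 0b10) produces
// <V0[0], V1[1]>, and BLENDI(V1, V0, 0b01) produces the same result with the
// operands swapped, which is the canonical form created below.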
30720 if (VT == MVT::v2f64)
30721 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
30722 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
30723 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
30724 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
30729 case X86ISD::MOVSD:
30730 case X86ISD::MOVSS: {
30731 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
30732 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
30733 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
30734 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
30735 if (isZero0 && isZero1)
30738 // We often lower to MOVSD/MOVSS from integer as well as native float
30739 // types; remove unnecessary domain-crossing bitcasts if we can to make it
30740 // easier to combine shuffles later on. We've already accounted for the
30741 // domain switching cost when we decided to lower with it.
30742 bool isFloat = VT.isFloatingPoint();
30743 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
30744 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
30745 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
30746 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
30747 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
30748 V0 = DAG.getBitcast(NewVT, V0);
30749 V1 = DAG.getBitcast(NewVT, V1);
30750 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
30755 case X86ISD::INSERTPS: {
30756 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
30757 SDValue Op0 = N.getOperand(0);
30758 SDValue Op1 = N.getOperand(1);
30759 SDValue Op2 = N.getOperand(2);
30760 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
30761 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
30762 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
30763 unsigned ZeroMask = InsertPSMask & 0xF;
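// Reminder of the INSERTPS immediate layout decoded above (not from the
// original comments): bits [7:6] select the source element of Op1, bits [5:4]
// select the destination element in Op0, and bits [3:0] form a mask of
// destination elements to zero out.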
30765 // If we zero out all elements from Op0 then we don't need to reference it.
30766 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
30767 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
30768 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30770 // If we zero out the element from Op1 then we don't need to reference it.
30771 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
30772 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30773 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30775 // Attempt to merge insertps Op1 with an inner target shuffle node.
30776 SmallVector<int, 8> TargetMask1;
30777 SmallVector<SDValue, 2> Ops1;
30778 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
30779 int M = TargetMask1[SrcIdx];
30780 if (isUndefOrZero(M)) {
30781 // Zero/UNDEF insertion - zero out element and remove dependency.
30782 InsertPSMask |= (1u << DstIdx);
30783 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30784 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30786 // Update insertps mask srcidx and reference the source input directly.
30787 assert(0 <= M && M < 8 && "Shuffle index out of range");
30788 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
30789 Op1 = Ops1[M < 4 ? 0 : 1];
30790 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30791 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30794 // Attempt to merge insertps Op0 with an inner target shuffle node.
30795 SmallVector<int, 8> TargetMask0;
30796 SmallVector<SDValue, 2> Ops0;
30797 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
30800 bool Updated = false;
30801 bool UseInput00 = false;
30802 bool UseInput01 = false;
30803 for (int i = 0; i != 4; ++i) {
30804 int M = TargetMask0[i];
30805 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
30806 // No change if element is already zero or the inserted element.
30808 } else if (isUndefOrZero(M)) {
30809 // If the target mask is undef/zero then we must zero the element.
30810 InsertPSMask |= (1u << i);
30815 // The input vector element must be inline.
30816 if (M != i && M != (i + 4))
30819 // Determine which inputs of the target shuffle we're using.
30820 UseInput00 |= (0 <= M && M < 4);
30821 UseInput01 |= (4 <= M);
30824 // If we're not using both inputs of the target shuffle then use the
30825 // referenced input directly.
30826 if (UseInput00 && !UseInput01) {
30829 } else if (!UseInput00 && UseInput01) {
30835 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30836 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30844 // Nuke no-op shuffles that show up after combining.
30845 if (isNoopShuffleMask(Mask))
30846 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30848 // Look for simplifications involving one or two shuffle instructions.
30849 SDValue V = N.getOperand(0);
30850 switch (N.getOpcode()) {
30853 case X86ISD::PSHUFLW:
30854 case X86ISD::PSHUFHW:
30855 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
30857 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
30858 return SDValue(); // We combined away this shuffle, so we're done.
30860 // See if this reduces to a PSHUFD which is no more expensive and can
30861 // combine with more operations. Note that it has to at least flip the
30862 // dwords as otherwise it would have been removed as a no-op.
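// Sketch of the reduction below (illustrative): a PSHUFLW with mask <2,3,0,1>
// swaps the two low dwords, so the same effect is achieved by a PSHUFD with
// dword mask <1,0,2,3>; the PSHUFHW case maps to <0,1,3,2>.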
30863 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
30864 int DMask[] = {0, 1, 2, 3};
30865 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
30866 DMask[DOffset + 0] = DOffset + 1;
30867 DMask[DOffset + 1] = DOffset + 0;
30868 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30869 V = DAG.getBitcast(DVT, V);
30870 DCI.AddToWorklist(V.getNode());
30871 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
30872 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
30873 DCI.AddToWorklist(V.getNode());
30874 return DAG.getBitcast(VT, V);
30877 // Look for shuffle patterns which can be implemented as a single unpack.
30878 // FIXME: This doesn't handle the location of the PSHUFD generically, and
30879 // only works when we have a PSHUFD followed by two half-shuffles.
30880 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30881 (V.getOpcode() == X86ISD::PSHUFLW ||
30882 V.getOpcode() == X86ISD::PSHUFHW) &&
30883 V.getOpcode() != N.getOpcode() &&
30885 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30886 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
30887 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30888 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30889 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30890 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30892 for (int i = 0; i < 4; ++i) {
30893 WordMask[i + NOffset] = Mask[i] + NOffset;
30894 WordMask[i + VOffset] = VMask[i] + VOffset;
30896 // Map the word mask through the DWord mask.
30898 for (int i = 0; i < 8; ++i)
30899 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30900 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30901 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30902 // We can replace all three shuffles with an unpack.
30903 V = DAG.getBitcast(VT, D.getOperand(0));
30904 DCI.AddToWorklist(V.getNode());
30905 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30914 case X86ISD::PSHUFD:
30915 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30924 /// Checks if the shuffle mask takes subsequent elements
30925 /// alternately from two vectors.
30926 /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct.
30927 static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) {
30929 int ParitySrc[2] = {-1, -1};
30930 unsigned Size = Mask.size();
30931 for (unsigned i = 0; i != Size; ++i) {
30936 // Make sure we are using the matching element from the input.
30937 if ((M % Size) != i)
30940 // Make sure we use the same input for all elements of the same parity.
30941 int Src = M / Size;
30942 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
30944 ParitySrc[i % 2] = Src;
30947 // Make sure each input is used.
30948 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
30951 Op0Even = ParitySrc[0] == 0;
30955 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
30956 /// operation. If true is returned then the operands of the ADDSUB(SUBADD)
30957 /// operation are written to the parameters \p Opnd0 and \p Opnd1.
30959 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector
30960 /// shuffle nodes so it is easier to match generically. We also insert dummy
30961 /// vector shuffle nodes for the operands which explicitly discard the lanes
30962 /// which are unused by this operation so that the rest of the combiner can
30963 /// see that they're unused.
30964 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30965 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
30968 EVT VT = N->getValueType(0);
30969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30970 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
30971 !VT.getSimpleVT().isFloatingPoint())
30974 // We only handle target-independent shuffles.
30975 // FIXME: It would be easy and harmless to use the target shuffle mask
30976 // extraction tool to support more.
30977 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30980 SDValue V1 = N->getOperand(0);
30981 SDValue V2 = N->getOperand(1);
30983 // Make sure we have an FADD and an FSUB.
30984 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
30985 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
30986 V1.getOpcode() == V2.getOpcode())
30989 // If there are other uses of these operations we can't fold them.
30990 if (!V1->hasOneUse() || !V2->hasOneUse())
30993 // Ensure that both operations have the same operands. Note that we can
30994 // commute the FADD operands.
30996 if (V1.getOpcode() == ISD::FSUB) {
30997 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
30998 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30999 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
31002 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
31003 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
31004 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
31005 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
31009 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31011 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31014 // It's a subadd if the vector in the even parity is an FADD.
31015 IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD
31016 : V2->getOpcode() == ISD::FADD;
31023 /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd.
31024 static SDValue combineShuffleToFMAddSub(SDNode *N,
31025 const X86Subtarget &Subtarget,
31026 SelectionDAG &DAG) {
31027 // We only handle target-independent shuffles.
31028 // FIXME: It would be easy and harmless to use the target shuffle mask
31029 // extraction tool to support more.
31030 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
31033 MVT VT = N->getSimpleValueType(0);
31034 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31035 if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT))
31038 // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)).
31039 SDValue Op0 = N->getOperand(0);
31040 SDValue Op1 = N->getOperand(1);
31041 SDValue FMAdd = Op0, FMSub = Op1;
31042 if (FMSub.getOpcode() != X86ISD::FMSUB)
31043 std::swap(FMAdd, FMSub);
31045 if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB ||
31046 FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() ||
31047 FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() ||
31048 FMAdd.getOperand(2) != FMSub.getOperand(2))
31051 // Check for correct shuffle mask.
31052 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31054 if (!isAddSubOrSubAddMask(Mask, Op0Even))
31057 // FMAddSub takes the zeroth operand from the FMSub node.
31059 bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd;
31060 unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31061 return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1),
31062 FMAdd.getOperand(2));
31065 /// Try to combine a shuffle into a target-specific add-sub or
31066 /// mul-add-sub node.
31067 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
31068 const X86Subtarget &Subtarget,
31069 SelectionDAG &DAG) {
31070 if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG))
31073 SDValue Opnd0, Opnd1;
31075 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
31078 MVT VT = N->getSimpleValueType(0);
31081 // Try to generate X86ISD::FMADDSUB node here.
31083 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
31084 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
31085 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
31091 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
31092 // the ADDSUB idiom has been successfully recognized. There are no known
31093 // X86 targets with 512-bit ADDSUB instructions!
31094 if (VT.is512BitVector())
31097 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
31100 // We are looking for a shuffle where both sources are concatenated with undef
31101 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
31102 // if we can express this as a single-source shuffle, that's preferable.
31103 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
31104 const X86Subtarget &Subtarget) {
31105 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
31108 EVT VT = N->getValueType(0);
31110 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
31111 if (!VT.is128BitVector() && !VT.is256BitVector())
31114 if (VT.getVectorElementType() != MVT::i32 &&
31115 VT.getVectorElementType() != MVT::i64 &&
31116 VT.getVectorElementType() != MVT::f32 &&
31117 VT.getVectorElementType() != MVT::f64)
31120 SDValue N0 = N->getOperand(0);
31121 SDValue N1 = N->getOperand(1);
31123 // Check that both sources are concats with undef.
31124 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
31125 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
31126 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
31127 !N1.getOperand(1).isUndef())
31130 // Construct the new shuffle mask. Elements from the first source retain their
31131 // index, but elements from the second source no longer need to skip an undef.
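// Illustrative example (not from the original comments): with v8i32 sources,
// mask element 10 refers to element 2 of the second concat's low half and is
// remapped to 10 - 8/2 = 6 in the single concatenated source built below.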
31132 SmallVector<int, 8> Mask;
31133 int NumElts = VT.getVectorNumElements();
31135 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31136 for (int Elt : SVOp->getMask())
31137 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
31140 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
31142 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
31145 /// Eliminate a redundant shuffle of a horizontal math op.
31146 static SDValue foldShuffleOfHorizOp(SDNode *N) {
31147 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
31150 SDValue HOp = N->getOperand(0);
31151 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
31152 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
31155 // 128-bit horizontal math instructions are defined to operate on adjacent
31156 // lanes of each operand as:
31157 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
31158 // ...similarly for v2f64 and v8i16.
31159 // TODO: 256-bit is not the same because the instructions operate on 128-bit lanes.
31160 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
31163 // When the operands of a horizontal math op are identical, the low half of
31164 // the result is the same as the high half. If the shuffle is also replicating
31165 // low and high halves, we don't need the shuffle.
31166 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
31167 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
31168 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
31169 // but this should be tied to whatever horizontal op matching and shuffle
31170 // canonicalization are producing.
31171 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
31172 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
31173 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
31179 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
31180 TargetLowering::DAGCombinerInfo &DCI,
31181 const X86Subtarget &Subtarget) {
31183 EVT VT = N->getValueType(0);
31184 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31185 // If we have legalized the vector types, look for blends of FADD and FSUB
31186 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
31187 if (TLI.isTypeLegal(VT)) {
31188 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
31191 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
31195 // During Type Legalization, when promoting illegal vector types,
31196 // the backend might introduce new shuffle dag nodes and bitcasts.
31198 // This code performs the following transformation:
31199 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
31200 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
31202 // We do this only if both the bitcast and the BINOP dag nodes have
31203 // one use. Also, perform this transformation only if the new binary
31204 // operation is legal. This is to avoid introducing dag nodes that
31205 // potentially need to be further expanded (or custom lowered) into a
31206 // less optimal sequence of dag nodes.
31207 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
31208 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
31209 N->getOperand(0).getOpcode() == ISD::BITCAST &&
31210 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
31211 SDValue N0 = N->getOperand(0);
31212 SDValue N1 = N->getOperand(1);
31214 SDValue BC0 = N0.getOperand(0);
31215 EVT SVT = BC0.getValueType();
31216 unsigned Opcode = BC0.getOpcode();
31217 unsigned NumElts = VT.getVectorNumElements();
31219 if (BC0.hasOneUse() && SVT.isVector() &&
31220 SVT.getVectorNumElements() * 2 == NumElts &&
31221 TLI.isOperationLegal(Opcode, VT)) {
31222 bool CanFold = false;
31228 // isOperationLegal lies for integer ops on floating point types.
31229 CanFold = VT.isInteger();
31234 // isOperationLegal lies for floating point ops on integer types.
31235 CanFold = VT.isFloatingPoint();
31239 unsigned SVTNumElts = SVT.getVectorNumElements();
31240 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
31241 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
31242 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
31243 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
31244 CanFold = SVOp->getMaskElt(i) < 0;
31247 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
31248 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
31249 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
31250 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
31255 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
31256 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
31257 // consecutive, non-overlapping, and in the right order.
31258 SmallVector<SDValue, 16> Elts;
31259 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
31260 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
31261 Elts.push_back(Elt);
31268 if (Elts.size() == VT.getVectorNumElements())
31270 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
31273 // For AVX2, we sometimes want to combine
31274 // (vector_shuffle <mask> (concat_vectors t1, undef)
31275 // (concat_vectors t2, undef))
31277 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
31278 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
31279 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
31282 if (isTargetShuffle(N->getOpcode())) {
31284 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
31287 // Try recursively combining arbitrary sequences of x86 shuffle
31288 // instructions into higher-order shuffles. We do this after combining
31289 // specific PSHUF instruction sequences into their minimal form so that we
31290 // can evaluate how many specialized shuffle instructions are involved in
31291 // a particular chain.
31292 if (SDValue Res = combineX86ShufflesRecursively(
31293 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
31294 /*HasVarMask*/ false, DAG, Subtarget)) {
31295 DCI.CombineTo(N, Res);
31303 /// Check if a vector extract from a target-specific shuffle of a load can be
31304 /// folded into a single element load.
31305 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
31306 /// shuffles have been custom lowered so we need to handle those here.
31307 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
31308 TargetLowering::DAGCombinerInfo &DCI) {
31309 if (DCI.isBeforeLegalizeOps())
31312 SDValue InVec = N->getOperand(0);
31313 SDValue EltNo = N->getOperand(1);
31314 EVT EltVT = N->getValueType(0);
31316 if (!isa<ConstantSDNode>(EltNo))
31319 EVT OriginalVT = InVec.getValueType();
31321 // Peek through bitcasts, don't duplicate a load with other uses.
31322 InVec = peekThroughOneUseBitcasts(InVec);
31324 EVT CurrentVT = InVec.getValueType();
31325 if (!CurrentVT.isVector() ||
31326 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
31329 if (!isTargetShuffle(InVec.getOpcode()))
31332 // Don't duplicate a load with other uses.
31333 if (!InVec.hasOneUse())
31336 SmallVector<int, 16> ShuffleMask;
31337 SmallVector<SDValue, 2> ShuffleOps;
31339 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
31340 ShuffleOps, ShuffleMask, UnaryShuffle))
31343 // Select the input vector, guarding against out of range extract vector.
31344 unsigned NumElems = CurrentVT.getVectorNumElements();
31345 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
31346 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
31348 if (Idx == SM_SentinelZero)
31349 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
31350 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
31351 if (Idx == SM_SentinelUndef)
31352 return DAG.getUNDEF(EltVT);
31354 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
31355 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
31358 // If inputs to shuffle are the same for both ops, then allow 2 uses
31359 unsigned AllowedUses =
31360 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
31362 if (LdNode.getOpcode() == ISD::BITCAST) {
31363 // Don't duplicate a load with other uses.
31364 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
31367 AllowedUses = 1; // only allow 1 load use if we have a bitcast
31368 LdNode = LdNode.getOperand(0);
31371 if (!ISD::isNormalLoad(LdNode.getNode()))
31374 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
31376 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
31379 // If there's a bitcast before the shuffle, check if the load type and
31380 // alignment are valid.
31381 unsigned Align = LN0->getAlignment();
31382 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31383 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
31384 EltVT.getTypeForEVT(*DAG.getContext()));
31386 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
31389 // All checks match, so transform back to vector_shuffle so that the DAG
31390 // combiner can finish the job.
31393 // Create shuffle node taking into account the case that it's a unary shuffle.
31394 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
31395 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
31397 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
31398 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
31402 // Try to match patterns such as
31403 // (i16 bitcast (v16i1 x))
31405 // (i16 movmsk (v16i8 sext (v16i1 x)))
31406 // before the illegal vector is scalarized on subtargets that don't have legal
31408 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
31409 const X86Subtarget &Subtarget) {
31410 EVT VT = BitCast.getValueType();
31411 SDValue N0 = BitCast.getOperand(0);
31412 EVT VecVT = N0->getValueType(0);
31414 if (!VT.isScalarInteger() || !VecVT.isSimple())
31417 // With AVX512 vxi1 types are legal and we prefer using k-regs.
31418 // MOVMSK is supported in SSE2 or later.
31419 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
31422 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
31423 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
31424 // v8i16 and v16i16.
31425 // For these two cases, we can shuffle the upper element bytes to a
31426 // consecutive sequence at the start of the vector and treat the results as
31427 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
31428 // for v16i16 this is not the case, because the shuffle is expensive, so we
31429 // avoid sign-extending to this type entirely.
31430 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
31431 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
31433 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
31434 switch (VecVT.getSimpleVT().SimpleTy) {
31438 SExtVT = MVT::v2i64;
31439 FPCastVT = MVT::v2f64;
31442 SExtVT = MVT::v4i32;
31443 FPCastVT = MVT::v4f32;
31444 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
31445 // sign-extend to a 256-bit operation to avoid truncation.
31446 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31447 N0->getOperand(0).getValueType().is256BitVector()) {
31448 SExtVT = MVT::v4i64;
31449 FPCastVT = MVT::v4f64;
31453 SExtVT = MVT::v8i16;
31454 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
31455 // sign-extend to a 256-bit operation to match the compare.
31456 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
31457 // 256-bit because the shuffle is cheaper than sign extending the result of
31459 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
31460 (N0->getOperand(0).getValueType().is256BitVector() ||
31461 N0->getOperand(0).getValueType().is512BitVector())) {
31462 SExtVT = MVT::v8i32;
31463 FPCastVT = MVT::v8f32;
31467 SExtVT = MVT::v16i8;
31468 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
31469 // it is not profitable to sign-extend to 256-bit because this will
31470 // require an extra cross-lane shuffle which is more expensive than
31471 // truncating the result of the compare to 128-bits.
31474 SExtVT = MVT::v32i8;
31479 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
31481 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
31482 V = getPMOVMSKB(DL, V, DAG, Subtarget);
31483 return DAG.getZExtOrTrunc(V, DL, VT);
31486 if (SExtVT == MVT::v8i16) {
31487 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
31488 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
31489 DAG.getUNDEF(MVT::v8i16));
31491 assert(SExtVT.getScalarType() != MVT::i16 &&
31492 "Vectors of i16 must be packed");
31493 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
31494 V = DAG.getBitcast(FPCastVT, V);
31495 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
31496 return DAG.getZExtOrTrunc(V, DL, VT);
31499 // Convert a vXi1 constant build vector to the same width scalar integer.
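// For example (illustrative note): the v4i1 constant <1,0,1,1> becomes the
// scalar value 0b1101, i.e. vector element i supplies bit i of the integer.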
31500 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
31501 EVT SrcVT = Op.getValueType();
31502 assert(SrcVT.getVectorElementType() == MVT::i1 &&
31503 "Expected a vXi1 vector");
31504 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
31505 "Expected a constant build vector");
31507 APInt Imm(SrcVT.getVectorNumElements(), 0);
31508 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
31509 SDValue In = Op.getOperand(Idx);
31510 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
31513 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
31514 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
31517 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31518 TargetLowering::DAGCombinerInfo &DCI,
31519 const X86Subtarget &Subtarget) {
31520 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
31522 if (!DCI.isBeforeLegalizeOps())
31525 // Only do this if we have k-registers.
31526 if (!Subtarget.hasAVX512())
31529 EVT DstVT = N->getValueType(0);
31530 SDValue Op = N->getOperand(0);
31531 EVT SrcVT = Op.getValueType();
31533 if (!Op.hasOneUse())
31536 // Look for logic ops.
31537 if (Op.getOpcode() != ISD::AND &&
31538 Op.getOpcode() != ISD::OR &&
31539 Op.getOpcode() != ISD::XOR)
31542 // Make sure we have a bitcast between mask registers and a scalar type.
31543 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31544 DstVT.isScalarInteger()) &&
31545 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
31546 SrcVT.isScalarInteger()))
31549 SDValue LHS = Op.getOperand(0);
31550 SDValue RHS = Op.getOperand(1);
31552 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
31553 LHS.getOperand(0).getValueType() == DstVT)
31554 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
31555 DAG.getBitcast(DstVT, RHS));
31557 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
31558 RHS.getOperand(0).getValueType() == DstVT)
31559 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31560 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
31562 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
31563 // Most of these have to move a constant from the scalar domain anyway.
31564 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
31565 RHS = combinevXi1ConstantToInteger(RHS, DAG);
31566 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
31567 DAG.getBitcast(DstVT, LHS), RHS);
31573 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
31574 const X86Subtarget &Subtarget) {
31576 unsigned NumElts = N.getNumOperands();
31578 auto *BV = cast<BuildVectorSDNode>(N);
31579 SDValue Splat = BV->getSplatValue();
31581 // Build MMX element from integer GPR or SSE float values.
31582 auto CreateMMXElement = [&](SDValue V) {
31584 return DAG.getUNDEF(MVT::x86mmx);
31585 if (V.getValueType().isFloatingPoint()) {
31586 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
31587 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31588 V = DAG.getBitcast(MVT::v2i64, V);
31589 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31591 V = DAG.getBitcast(MVT::i32, V);
31593 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31595 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31598 // Convert build vector ops to MMX data in the bottom elements.
31599 SmallVector<SDValue, 8> Ops;
31601 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
31603 if (Splat.isUndef())
31604 return DAG.getUNDEF(MVT::x86mmx);
31606 Splat = CreateMMXElement(Splat);
31608 if (Subtarget.hasSSE1()) {
31609 // Unpack v8i8 to splat i8 elements to lowest 16-bits.
31611 Splat = DAG.getNode(
31612 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31613 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31616 // Use PSHUFW to repeat 16-bit elements.
31617 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
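// Mask 0x00 replicates word 0 into all four words; 0x44 (<0,1,0,1>) is used
// for the two-element case so that the full 32-bit element, which spans two
// words, is repeated. (Explanatory note, assuming the usual PSHUFW immediate
// encoding.)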
31618 return DAG.getNode(
31619 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31620 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31621 DAG.getConstant(ShufMask, DL, MVT::i8));
31623 Ops.append(NumElts, Splat);
31625 for (unsigned i = 0; i != NumElts; ++i)
31626 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31629 // Use a tree of PUNPCKLs to build up the general MMX vector.
31630 while (Ops.size() > 1) {
31631 unsigned NumOps = Ops.size();
31632 unsigned IntrinOp =
31633 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31634 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31635 : Intrinsic::x86_mmx_punpcklbw));
31636 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31637 for (unsigned i = 0; i != NumOps; i += 2)
31638 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31639 Ops[i], Ops[i + 1]);
31640 Ops.resize(NumOps / 2);
31646 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
31647 TargetLowering::DAGCombinerInfo &DCI,
31648 const X86Subtarget &Subtarget) {
31649 SDValue N0 = N->getOperand(0);
31650 EVT VT = N->getValueType(0);
31651 EVT SrcVT = N0.getValueType();
31653 // Try to match patterns such as
31654 // (i16 bitcast (v16i1 x))
31656 // (i16 movmsk (v16i8 sext (v16i1 x)))
31657 // before the setcc result is scalarized on subtargets that don't have legal
31659 if (DCI.isBeforeLegalize()) {
31660 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
31663 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31664 // type, widen both sides to avoid a trip through memory.
31665 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
31666 Subtarget.hasAVX512()) {
31668 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
31669 N0 = DAG.getBitcast(MVT::v8i1, N0);
31670 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
31671 DAG.getIntPtrConstant(0, dl));
31674 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31675 // type, widen both sides to avoid a trip through memory.
31676 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
31677 Subtarget.hasAVX512()) {
31679 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
31680 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
31682 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
31683 N0 = DAG.getBitcast(MVT::i8, N0);
31684 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
31688 // Since MMX types are special and don't usually play with other vector types,
31689 // it's better to handle them early to be sure we emit efficient code by
31690 // avoiding store-load conversions.
31691 if (VT == MVT::x86mmx) {
31692 // Detect MMX constant vectors.
31694 SmallVector<APInt, 1> EltBits;
31695 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
31697 // Handle zero-extension of i32 with MOVD.
31698 if (EltBits[0].countLeadingZeros() >= 32)
31699 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
31700 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
31701 // Else, bitcast to a double.
31702 // TODO - investigate supporting sext 32-bit immediates on x86_64.
31703 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
31704 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
31707 // Detect bitcasts to x86mmx low word.
31708 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31709 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
31710 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
31711 bool LowUndef = true, AllUndefOrZero = true;
31712 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
31713 SDValue Op = N0.getOperand(i);
31714 LowUndef &= Op.isUndef() || (i >= e/2);
31715 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
31717 if (AllUndefOrZero) {
31718 SDValue N00 = N0.getOperand(0);
31720 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
31721 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
31722 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
31726 // Detect bitcasts of 64-bit build vectors and convert to a
31727 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the bottom element.
31729 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31730 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31731 SrcVT == MVT::v8i8))
31732 return createMMXBuildVector(N0, DAG, Subtarget);
31734 // Detect bitcasts between element or subvector extraction to x86mmx.
31735 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
31736 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
31737 isNullConstant(N0.getOperand(1))) {
31738 SDValue N00 = N0.getOperand(0);
31739 if (N00.getValueType().is128BitVector())
31740 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
31741 DAG.getBitcast(MVT::v2i64, N00));
31744 // Detect bitcasts from FP_TO_SINT to x86mmx.
31745 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
31747 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
31748 DAG.getUNDEF(MVT::v2i32));
31749 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
31750 DAG.getBitcast(MVT::v2i64, Res));
31754 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
31755 // most of these to scalar anyway.
31756 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
31757 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31758 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
31759 return combinevXi1ConstantToInteger(N0, DAG);
31762 if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
31763 VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
31764 isa<ConstantSDNode>(N0)) {
31765 auto *C = cast<ConstantSDNode>(N0);
31766 if (C->isAllOnesValue())
31767 return DAG.getConstant(1, SDLoc(N0), VT);
31768 if (C->isNullValue())
31769 return DAG.getConstant(0, SDLoc(N0), VT);
31772 // Try to remove bitcasts from input and output of mask arithmetic to
31773 // remove GPR<->K-register crossings.
31774 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
31777 // Convert a bitcasted integer logic operation that has one bitcasted
31778 // floating-point operand into a floating-point logic operation. This may
31779 // create a load of a constant, but that is cheaper than materializing the
31780 // constant in an integer register and transferring it to an SSE register or
31781 // transferring the SSE operand to integer register and back.
31783 switch (N0.getOpcode()) {
31784 case ISD::AND: FPOpcode = X86ISD::FAND; break;
31785 case ISD::OR: FPOpcode = X86ISD::FOR; break;
31786 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
31787 default: return SDValue();
31790 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
31791 (Subtarget.hasSSE2() && VT == MVT::f64)))
31794 SDValue LogicOp0 = N0.getOperand(0);
31795 SDValue LogicOp1 = N0.getOperand(1);
31798 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
31799 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
31800 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
31801 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
31802 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
31803 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
31805 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
31806 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
31807 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
31808 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
31809 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
31810 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
31816 // Match a binop + shuffle pyramid that represents a horizontal reduction over
31817 // the elements of a vector.
31818 // Returns the vector that is being reduced on, or SDValue() if a reduction
31819 // was not matched.
31820 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
31821 ArrayRef<ISD::NodeType> CandidateBinOps) {
31822 // The pattern must end in an extract from index 0.
31823 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
31824 !isNullConstant(Extract->getOperand(1)))
31827 SDValue Op = Extract->getOperand(0);
31828 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
31830 // Match against one of the candidate binary ops.
31831 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
31832 return Op.getOpcode() == unsigned(BinOp);
31836 // At each stage, we're looking for something that looks like:
31837 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
31838 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
31839 // i32 undef, i32 undef, i32 undef, i32 undef>
31840 // %a = binop <8 x i32> %op, %s
31841 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
31842 // we expect something like:
31843 // <4,5,6,7,u,u,u,u>
31844 // <2,3,u,u,u,u,u,u>
31845 // <1,u,u,u,u,u,u,u>
31846 unsigned CandidateBinOp = Op.getOpcode();
31847 for (unsigned i = 0; i < Stages; ++i) {
31848 if (Op.getOpcode() != CandidateBinOp)
31851 ShuffleVectorSDNode *Shuffle =
31852 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
31854 Op = Op.getOperand(1);
31856 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
31857 Op = Op.getOperand(0);
31860 // The first operand of the shuffle should be the same as the other operand
31862 if (!Shuffle || Shuffle->getOperand(0) != Op)
31865 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
31866 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
31867 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
31871 BinOp = CandidateBinOp;
31875 // Given a select, detect the following pattern:
31876 // 1: %2 = zext <N x i8> %0 to <N x i32>
31877 // 2: %3 = zext <N x i8> %1 to <N x i32>
31878 // 3: %4 = sub nsw <N x i32> %2, %3
31879 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
31880 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
31881 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
31882 // This is useful as it is the input into a SAD pattern.
31883 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
31885 // Check the condition of the select instruction is greater-than.
31886 SDValue SetCC = Select->getOperand(0);
31887 if (SetCC.getOpcode() != ISD::SETCC)
31889 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
31890 if (CC != ISD::SETGT && CC != ISD::SETLT)
31893 SDValue SelectOp1 = Select->getOperand(1);
31894 SDValue SelectOp2 = Select->getOperand(2);
31896 // The following instructions assume SelectOp1 is the subtraction operand
31897 // and SelectOp2 is the negation operand.
31898 // In the case of SETLT this is the other way around.
31899 if (CC == ISD::SETLT)
31900 std::swap(SelectOp1, SelectOp2);
31902 // The second operand of the select should be the negation of the first
31903 // operand, which is implemented as 0 - SelectOp1.
31904 if (!(SelectOp2.getOpcode() == ISD::SUB &&
31905 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
31906 SelectOp2.getOperand(1) == SelectOp1))
31909 // The first operand of SetCC is the first operand of the select, which is the
31910 // difference between the two input vectors.
31911 if (SetCC.getOperand(0) != SelectOp1)
31914 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
31916 if ((CC == ISD::SETLT) &&
31917 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
31918 SplatVal.isOneValue()) ||
31919 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
31922 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
31923 if ((CC == ISD::SETGT) &&
31924 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
31925 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
31928 // The first operand of the select is the difference between the two input
31930 if (SelectOp1.getOpcode() != ISD::SUB)
31933 Op0 = SelectOp1.getOperand(0);
31934 Op1 = SelectOp1.getOperand(1);
31936 // Check if the operands of the sub are zero-extended from vectors of i8.
31937 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
31938 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
31939 Op1.getOpcode() != ISD::ZERO_EXTEND ||
31940 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
31946 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
31948 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
31949 const SDValue &Zext1, const SDLoc &DL,
31950 const X86Subtarget &Subtarget) {
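// Reminder of the PSADBW semantics relied on below (not from the original
// comments): each 64-bit lane of the result holds the sum of absolute
// differences of the corresponding eight byte pairs of the inputs, so
// zero-filling the unused bytes leaves the reduction value unchanged.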
31951 // Find the appropriate width for the PSADBW.
31952 EVT InVT = Zext0.getOperand(0).getValueType();
31953 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
31955 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
31956 // fill in the missing vector elements with 0.
31957 unsigned NumConcat = RegSize / InVT.getSizeInBits();
31958 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
31959 Ops[0] = Zext0.getOperand(0);
31960 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
31961 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31962 Ops[0] = Zext1.getOperand(0);
31963 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31965 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
31966 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
31967 ArrayRef<SDValue> Ops) {
31968 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
31969 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
31971 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
31972 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
31976 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with
31978 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
31979 const X86Subtarget &Subtarget) {
31980 // Bail without SSE41.
31981 if (!Subtarget.hasSSE41())
31984 EVT ExtractVT = Extract->getValueType(0);
31985 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
31988 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
31990 SDValue Src = matchBinOpReduction(
31991 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
31995 EVT SrcVT = Src.getValueType();
31996 EVT SrcSVT = SrcVT.getScalarType();
31997 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
32001 SDValue MinPos = Src;
32003 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
32004 while (SrcVT.getSizeInBits() > 128) {
32005 unsigned NumElts = SrcVT.getVectorNumElements();
32006 unsigned NumSubElts = NumElts / 2;
32007 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
32008 unsigned SubSizeInBits = SrcVT.getSizeInBits();
32009 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
32010 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
32011 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
32013 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
32014 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
32015 "Unexpected value type");
32017 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
32018 // to flip the value accordingly.
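// For example (informal sketch): for SMIN the elements are XORed with the
// sign bit (0x8000 for i16), which maps signed order onto unsigned order so
// that PHMINPOSUW's unsigned minimum picks the signed minimum; the same XOR
// is applied again afterwards to recover the original value.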
32020 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
32021 if (BinOp == ISD::SMAX)
32022 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
32023 else if (BinOp == ISD::SMIN)
32024 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
32025 else if (BinOp == ISD::UMAX)
32026 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
32029 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
32031 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
32032 // shuffling each upper element down and inserting zeros. This means that the
32033 // v16i8 UMIN will leave the upper elements as zero, performing zero-extension
32034 // ready for the PHMINPOS.
32035 if (ExtractVT == MVT::i8) {
32036 SDValue Upper = DAG.getVectorShuffle(
32037 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
32038 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
32039 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
32042 // Perform the PHMINPOS on a v8i16 vector.
32043 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
32044 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
32045 MinPos = DAG.getBitcast(SrcVT, MinPos);
32048 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
32050 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
32051 DAG.getIntPtrConstant(0, DL));
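// Worked example (illustrative, assuming an SMAX v8i16 reduction): XORing
// every element with 0x7FFF maps the signed maximum onto the unsigned minimum
// (e.g. 32767 -> 0, -32768 -> 65535), PHMINPOSUW then finds that unsigned
// minimum, and XORing the result with 0x7FFF again recovers the signed maximum
// that is finally extracted from element 0.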
32054 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
32055 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
32057 const X86Subtarget &Subtarget) {
32058 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
32059 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
32062 EVT ExtractVT = Extract->getValueType(0);
32063 unsigned BitWidth = ExtractVT.getSizeInBits();
32064 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
32065 ExtractVT != MVT::i8)
32068 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
32069 unsigned BinOp = 0;
32070 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
32074 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
32075 // which we can't support here for now.
32076 if (Match.getScalarValueSizeInBits() != BitWidth)
32079 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
32080 unsigned MatchSizeInBits = Match.getValueSizeInBits();
32081 if (!(MatchSizeInBits == 128 ||
32082 (MatchSizeInBits == 256 &&
32083 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
32086 // Don't bother performing this for 2-element vectors.
32087 if (Match.getValueType().getVectorNumElements() <= 2)
32090 // Check that we are extracting a reduction of all sign bits.
32091 if (DAG.ComputeNumSignBits(Match) != BitWidth)
32094 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
32096 if (64 == BitWidth || 32 == BitWidth)
32097 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
32098 MatchSizeInBits / BitWidth);
32100 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
32103 ISD::CondCode CondCode;
32104 if (BinOp == ISD::OR) {
32105 // any_of -> MOVMSK != 0
32106 CompareBits = APInt::getNullValue(32);
32107 CondCode = ISD::CondCode::SETNE;
32109 // all_of -> MOVMSK == ((1 << NumElts) - 1)
32110 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
32111 CondCode = ISD::CondCode::SETEQ;
32114 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
32116 unsigned ResWidth = std::max(BitWidth, 32u);
32117 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
32119 SDValue Zero = DAG.getConstant(0, DL, ResVT);
32120 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
32121 SDValue Res = DAG.getBitcast(MaskVT, Match);
32122 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
32123 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
32124 Ones, Zero, CondCode);
32125 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
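// Illustrative example (a sketch, not part of the original comments): for an
// all_of reduction over a v4i32 compare result this emits roughly
//   (movmskps %cmp) == 0xF   ; all four sign bits set -> every lane true
// while an any_of reduction becomes
//   (movmskps %cmp) != 0     ; at least one sign bit set
// selecting an all-ones or zero value that is then truncated to ExtractVT.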
32128 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
32129 const X86Subtarget &Subtarget) {
32130 // PSADBW is only supported on SSE2 and up.
32131 if (!Subtarget.hasSSE2())
32134 // Verify the type we're extracting from is an integer type wider than i16.
32135 EVT VT = Extract->getOperand(0).getValueType();
32136 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
32139 unsigned RegSize = 128;
32140 if (Subtarget.useBWIRegs())
32142 else if (Subtarget.hasAVX())
32145 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
32146 // TODO: We should be able to handle larger vectors by splitting them before
32147 // feeding them into several SADs, and then reducing over those.
32148 if (RegSize / VT.getVectorNumElements() < 8)
32151 // Match shuffle + add pyramid.
32152 unsigned BinOp = 0;
32153 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
32155 // The operand is expected to be zero extended from i8
32156 // (verified in detectZextAbsDiff).
32157 // In order to convert to i64 and above, additional any/zero/sign
32158 // extend is expected.
32159 // The zero extend from 32 bits has no mathematical effect on the result.
32160 // The sign extend is also effectively a zero extend
32161 // (it extends the sign bit, which is zero),
32162 // so it is correct to skip the sign/zero extend instruction.
32163 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
32164 Root.getOpcode() == ISD::ZERO_EXTEND ||
32165 Root.getOpcode() == ISD::ANY_EXTEND))
32166 Root = Root.getOperand(0);
32168 // If there was a match, we want Root to be a select that is the root of an
32169 // abs-diff pattern.
32170 if (!Root || (Root.getOpcode() != ISD::VSELECT))
32173 // Check whether we have an abs-diff pattern feeding into the select.
32174 SDValue Zext0, Zext1;
32175 if (!detectZextAbsDiff(Root, Zext0, Zext1))
32178 // Create the SAD instruction.
32180 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
32182 // If the original vector was wider than 8 elements, sum over the results
32183 // in the SAD vector.
32184 unsigned Stages = Log2_32(VT.getVectorNumElements());
32185 MVT SadVT = SAD.getSimpleValueType();
32187 unsigned SadElems = SadVT.getVectorNumElements();
32189 for(unsigned i = Stages - 3; i > 0; --i) {
32190 SmallVector<int, 16> Mask(SadElems, -1);
32191 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
32192 Mask[j] = MaskEnd + j;
32195 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
32196 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
32200 MVT Type = Extract->getSimpleValueType(0);
32201 unsigned TypeSizeInBits = Type.getSizeInBits();
32202 // Return the lowest TypeSizeInBits bits.
32203 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
32204 SAD = DAG.getBitcast(ResVT, SAD);
32205 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
32206 Extract->getOperand(1));
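// Worked example (illustrative): with a 256-bit SAD result <4 x i64>
// {a, b, c, d}, the reduction loop above performs
//   {a, b, c, d}       + {c, d, u, u}   -> {a+c, b+d, u, u}
//   {a+c, b+d, u, u}   + {b+d, u, u, u} -> {a+b+c+d, u, u, u}
// leaving the total in element 0, from which the requested bits are extracted.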
32209 // Attempt to peek through a target shuffle and extract the scalar from the source.
32211 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
32212 TargetLowering::DAGCombinerInfo &DCI,
32213 const X86Subtarget &Subtarget) {
32214 if (DCI.isBeforeLegalizeOps())
32217 SDValue Src = N->getOperand(0);
32218 SDValue Idx = N->getOperand(1);
32220 EVT VT = N->getValueType(0);
32221 EVT SrcVT = Src.getValueType();
32222 EVT SrcSVT = SrcVT.getVectorElementType();
32223 unsigned NumSrcElts = SrcVT.getVectorNumElements();
32225 // Don't attempt this for boolean mask vectors or unknown extraction indices.
32226 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
32229 // Handle extract(broadcast(scalar_value)); it doesn't matter what the index is.
32230 if (X86ISD::VBROADCAST == Src.getOpcode() &&
32231 Src.getOperand(0).getValueType() == VT)
32232 return Src.getOperand(0);
32234 // Resolve the target shuffle inputs and mask.
32235 SmallVector<int, 16> Mask;
32236 SmallVector<SDValue, 2> Ops;
32237 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
32240 // Attempt to narrow/widen the shuffle mask to the correct size.
32241 if (Mask.size() != NumSrcElts) {
32242 if ((NumSrcElts % Mask.size()) == 0) {
32243 SmallVector<int, 16> ScaledMask;
32244 int Scale = NumSrcElts / Mask.size();
32245 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
32246 Mask = std::move(ScaledMask);
32247 } else if ((Mask.size() % NumSrcElts) == 0) {
32248 SmallVector<int, 16> WidenedMask;
32249 while (Mask.size() > NumSrcElts &&
32250 canWidenShuffleElements(Mask, WidenedMask))
32251 Mask = std::move(WidenedMask);
32252 // TODO - investigate support for wider shuffle masks with known upper
32253 // undef/zero elements for implicit zero-extension.
32257 // Check if narrowing/widening failed.
32258 if (Mask.size() != NumSrcElts)
32261 int SrcIdx = Mask[N->getConstantOperandVal(1)];
32264 // If the shuffle source element is undef/zero then we can just accept it.
32265 if (SrcIdx == SM_SentinelUndef)
32266 return DAG.getUNDEF(VT);
32268 if (SrcIdx == SM_SentinelZero)
32269 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
32270 : DAG.getConstant(0, dl, VT);
32272 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
32273 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
32274 SrcIdx = SrcIdx % Mask.size();
32276 // We can only extract other elements from 128-bit vectors and in certain
32277 // circumstances, depending on SSE-level.
32278 // TODO: Investigate using extract_subvector for larger vectors.
32279 // TODO: Investigate float/double extraction if it will be just stored.
32280 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
32281 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
32282 assert(SrcSVT == VT && "Unexpected extraction type");
32283 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
32284 DAG.getIntPtrConstant(SrcIdx, dl));
32287 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
32288 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
32289 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
32290 "Unexpected extraction type");
32291 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
32292 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
32293 DAG.getIntPtrConstant(SrcIdx, dl));
32294 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
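// For example (an illustrative case, not taken from the source comments):
// extracting lane 3 of a shuffled v8i16 can bypass the shuffle entirely,
//   (i16 (extract_elt (shuffle X, Y, M), 3))
//     -> (zext/trunc (X86ISD::PEXTRW X', M[3] % 8))
// where X' is whichever resolved shuffle input M[3] selects.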
32300 /// Detect vector gather/scatter index generation and convert it from being a
32301 /// bunch of shuffles and extracts into a somewhat faster sequence.
32302 /// For i686, the best sequence is apparently storing the value and loading
32303 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
32304 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
32305 TargetLowering::DAGCombinerInfo &DCI,
32306 const X86Subtarget &Subtarget) {
32307 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
32310 // TODO - Remove this once we can handle the implicit zero-extension of
32311 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
32312 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
32313 // combineBasicSADPattern.
32314 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
32317 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
32320 SDValue InputVector = N->getOperand(0);
32321 SDValue EltIdx = N->getOperand(1);
32323 EVT SrcVT = InputVector.getValueType();
32324 EVT VT = N->getValueType(0);
32325 SDLoc dl(InputVector);
32327 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
32328 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32329 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
32330 SDValue MMXSrc = InputVector.getOperand(0);
32332 // The bitcast source is a direct mmx result.
32333 if (MMXSrc.getValueType() == MVT::x86mmx)
32334 return DAG.getBitcast(VT, InputVector);
32337 // Detect mmx to i32 conversion through a v2i32 elt extract.
32338 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
32339 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
32340 SDValue MMXSrc = InputVector.getOperand(0);
32342 // The bitcast source is a direct mmx result.
32343 if (MMXSrc.getValueType() == MVT::x86mmx)
32344 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
32347 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
32348 isa<ConstantSDNode>(EltIdx) &&
32349 isa<ConstantSDNode>(InputVector.getOperand(0))) {
32350 uint64_t ExtractedElt = N->getConstantOperandVal(1);
32351 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
32352 const APInt &InputValue = InputC->getAPIntValue();
32353 uint64_t Res = InputValue[ExtractedElt];
32354 return DAG.getConstant(Res, dl, MVT::i1);
32357 // Check whether this extract is the root of a sum of absolute differences
32358 // pattern. This has to be done here because we really want it to happen
32359 // pre-legalization.
32360 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
32363 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
32364 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
32367 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
32368 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
32374 /// If a vector select has an operand that is -1 or 0, try to simplify the
32375 /// select to a bitwise logic operation.
32376 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
32378 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
32379 TargetLowering::DAGCombinerInfo &DCI,
32380 const X86Subtarget &Subtarget) {
32381 SDValue Cond = N->getOperand(0);
32382 SDValue LHS = N->getOperand(1);
32383 SDValue RHS = N->getOperand(2);
32384 EVT VT = LHS.getValueType();
32385 EVT CondVT = Cond.getValueType();
32387 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32389 if (N->getOpcode() != ISD::VSELECT)
32392 assert(CondVT.isVector() && "Vector select expects a vector selector!");
32394 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
32395 // Check if the first operand is all zeros and Cond type is vXi1.
32396 // This situation only applies to AVX512.
32397 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
32398 CondVT.getVectorElementType() == MVT::i1) {
32399 // Invert the cond to not(cond) : xor(op,allones)=not(op)
32400 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
32401 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
32402 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
32405 // To use the condition operand as a bitwise mask, it must have elements that
32406 // are the same size as the select elements. I.e., the condition operand must
32407 // have already been promoted from the IR select condition type <N x i1>.
32408 // Don't check if the types themselves are equal because that excludes
32409 // vector floating-point selects.
32410 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
32413 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
32414 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
32416 // Try to invert the condition if true value is not all 1s and false value is not all 0s.
32418 if (!TValIsAllOnes && !FValIsAllZeros &&
32419 // Check if the selector will be produced by CMPP*/PCMP*.
32420 Cond.getOpcode() == ISD::SETCC &&
32421 // Check if SETCC has already been promoted.
32422 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
32424 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
32426 if (TValIsAllZeros || FValIsAllOnes) {
32427 SDValue CC = Cond.getOperand(2);
32428 ISD::CondCode NewCC =
32429 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
32430 Cond.getOperand(0).getValueType().isInteger());
32431 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
32433 std::swap(LHS, RHS);
32434 TValIsAllOnes = FValIsAllOnes;
32435 FValIsAllZeros = TValIsAllZeros;
32439 // Cond value must be 'sign splat' to be converted to a logical op.
32440 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
32443 // vselect Cond, 111..., 000... -> Cond
32444 if (TValIsAllOnes && FValIsAllZeros)
32445 return DAG.getBitcast(VT, Cond);
32447 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
32450 // vselect Cond, 111..., X -> or Cond, X
32451 if (TValIsAllOnes) {
32452 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
32453 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
32454 return DAG.getBitcast(VT, Or);
32457 // vselect Cond, X, 000... -> and Cond, X
32458 if (FValIsAllZeros) {
32459 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
32460 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
32461 return DAG.getBitcast(VT, And);
32464 // vselect Cond, 000..., X -> andn Cond, X
32465 if (TValIsAllZeros) {
32466 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
32467 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
32468 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
32469 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
32470 return DAG.getBitcast(VT, AndN);
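// Summary of the folds above, shown schematically (illustrative only): with a
// sign-splat condition C whose lanes are all-ones or all-zero,
//   vselect C, -1, X  ->  or   C, X
//   vselect C,  X, 0  ->  and  C, X
//   vselect C,  0, X  ->  andn C, X    (X86ISD::ANDNP)
// so no blend instruction is needed at all.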
32476 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
32477 SDValue Cond = N->getOperand(0);
32478 SDValue LHS = N->getOperand(1);
32479 SDValue RHS = N->getOperand(2);
32482 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
32483 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
32484 if (!TrueC || !FalseC)
32487 // Don't do this for crazy integer types.
32488 EVT VT = N->getValueType(0);
32489 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32492 // We're going to use the condition bit in math or logic ops. We could allow
32493 // this with a wider condition value (post-legalization it becomes an i8),
32494 // but if nothing is creating selects that late, it doesn't matter.
32495 if (Cond.getValueType() != MVT::i1)
32498 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
32499 // 3, 5, or 9 with i32/i64, so those get transformed too.
32500 // TODO: For constants that overflow or do not differ by power-of-2 or small
32501 // multiplier, convert to 'and' + 'add'.
32502 const APInt &TrueVal = TrueC->getAPIntValue();
32503 const APInt &FalseVal = FalseC->getAPIntValue();
32505 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
32509 APInt AbsDiff = Diff.abs();
32510 if (AbsDiff.isPowerOf2() ||
32511 ((VT == MVT::i32 || VT == MVT::i64) &&
32512 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
32514 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
32515 // of the condition can usually be folded into a compare predicate, but even
32516 // without that, the sequence should be cheaper than a CMOV alternative.
32517 if (TrueVal.slt(FalseVal)) {
32518 Cond = DAG.getNOT(DL, Cond, MVT::i1);
32519 std::swap(TrueC, FalseC);
32522 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
32523 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
32525 // Multiply condition by the difference if non-one.
32526 if (!AbsDiff.isOneValue())
32527 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
32529 // Add the base if non-zero.
32530 if (!FalseC->isNullValue())
32531 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
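// Worked example (illustrative): select Cond, 7, 4 has AbsDiff == 3, so it
// becomes (zext(Cond) * 3) + 4 (a multiply by 3 plus an add, both LEA-friendly);
// select Cond, 12, 4 has AbsDiff == 8 and becomes (zext(Cond) * 8) + 4, where
// the power-of-2 multiply is later turned into a shift.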
32539 /// If this is a *dynamic* select (non-constant condition) and we can match
32540 /// this node with one of the variable blend instructions, restructure the
32541 /// condition so that blends can use the high (sign) bit of each element.
32542 static SDValue combineVSelectToShrunkBlend(SDNode *N, SelectionDAG &DAG,
32543 TargetLowering::DAGCombinerInfo &DCI,
32544 const X86Subtarget &Subtarget) {
32545 SDValue Cond = N->getOperand(0);
32546 if (N->getOpcode() != ISD::VSELECT ||
32547 ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
32550 // Don't optimize before the condition has been transformed to a legal type
32551 // and don't ever optimize vector selects that map to AVX512 mask-registers.
32552 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32553 if (BitWidth < 8 || BitWidth > 64)
32556 // We can only handle the cases where VSELECT is directly legal on the
32557 // subtarget. We custom lower VSELECT nodes with constant conditions and
32558 // this makes it hard to see whether a dynamic VSELECT will correctly
32559 // lower, so we both check the operation's status and explicitly handle the
32560 // cases where a *dynamic* blend will fail even though a constant-condition
32561 // blend could be custom lowered.
32562 // FIXME: We should find a better way to handle this class of problems.
32563 // Potentially, we should combine constant-condition vselect nodes
32564 // pre-legalization into shuffles and not mark as many types as custom
32566 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32567 EVT VT = N->getValueType(0);
32568 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32570 // FIXME: We don't support i16-element blends currently. We could and
32571 // should support them by making *all* the bits in the condition be set
32572 // rather than just the high bit and using an i8-element blend.
32573 if (VT.getVectorElementType() == MVT::i16)
32575 // Dynamic blending was only available from SSE4.1 onward.
32576 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32578 // Byte blends are only available in AVX2.
32579 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32581 // There are no 512-bit blend instructions that use sign bits.
32582 if (VT.is512BitVector())
32585 // TODO: Add other opcodes eventually lowered into BLEND.
32586 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32588 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0)
32591 APInt DemandedMask(APInt::getSignMask(BitWidth));
32593 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32594 !DCI.isBeforeLegalizeOps());
32595 if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
32598 // If we changed the computation somewhere in the DAG, this change will
32599 // affect all users of Cond. Update all the nodes so that we do not use
32600 // the generic VSELECT anymore. Otherwise, we may perform wrong
32601 // optimizations as we messed with the actual expectation for the vector boolean values.
32603 for (SDNode *U : Cond->uses()) {
32604 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U), U->getValueType(0),
32605 Cond, U->getOperand(1), U->getOperand(2));
32606 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32608 DCI.CommitTargetLoweringOpt(TLO);
32609 return SDValue(N, 0);
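// Illustrative note (an assumption about intent, not an original comment):
// for a v4i32 blend only bit 31 of each condition lane is demanded, so a
// condition such as (sext (setcc ...)) may be simplified to any value with the
// correct sign bits; every VSELECT user is then rewritten to
// X86ISD::SHRUNKBLEND, which reads only those sign bits.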
32612 /// Do target-specific dag combines on SELECT and VSELECT nodes.
32613 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
32614 TargetLowering::DAGCombinerInfo &DCI,
32615 const X86Subtarget &Subtarget) {
32617 SDValue Cond = N->getOperand(0);
32618 // Get the LHS/RHS of the select.
32619 SDValue LHS = N->getOperand(1);
32620 SDValue RHS = N->getOperand(2);
32621 EVT VT = LHS.getValueType();
32622 EVT CondVT = Cond.getValueType();
32623 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32625 // Convert vselects with constant condition into shuffles.
32626 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
32627 DCI.isBeforeLegalizeOps()) {
32628 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
32629 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
32630 SDValue CondElt = Cond->getOperand(i);
32632 // Arbitrarily choose from the 2nd operand if the select condition element is undef or zero.
32634 // TODO: Can we do better by matching patterns such as even/odd?
32635 if (CondElt.isUndef() || isNullConstant(CondElt))
32639 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
32642 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
32643 // instructions match the semantics of the common C idiom x<y?x:y but not
32644 // x<=y?x:y, because of how they handle negative zero (which can be
32645 // ignored in unsafe-math mode).
32646 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
32647 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
32648 VT != MVT::f80 && VT != MVT::f128 &&
32649 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
32650 (Subtarget.hasSSE2() ||
32651 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
32652 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32654 unsigned Opcode = 0;
32655 // Check for x CC y ? x : y.
32656 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32657 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32661 // Converting this to a min would handle NaNs incorrectly, and swapping
32662 // the operands would cause it to handle comparisons between positive
32663 // and negative zero incorrectly.
32664 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32665 if (!DAG.getTarget().Options.UnsafeFPMath &&
32666 !(DAG.isKnownNeverZeroFloat(LHS) ||
32667 DAG.isKnownNeverZeroFloat(RHS)))
32669 std::swap(LHS, RHS);
32671 Opcode = X86ISD::FMIN;
32674 // Converting this to a min would handle comparisons between positive
32675 // and negative zero incorrectly.
32676 if (!DAG.getTarget().Options.UnsafeFPMath &&
32677 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32679 Opcode = X86ISD::FMIN;
32682 // Converting this to a min would handle both negative zeros and NaNs
32683 // incorrectly, but we can swap the operands to fix both.
32684 std::swap(LHS, RHS);
32689 Opcode = X86ISD::FMIN;
32693 // Converting this to a max would handle comparisons between positive
32694 // and negative zero incorrectly.
32695 if (!DAG.getTarget().Options.UnsafeFPMath &&
32696 !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
32698 Opcode = X86ISD::FMAX;
32701 // Converting this to a max would handle NaNs incorrectly, and swapping
32702 // the operands would cause it to handle comparisons between positive
32703 // and negative zero incorrectly.
32704 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32705 if (!DAG.getTarget().Options.UnsafeFPMath &&
32706 !(DAG.isKnownNeverZeroFloat(LHS) ||
32707 DAG.isKnownNeverZeroFloat(RHS)))
32709 std::swap(LHS, RHS);
32711 Opcode = X86ISD::FMAX;
32714 // Converting this to a max would handle both negative zeros and NaNs
32715 // incorrectly, but we can swap the operands to fix both.
32716 std::swap(LHS, RHS);
32721 Opcode = X86ISD::FMAX;
32724 // Check for x CC y ? y : x -- a min/max with reversed arms.
32725 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32726 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32730 // Converting this to a min would handle comparisons between positive
32731 // and negative zero incorrectly, and swapping the operands would
32732 // cause it to handle NaNs incorrectly.
32733 if (!DAG.getTarget().Options.UnsafeFPMath &&
32734 !(DAG.isKnownNeverZeroFloat(LHS) ||
32735 DAG.isKnownNeverZeroFloat(RHS))) {
32736 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32738 std::swap(LHS, RHS);
32740 Opcode = X86ISD::FMIN;
32743 // Converting this to a min would handle NaNs incorrectly.
32744 if (!DAG.getTarget().Options.UnsafeFPMath &&
32745 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32747 Opcode = X86ISD::FMIN;
32750 // Converting this to a min would handle both negative zeros and NaNs
32751 // incorrectly, but we can swap the operands to fix both.
32752 std::swap(LHS, RHS);
32757 Opcode = X86ISD::FMIN;
32761 // Converting this to a max would handle NaNs incorrectly.
32762 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32764 Opcode = X86ISD::FMAX;
32767 // Converting this to a max would handle comparisons between positive
32768 // and negative zero incorrectly, and swapping the operands would
32769 // cause it to handle NaNs incorrectly.
32770 if (!DAG.getTarget().Options.UnsafeFPMath &&
32771 !DAG.isKnownNeverZeroFloat(LHS) &&
32772 !DAG.isKnownNeverZeroFloat(RHS)) {
32773 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32775 std::swap(LHS, RHS);
32777 Opcode = X86ISD::FMAX;
32780 // Converting this to a max would handle both negative zeros and NaNs
32781 // incorrectly, but we can swap the operands to fix both.
32782 std::swap(LHS, RHS);
32787 Opcode = X86ISD::FMAX;
32793 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
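// Background note (illustrative, summarizing the checks above): MINSS/MAXSS
// and their packed forms return the second (source) operand when the inputs
// are unordered (NaN) or when comparing +0.0 with -0.0, which is why FMIN/FMAX
// is only formed (possibly after swapping LHS/RHS) when NaNs and signed zeros
// provably cannot change the result, or when unsafe-math allows ignoring them.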
32796 // Some mask scalar intrinsics rely on checking if only one bit is set
32797 // and implement it in C code like this:
32798 // A[0] = (U & 1) ? A[0] : W[0];
32799 // This creates some redundant instructions that break pattern matching.
32800 // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y)
32801 if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
32802 Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
32803 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32804 SDValue AndNode = Cond.getOperand(0);
32805 if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
32806 isNullConstant(Cond.getOperand(1)) &&
32807 isOneConstant(AndNode.getOperand(1))) {
32808 // LHS and RHS swapped due to
32809 // setcc outputting 1 when AND resulted in 0 and vice versa.
32810 AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8);
32811 return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
32815 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32816 // lowering on KNL. In this case we convert it to
32817 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX blend instruction.
32818 // The same applies to all vectors of i8 and i16 elements without BWI.
32819 // Make sure we extend these even before type legalization gets a chance to
32820 // split wide vectors.
32821 // On SKX and later, these selects have a proper lowering.
32822 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32823 CondVT.getVectorElementType() == MVT::i1 &&
32824 VT.getVectorNumElements() > 4 &&
32825 (VT.getVectorElementType() == MVT::i8 ||
32826 VT.getVectorElementType() == MVT::i16)) {
32827 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32828 DCI.AddToWorklist(Cond.getNode());
32829 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32832 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32835 // Canonicalize max and min:
32836 // (x > y) ? x : y -> (x >= y) ? x : y
32837 // (x < y) ? x : y -> (x <= y) ? x : y
32838 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32839 // the need for an extra compare
32840 // against zero. e.g.
32841 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
32843 // testl %edi, %edi
32845 // cmovgl %edi, %eax
32849 // cmovsl %eax, %edi
32850 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32851 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32852 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32853 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32858 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32859 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32860 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32861 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32866 // Early exit check
32867 if (!TLI.isTypeLegal(VT))
32870 // Match VSELECTs into subs with unsigned saturation.
32871 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32872 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32873 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32874 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32875 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32877 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32878 // left side, invert the predicate to simplify the logic below.
32880 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32882 CC = ISD::getSetCCInverse(CC, true);
32883 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
32887 if (Other.getNode() && Other->getNumOperands() == 2 &&
32888 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32889 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32890 SDValue CondRHS = Cond->getOperand(1);
32892 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32893 ArrayRef<SDValue> Ops) {
32894 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32897 // Look for a general sub with unsigned saturation first.
32898 // x >= y ? x-y : 0 --> subus x, y
32899 // x > y ? x-y : 0 --> subus x, y
32900 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32901 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32902 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32905 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32906 if (isa<BuildVectorSDNode>(CondRHS)) {
32907 // If the RHS is a constant we have to reverse the const
32908 // canonicalization.
32909 // x > C-1 ? x+-C : 0 --> subus x, C
32910 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32911 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32913 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32914 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32915 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32916 DAG.getConstant(0, DL, VT), OpRHS);
32917 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32921 // Another special case: If C was a sign bit, the sub has been
32922 // canonicalized into a xor.
32923 // FIXME: Would it be better to use computeKnownBits to determine
32924 // whether it's safe to decanonicalize the xor?
32925 // x s< 0 ? x^C : 0 --> subus x, C
32926 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32927 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32928 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32929 OpRHSConst->getAPIntValue().isSignMask()) {
32930 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32931 // Note that we have to rebuild the RHS constant here to ensure we
32932 // don't rely on particular values of undef lanes.
32933 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32940 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32943 if (SDValue V = combineVSelectToShrunkBlend(N, DAG, DCI, Subtarget))
32946 // Custom action for SELECT MMX
32947 if (VT == MVT::x86mmx) {
32948 LHS = DAG.getBitcast(MVT::i64, LHS);
32949 RHS = DAG.getBitcast(MVT::i64, RHS);
32950 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32951 return DAG.getBitcast(VT, newSelect);
32958 /// Combine (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
32960 /// into (brcond/cmov/setcc .., (LADD x, 1), COND_LE),
32961 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32962 /// Note that this is only legal for some op/cc combinations.
32963 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32965 const X86Subtarget &Subtarget) {
32966 // This combine only operates on CMP-like nodes.
32967 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32968 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32971 // Can't replace the cmp if it has more uses than the one we're looking at.
32972 // FIXME: We would like to be able to handle this, but would need to make sure
32973 // all uses were updated.
32974 if (!Cmp.hasOneUse())
32977 // This only applies to variations of the common case:
32978 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32979 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32980 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32981 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32982 // Using the proper condcodes (see below), overflow is checked for.
32984 // FIXME: We can generalize both constraints:
32985 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32987 // if the result is compared.
32989 SDValue CmpLHS = Cmp.getOperand(0);
32990 SDValue CmpRHS = Cmp.getOperand(1);
32992 if (!CmpLHS.hasOneUse())
32995 unsigned Opc = CmpLHS.getOpcode();
32996 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32999 SDValue OpRHS = CmpLHS.getOperand(2);
33000 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
33004 APInt Addend = OpRHSC->getAPIntValue();
33005 if (Opc == ISD::ATOMIC_LOAD_SUB)
33008 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
33012 APInt Comparison = CmpRHSC->getAPIntValue();
33014 // If the addend is the negation of the comparison value, then we can do
33015 // a full comparison by emitting the atomic arithmetic as a locked sub.
33016 if (Comparison == -Addend) {
33017 // The CC is fine, but we need to rewrite the LHS of the comparison as an atomic sub.
33019 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
33020 auto AtomicSub = DAG.getAtomic(
33021 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
33022 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
33023 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
33024 AN->getMemOperand());
33025 // If the comparison uses the CF flag we can't use INC/DEC instructions.
33026 bool NeedCF = false;
33029 case X86::COND_A: case X86::COND_AE:
33030 case X86::COND_B: case X86::COND_BE:
33034 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
33035 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33036 DAG.getUNDEF(CmpLHS.getValueType()));
33037 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
33041 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
33043 if (!Comparison.isNullValue())
33046 if (CC == X86::COND_S && Addend == 1)
33048 else if (CC == X86::COND_NS && Addend == 1)
33050 else if (CC == X86::COND_G && Addend == -1)
33052 else if (CC == X86::COND_LE && Addend == -1)
33057 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
33058 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
33059 DAG.getUNDEF(CmpLHS.getValueType()));
33060 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
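// Illustrative example (a sketch, not from the source): for IR like
//   %old = atomicrmw add i32* %p, i32 1
//   %neg = icmp slt i32 %old, 0            ; CC == COND_S
// the EFLAGS produced by "lock incl (%p)" can answer the same question via
// COND_LE (the signed condition codes account for overflow, per the comment
// above), so the separate compare is dropped and CC is rewritten accordingly.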
33064 // Check whether a boolean test is testing a boolean value generated by
33065 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition flag.
33068 // Simplify the following patterns:
33069 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
33070 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
33071 // to (Op EFLAGS Cond)
33073 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
33074 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
33075 // to (Op EFLAGS !Cond)
33077 // where Op could be BRCOND or CMOV.
33079 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
33080 // This combine only operates on CMP-like nodes.
33081 if (!(Cmp.getOpcode() == X86ISD::CMP ||
33082 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
33085 // Quit if not used as a boolean value.
33086 if (CC != X86::COND_E && CC != X86::COND_NE)
33089 // Check CMP operands. One of them should be 0 or 1 and the other should be
33090 // a SETCC or extended from it.
33091 SDValue Op1 = Cmp.getOperand(0);
33092 SDValue Op2 = Cmp.getOperand(1);
33095 const ConstantSDNode* C = nullptr;
33096 bool needOppositeCond = (CC == X86::COND_E);
33097 bool checkAgainstTrue = false; // Is it a comparison against 1?
33099 if ((C = dyn_cast<ConstantSDNode>(Op1)))
33101 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
33103 else // Quit if neither operand is a constant.
33106 if (C->getZExtValue() == 1) {
33107 needOppositeCond = !needOppositeCond;
33108 checkAgainstTrue = true;
33109 } else if (C->getZExtValue() != 0)
33110 // Quit if the constant is neither 0 nor 1.
33113 bool truncatedToBoolWithAnd = false;
33114 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
33115 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
33116 SetCC.getOpcode() == ISD::TRUNCATE ||
33117 SetCC.getOpcode() == ISD::AND) {
33118 if (SetCC.getOpcode() == ISD::AND) {
33120 if (isOneConstant(SetCC.getOperand(0)))
33122 if (isOneConstant(SetCC.getOperand(1)))
33126 SetCC = SetCC.getOperand(OpIdx);
33127 truncatedToBoolWithAnd = true;
33129 SetCC = SetCC.getOperand(0);
33132 switch (SetCC.getOpcode()) {
33133 case X86ISD::SETCC_CARRY:
33134 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
33135 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
33136 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
33137 // truncated to i1 using 'and'.
33138 if (checkAgainstTrue && !truncatedToBoolWithAnd)
33140 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
33141 "Invalid use of SETCC_CARRY!");
33143 case X86ISD::SETCC:
33144 // Set the condition code or opposite one if necessary.
33145 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
33146 if (needOppositeCond)
33147 CC = X86::GetOppositeBranchCondition(CC);
33148 return SetCC.getOperand(1);
33149 case X86ISD::CMOV: {
33150 // Check whether the false/true values are canonical, i.e. 0 or 1.
33151 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
33152 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
33153 // Quit if true value is not a constant.
33156 // Quit if false value is not a constant.
33158 SDValue Op = SetCC.getOperand(0);
33159 // Skip 'zext' or 'trunc' node.
33160 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
33161 Op.getOpcode() == ISD::TRUNCATE)
33162 Op = Op.getOperand(0);
33163 // A special case for rdrand/rdseed, where 0 is set if false cond is found.
33165 if ((Op.getOpcode() != X86ISD::RDRAND &&
33166 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
33169 // Quit if false value is not the constant 0 or 1.
33170 bool FValIsFalse = true;
33171 if (FVal && FVal->getZExtValue() != 0) {
33172 if (FVal->getZExtValue() != 1)
33174 // If FVal is 1, opposite cond is needed.
33175 needOppositeCond = !needOppositeCond;
33176 FValIsFalse = false;
33178 // Quit if TVal is not the constant opposite of FVal.
33179 if (FValIsFalse && TVal->getZExtValue() != 1)
33181 if (!FValIsFalse && TVal->getZExtValue() != 0)
33183 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
33184 if (needOppositeCond)
33185 CC = X86::GetOppositeBranchCondition(CC);
33186 return SetCC.getOperand(3);
33193 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
33195 /// (X86or (X86setcc) (X86setcc))
33196 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
33197 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
33198 X86::CondCode &CC1, SDValue &Flags,
33200 if (Cond->getOpcode() == X86ISD::CMP) {
33201 if (!isNullConstant(Cond->getOperand(1)))
33204 Cond = Cond->getOperand(0);
33209 SDValue SetCC0, SetCC1;
33210 switch (Cond->getOpcode()) {
33211 default: return false;
33218 SetCC0 = Cond->getOperand(0);
33219 SetCC1 = Cond->getOperand(1);
33223 // Make sure we have SETCC nodes, using the same flags value.
33224 if (SetCC0.getOpcode() != X86ISD::SETCC ||
33225 SetCC1.getOpcode() != X86ISD::SETCC ||
33226 SetCC0->getOperand(1) != SetCC1->getOperand(1))
33229 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
33230 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
33231 Flags = SetCC0->getOperand(1);
33235 // When legalizing carry, we create carries via add X, -1
33236 // If that comes from an actual carry, via setcc, we use the carry directly.
33238 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
33239 if (EFLAGS.getOpcode() == X86ISD::ADD) {
33240 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
33241 SDValue Carry = EFLAGS.getOperand(0);
33242 while (Carry.getOpcode() == ISD::TRUNCATE ||
33243 Carry.getOpcode() == ISD::ZERO_EXTEND ||
33244 Carry.getOpcode() == ISD::SIGN_EXTEND ||
33245 Carry.getOpcode() == ISD::ANY_EXTEND ||
33246 (Carry.getOpcode() == ISD::AND &&
33247 isOneConstant(Carry.getOperand(1))))
33248 Carry = Carry.getOperand(0);
33249 if (Carry.getOpcode() == X86ISD::SETCC ||
33250 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
33251 if (Carry.getConstantOperandVal(0) == X86::COND_B)
33252 return Carry.getOperand(1);
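// Illustrative sketch (not from the original comments): a carry materialized
// as
//   %c = (X86ISD::SETCC COND_B, Flags)   ; CF of an earlier operation
//   %x = (X86ISD::ADD %c, -1)            ; recreate CF for a later ADC/SBB
// can feed Flags to the consumer directly, provided %c only reached the add
// through zext/sext/trunc/any_extend or an 'and' with 1.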
33260 /// Optimize an EFLAGS definition used according to the condition code \p CC
33261 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
33262 /// uses of chain values.
33263 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
33265 const X86Subtarget &Subtarget) {
33266 if (CC == X86::COND_B)
33267 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
33270 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
33272 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
33275 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
33276 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
33277 TargetLowering::DAGCombinerInfo &DCI,
33278 const X86Subtarget &Subtarget) {
33281 SDValue FalseOp = N->getOperand(0);
33282 SDValue TrueOp = N->getOperand(1);
33283 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
33284 SDValue Cond = N->getOperand(3);
33286 // Try to simplify the EFLAGS and condition code operands.
33287 // We can't always do this as FCMOV only supports a subset of the X86 condition codes.
33288 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
33289 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
33290 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
33292 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33296 // If this is a select between two integer constants, try to do some
33297 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
33299 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
33300 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
33301 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
33302 // larger than FalseC (the false value).
33303 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
33304 CC = X86::GetOppositeBranchCondition(CC);
33305 std::swap(TrueC, FalseC);
33306 std::swap(TrueOp, FalseOp);
33309 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
33310 // This is efficient for any integer data type (including i8/i16) and shift amount.
33312 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
33313 Cond = getSETCC(CC, Cond, DL, DAG);
33315 // Zero extend the condition if needed.
33316 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
33318 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
33319 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
33320 DAG.getConstant(ShAmt, DL, MVT::i8));
33324 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
33325 // for any integer data type, including i8/i16.
33326 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
33327 Cond = getSETCC(CC, Cond, DL, DAG);
33329 // Zero extend the condition if needed.
33330 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
33331 FalseC->getValueType(0), Cond);
33332 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33333 SDValue(FalseC, 0));
33337 // Optimize cases that will turn into an LEA instruction. This requires
33338 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
33339 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
33340 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
33341 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
33343 bool isFastMultiplier = false;
33345 switch ((unsigned char)Diff) {
33347 case 1: // result = add base, cond
33348 case 2: // result = lea base( , cond*2)
33349 case 3: // result = lea base(cond, cond*2)
33350 case 4: // result = lea base( , cond*4)
33351 case 5: // result = lea base(cond, cond*4)
33352 case 8: // result = lea base( , cond*8)
33353 case 9: // result = lea base(cond, cond*8)
33354 isFastMultiplier = true;
33359 if (isFastMultiplier) {
33360 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
33361 Cond = getSETCC(CC, Cond, DL ,DAG);
33362 // Zero extend the condition if needed.
33363 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
33365 // Scale the condition by the difference.
33367 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
33368 DAG.getConstant(Diff, DL, Cond.getValueType()));
33370 // Add the base if non-zero.
33371 if (FalseC->getAPIntValue() != 0)
33372 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
33373 SDValue(FalseC, 0));
33380 // Handle these cases:
33381 // (select (x != c), e, c) -> (select (x != c), e, x),
33382 // (select (x == c), c, e) -> (select (x == c), x, e)
33383 // where the c is an integer constant, and the "select" is the combination
33384 // of CMOV and CMP.
33386 // The rationale for this change is that the conditional-move from a constant
33387 // needs two instructions, however, conditional-move from a register needs
33388 // only one instruction.
33390 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
33391 // some instruction-combining opportunities. This opt needs to be
33392 // postponed as late as possible.
33394 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
33395 // The DCI.xxxx conditions are provided to postpone the optimization as
33396 // late as possible.
33398 ConstantSDNode *CmpAgainst = nullptr;
33399 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
33400 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
33401 !isa<ConstantSDNode>(Cond.getOperand(0))) {
33403 if (CC == X86::COND_NE &&
33404 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
33405 CC = X86::GetOppositeBranchCondition(CC);
33406 std::swap(TrueOp, FalseOp);
33409 if (CC == X86::COND_E &&
33410 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
33411 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
33412 DAG.getConstant(CC, DL, MVT::i8), Cond };
33413 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33418 // Fold and/or of setcc's to double CMOV:
33419 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
33420 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
33422 // This combine lets us generate:
33423 // cmovcc1 (jcc1 if we don't have CMOV)
33429 // cmovne (jne if we don't have CMOV)
33430 // When we can't use the CMOV instruction, it might increase branch mispredicts.
33432 // When we can use CMOV, or when there is no mispredict, this improves
33433 // throughput and reduces register pressure.
33435 if (CC == X86::COND_NE) {
33437 X86::CondCode CC0, CC1;
33439 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
33441 std::swap(FalseOp, TrueOp);
33442 CC0 = X86::GetOppositeBranchCondition(CC0);
33443 CC1 = X86::GetOppositeBranchCondition(CC1);
33446 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
33448 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
33449 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
33450 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
33458 /// Different mul shrinking modes.
33459 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
33461 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
33462 EVT VT = N->getOperand(0).getValueType();
33463 if (VT.getScalarSizeInBits() != 32)
33466 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
33467 unsigned SignBits[2] = {1, 1};
33468 bool IsPositive[2] = {false, false};
33469 for (unsigned i = 0; i < 2; i++) {
33470 SDValue Opd = N->getOperand(i);
33472 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
33473 // compute signbits for it separately.
33474 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
33475 // For anyextend, it is safe to assume an appropriate number of leading
33477 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
33479 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
33484 IsPositive[i] = true;
33485 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
33486 // All the operands of BUILD_VECTOR need to be integer constants.
33487 // Find the smallest value range which all the operands belong to.
33489 IsPositive[i] = true;
33490 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
33491 if (SubOp.isUndef())
33493 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
33496 APInt IntVal = CN->getAPIntValue();
33497 if (IntVal.isNegative())
33498 IsPositive[i] = false;
33499 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
33502 SignBits[i] = DAG.ComputeNumSignBits(Opd);
33503 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
33504 IsPositive[i] = true;
33508 bool AllPositive = IsPositive[0] && IsPositive[1];
33509 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
33510 // When ranges are from -128 ~ 127, use MULS8 mode.
33511 if (MinSignBits >= 25)
33513 // When ranges are from 0 ~ 255, use MULU8 mode.
33514 else if (AllPositive && MinSignBits >= 24)
33516 // When ranges are from -32768 ~ 32767, use MULS16 mode.
33517 else if (MinSignBits >= 17)
33519 // When ranges are from 0 ~ 65535, use MULU16 mode.
33520 else if (AllPositive && MinSignBits >= 16)
33527 /// When the operands of a vector mul are extended from smaller-sized values,
33528 /// like i8 and i16, the type of the mul may be shrunk to generate more
33529 /// efficient code. Two typical patterns are handled:
33531 /// %2 = sext/zext <N x i8> %1 to <N x i32>
33532 /// %4 = sext/zext <N x i8> %3 to <N x i32>
33533 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33534 /// %5 = mul <N x i32> %2, %4
33537 /// %2 = zext/sext <N x i16> %1 to <N x i32>
33538 /// %4 = zext/sext <N x i16> %3 to <N x i32>
33539 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
33540 /// %5 = mul <N x i32> %2, %4
33542 /// There are four mul shrinking modes:
33543 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
33544 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
33545 /// generate pmullw+sext32 for it (MULS8 mode).
33546 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
33547 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
33548 /// generate pmullw+zext32 for it (MULU8 mode).
33549 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
33550 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
33551 /// generate pmullw+pmulhw for it (MULS16 mode).
33552 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
33553 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
33554 /// generate pmullw+pmulhuw for it (MULU16 mode).
33555 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
33556 const X86Subtarget &Subtarget) {
33557 // Check for legality
33558 // pmullw/pmulhw require at least SSE2.
33559 if (!Subtarget.hasSSE2())
33562 // Check for profitability
33563 // pmulld is supported since SSE41. It is better to use pmulld
33564 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than the expansion.
33566 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
33567 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
33571 if (!canReduceVMulWidth(N, DAG, Mode))
33575 SDValue N0 = N->getOperand(0);
33576 SDValue N1 = N->getOperand(1);
33577 EVT VT = N->getOperand(0).getValueType();
33578 unsigned NumElts = VT.getVectorNumElements();
33579 if ((NumElts % 2) != 0)
33582 unsigned RegSize = 128;
33583 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
33584 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
33586 // Shrink the operands of mul.
33587 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
33588 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
33590 if (NumElts >= OpsVT.getVectorNumElements()) {
33591 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
33592 // lower part is needed.
33593 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
33594 if (Mode == MULU8 || Mode == MULS8) {
33595 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
33598 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
33599 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
33600 // the higher part is also needed.
33601 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33602 ReducedVT, NewN0, NewN1);
33604 // Repack the lower and higher halves of the mul result into a wider result.
33606 // Generate shuffle functioning as punpcklwd.
33607 SmallVector<int, 16> ShuffleMask(NumElts);
33608 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33609 ShuffleMask[2 * i] = i;
33610 ShuffleMask[2 * i + 1] = i + NumElts;
33613 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33614 ResLo = DAG.getBitcast(ResVT, ResLo);
33615 // Generate shuffle functioning as punpckhwd.
33616 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33617 ShuffleMask[2 * i] = i + NumElts / 2;
33618 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33621 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33622 ResHi = DAG.getBitcast(ResVT, ResHi);
33623 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
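// For example (illustrative): in MULU16 mode with v8i32 operands, pmullw
// yields the low 16 bits of each 16x16 product and pmulhuw the high 16 bits;
// interleaving the two v8i16 results with punpcklwd/punpckhwd reassembles the
// full 32-bit products as two v4i32 halves, which are concatenated back to
// v8i32 above.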
33626 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
33627 // to legalize the mul explicitly because implicit legalization for type
33628 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
33629 // instructions which will not exist when we explicitly legalize it by
33630 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
33631 // <4 x i16> undef).
33633 // Legalize the operands of mul.
33634 // FIXME: We may be able to handle non-concatenated vectors by insertion.
33635 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
33636 if ((RegSize % ReducedSizeInBits) != 0)
33639 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
33640 DAG.getUNDEF(ReducedVT));
33642 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33644 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33646 if (Mode == MULU8 || Mode == MULS8) {
33647 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
33649 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33651 // convert the type of mul result to VT.
33652 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33653 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
33654 : ISD::SIGN_EXTEND_VECTOR_INREG,
33656 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33657 DAG.getIntPtrConstant(0, DL));
33659 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
33660 // MULU16/MULS16, both parts are needed.
33661 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33662 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33663 OpsVT, NewN0, NewN1);
33665 // Repack the lower part and higher part result of mul into a wider
33666 // result. Make sure the type of mul result is VT.
33667 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33668 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
33669 Res = DAG.getBitcast(ResVT, Res);
33670 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33671 DAG.getIntPtrConstant(0, DL));
33676 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
33677 EVT VT, const SDLoc &DL) {
33679 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
33680 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33681 DAG.getConstant(Mult, DL, VT));
33682 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
33683 DAG.getConstant(Shift, DL, MVT::i8));
33684 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33689 auto combineMulMulAddOrSub = [&](bool isAdd) {
33690 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33691 DAG.getConstant(9, DL, VT));
33692 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
33693 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33702 // mul x, 11 => add ((shl (mul x, 5), 1), x)
33703 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
33705 // mul x, 21 => add ((shl (mul x, 5), 2), x)
33706 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
33708 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
33709 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33710 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
33712 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
33713 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
33715 // mul x, 13 => add ((shl (mul x, 3), 2), x)
33716 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
33718 // mul x, 13 => sub ((shl (mul x, 3), 3), x)
33719 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
33721 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
33722 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33723 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
33725 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
33726 return combineMulMulAddOrSub(/*isAdd*/ false);
33728 // mul x, 28 => add ((mul (mul x, 9), 3), x)
33729 return combineMulMulAddOrSub(/*isAdd*/ true);
33731 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
33732 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33733 combineMulMulAddOrSub(/*isAdd*/ true));
33735 // mul x, 30 => sub (sub ((shl x, 5), x), x)
33736 return DAG.getNode(
33738 DAG.getNode(ISD::SUB, DL, VT,
33739 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33740 DAG.getConstant(5, DL, MVT::i8)),
33747 // If the upper 17 bits of each element are zero then we can use PMADDWD,
33748 // which is always at least as quick as PMULLD, except on KNL.
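// With the upper 17 bits of every i32 element zero, bitcasting each operand to
// vXi16 leaves the high i16 half of each lane zero and the low half
// non-negative as a signed i16, so PMADDWD computes lo0*lo1 + 0*0 per lane,
// which is exactly the i32 product (no overflow since both inputs are < 2^15).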
33749 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
33750 const X86Subtarget &Subtarget) {
33751 if (!Subtarget.hasSSE2())
33754 if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
33757 EVT VT = N->getValueType(0);
33759 // Only support vXi32 vectors.
33760 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
33763 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
33764 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
33765 if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
33768 SDValue N0 = N->getOperand(0);
33769 SDValue N1 = N->getOperand(1);
33770 APInt Mask17 = APInt::getHighBitsSet(32, 17);
33771 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
33772 !DAG.MaskedValueIsZero(N0, Mask17))
33775 // Use SplitOpsAndApply to handle AVX splitting.
33776 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33777 ArrayRef<SDValue> Ops) {
33778 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
33779 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
33781 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
33782 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
33786 static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG,
33787 const X86Subtarget &Subtarget) {
33788 if (!Subtarget.hasSSE2())
33791 EVT VT = N->getValueType(0);
33793 // Only support vXi64 vectors.
33794 if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 ||
33795 !DAG.getTargetLoweringInfo().isTypeLegal(VT))
33798 SDValue N0 = N->getOperand(0);
33799 SDValue N1 = N->getOperand(1);
33801 // PMULDQ returns the 64-bit result of the signed multiplication of the lower
33802 // 32-bits. We can lower with this if the sign bits stretch that far.
33803 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 &&
33804 DAG.ComputeNumSignBits(N1) > 32) {
33805 auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33806 ArrayRef<SDValue> Ops) {
33807 return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops);
33809 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33810 PMULDQBuilder, /*CheckBWI*/false);
33813 // If the upper bits are zero we can use a single pmuludq.
33814 APInt Mask = APInt::getHighBitsSet(64, 32);
33815 if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) {
33816 auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33817 ArrayRef<SDValue> Ops) {
33818 return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops);
33820 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 },
33821 PMULUDQBuilder, /*CheckBWI*/false);
33827 /// Optimize a single multiply with constant into two operations in order to
33828 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
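/// For example, mul x, 40 becomes a shift left by 3 combined with an LEA that
/// computes r + 4*r (a multiply by 5); the two steps may be emitted in either
/// order depending on how the result is used.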
33829 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
33830 TargetLowering::DAGCombinerInfo &DCI,
33831 const X86Subtarget &Subtarget) {
33832 EVT VT = N->getValueType(0);
33834 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
33837 if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget))
33840 if (DCI.isBeforeLegalize() && VT.isVector())
33841 return reduceVMULWidth(N, DAG, Subtarget);
33843 if (!MulConstantOptimization)
33845 // An imul is usually smaller than the alternative sequence.
33846 if (DAG.getMachineFunction().getFunction().optForMinSize())
33849 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
33852 if (VT != MVT::i64 && VT != MVT::i32)
33855 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
33858 uint64_t MulAmt = C->getZExtValue();
33859 if (isPowerOf2_64(MulAmt))
33863 if (MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
33864 return DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33867 uint64_t MulAmt1 = 0;
33868 uint64_t MulAmt2 = 0;
33869 if ((MulAmt % 9) == 0) {
33871 MulAmt2 = MulAmt / 9;
33872 } else if ((MulAmt % 5) == 0) {
33874 MulAmt2 = MulAmt / 5;
33875 } else if ((MulAmt % 3) == 0) {
33877 MulAmt2 = MulAmt / 3;
33882 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
33884 if (isPowerOf2_64(MulAmt2) &&
33885 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
33886 // If the second multiplier is a power of 2, issue it first. We want the multiply
33887 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
33889 std::swap(MulAmt1, MulAmt2);
33891 if (isPowerOf2_64(MulAmt1))
33892 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33893 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
33895 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33896 DAG.getConstant(MulAmt1, DL, VT));
33898 if (isPowerOf2_64(MulAmt2))
33899 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
33900 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
33902 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
33903 DAG.getConstant(MulAmt2, DL, VT));
33904 } else if (!Subtarget.slowLEA())
33905 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
33908 assert(MulAmt != 0 &&
33909 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
33910 "Both cases that could cause potential overflows should have "
33911 "already been handled.");
33912 int64_t SignMulAmt = C->getSExtValue();
33913 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
33914 (SignMulAmt != -INT64_MAX)) {
33915 int NumSign = SignMulAmt > 0 ? 1 : -1;
33916 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
33917 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
33918 if (IsPowerOf2_64PlusOne) {
33919 // (mul x, 2^N + 1) => (add (shl x, N), x)
33920 NewMul = DAG.getNode(
33921 ISD::ADD, DL, VT, N->getOperand(0),
33922 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33923 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
33925 } else if (IsPowerOf2_64MinusOne) {
33926 // (mul x, 2^N - 1) => (sub (shl x, N), x)
33927 NewMul = DAG.getNode(
33929 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33930 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
33934 // To negate, subtract the number from zero
33935 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
33937 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
33942 // Do not add new nodes to DAG combiner worklist.
33943 DCI.CombineTo(N, NewMul, false);
33948 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
33949 SDValue N0 = N->getOperand(0);
33950 SDValue N1 = N->getOperand(1);
33951 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
33952 EVT VT = N0.getValueType();
33954 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
33955 // since the result of setcc_c is all zero's or all ones.
33956 if (VT.isInteger() && !VT.isVector() &&
33957 N1C && N0.getOpcode() == ISD::AND &&
33958 N0.getOperand(1).getOpcode() == ISD::Constant) {
33959 SDValue N00 = N0.getOperand(0);
33960 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
33961 Mask <<= N1C->getAPIntValue();
33962 bool MaskOK = false;
33963 // We can handle cases concerning bit-widening nodes containing setcc_c if
33964 // we carefully interrogate the mask to make sure we are semantics preserving.
33966 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
33967 // of the underlying setcc_c operation if the setcc_c was zero extended.
33968 // Consider the following example:
33969 // zext(setcc_c) -> i32 0x0000FFFF
33970 // c1 -> i32 0x0000FFFF
33971 // c2 -> i32 0x00000001
33972 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
33973 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
33974 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33976 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
33977 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33979 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
33980 N00.getOpcode() == ISD::ANY_EXTEND) &&
33981 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33982 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
33984 if (MaskOK && Mask != 0) {
33986 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
33990 // Hardware support for vector shifts is sparse which makes us scalarize the
33991 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL:
33993 // (shl V, 1) -> add V,V
33994 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
33995 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
33996 assert(N0.getValueType().isVector() && "Invalid vector shift type");
33997 // We shift all of the values by one. In many cases we do not have
33998 // hardware support for this operation. This is better expressed as an ADD
34000 if (N1SplatC->getAPIntValue() == 1)
34001 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
34007 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
34008 SDValue N0 = N->getOperand(0);
34009 SDValue N1 = N->getOperand(1);
34010 EVT VT = N0.getValueType();
34011 unsigned Size = VT.getSizeInBits();
34013 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
34014 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
34015 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
34016 // depending on sign of (SarConst - [56,48,32,24,16])
34018 // sexts in X86 are MOVs. The MOVs have the same code size
34019 // as the above SHIFTs (only a SHIFT by 1 has smaller code size).
34020 // However, the MOVs have two advantages over a SHIFT:
34021 // 1. MOVs can write to a register that differs from source
34022 // 2. MOVs accept memory operands
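// For example, on i64: (sra (shl X, 56), 58) -> (sra (sext_inreg X, i8), 2).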
34024 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
34025 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
34026 N0.getOperand(1).getOpcode() != ISD::Constant)
34029 SDValue N00 = N0.getOperand(0);
34030 SDValue N01 = N0.getOperand(1);
34031 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
34032 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
34033 EVT CVT = N1.getValueType();
34035 if (SarConst.isNegative())
34038 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
34039 unsigned ShiftSize = SVT.getSizeInBits();
34040 // Skip types without a corresponding sext/zext and ShlConst values
34041 // that are not one of [56,48,32,24,16].
34042 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
34046 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
34047 SarConst = SarConst - (Size - ShiftSize);
34050 else if (SarConst.isNegative())
34051 return DAG.getNode(ISD::SHL, DL, VT, NN,
34052 DAG.getConstant(-SarConst, DL, CVT));
34054 return DAG.getNode(ISD::SRA, DL, VT, NN,
34055 DAG.getConstant(SarConst, DL, CVT));
34060 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
34061 TargetLowering::DAGCombinerInfo &DCI) {
34062 SDValue N0 = N->getOperand(0);
34063 SDValue N1 = N->getOperand(1);
34064 EVT VT = N0.getValueType();
34066 // Only do this on the last DAG combine as it can interfere with other
34068 if (!DCI.isAfterLegalizeDAG())
34071 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
34072 // TODO: This is a generic DAG combine that became an x86-only combine to
34073 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
34074 // and-not ('andn').
34075 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
34078 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
34079 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
34080 if (!ShiftC || !AndC)
34083 // If we can shrink the constant mask below 8-bits or 32-bits, then this
34084 // transform should reduce code size. It may also enable secondary transforms
34085 // from improved known-bits analysis or instruction selection.
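// For example: srl (and X, 0x7F00), 8 --> and (srl X, 8), 0x7F, where the new
// mask now fits in a sign-extended 8-bit immediate.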
34086 APInt MaskVal = AndC->getAPIntValue();
34088 // If this can be matched by a zero extend, don't optimize.
34089 if (MaskVal.isMask()) {
34090 unsigned TO = MaskVal.countTrailingOnes();
34091 if (TO >= 8 && isPowerOf2_32(TO))
34095 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
34096 unsigned OldMaskSize = MaskVal.getMinSignedBits();
34097 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
34098 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
34099 (OldMaskSize > 32 && NewMaskSize <= 32)) {
34100 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
34102 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
34103 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
34104 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
34109 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
34110 TargetLowering::DAGCombinerInfo &DCI,
34111 const X86Subtarget &Subtarget) {
34112 if (N->getOpcode() == ISD::SHL)
34113 if (SDValue V = combineShiftLeft(N, DAG))
34116 if (N->getOpcode() == ISD::SRA)
34117 if (SDValue V = combineShiftRightArithmetic(N, DAG))
34120 if (N->getOpcode() == ISD::SRL)
34121 if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
34127 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
34128 TargetLowering::DAGCombinerInfo &DCI,
34129 const X86Subtarget &Subtarget) {
34130 unsigned Opcode = N->getOpcode();
34131 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
34132 "Unexpected shift opcode");
34134 EVT VT = N->getValueType(0);
34135 SDValue N0 = N->getOperand(0);
34136 SDValue N1 = N->getOperand(1);
34137 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
34138 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
34139 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
34140 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
34141 "Unexpected PACKSS/PACKUS input type");
34143 // Constant Folding.
34144 APInt UndefElts0, UndefElts1;
34145 SmallVector<APInt, 32> EltBits0, EltBits1;
34146 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
34147 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
34148 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
34149 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
34150 unsigned NumLanes = VT.getSizeInBits() / 128;
34151 unsigned NumDstElts = VT.getVectorNumElements();
34152 unsigned NumSrcElts = NumDstElts / 2;
34153 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
34154 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
34155 bool IsSigned = (X86ISD::PACKSS == Opcode);
34157 APInt Undefs(NumDstElts, 0);
34158 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
34159 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
34160 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
34161 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
34162 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
34163 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
34165 if (UndefElts[SrcIdx]) {
34166 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
34170 APInt &Val = EltBits[SrcIdx];
34172 // PACKSS: Truncate signed value with signed saturation.
34173 // Source values less than dst minint are saturated to minint.
34174 // Source values greater than dst maxint are saturated to maxint.
34175 if (Val.isSignedIntN(DstBitsPerElt))
34176 Val = Val.trunc(DstBitsPerElt);
34177 else if (Val.isNegative())
34178 Val = APInt::getSignedMinValue(DstBitsPerElt);
34180 Val = APInt::getSignedMaxValue(DstBitsPerElt);
34182 // PACKUS: Truncate signed value with unsigned saturation.
34183 // Source values less than zero are saturated to zero.
34184 // Source values greater than dst maxuint are saturated to maxuint.
34185 if (Val.isIntN(DstBitsPerElt))
34186 Val = Val.trunc(DstBitsPerElt);
34187 else if (Val.isNegative())
34188 Val = APInt::getNullValue(DstBitsPerElt);
34190 Val = APInt::getAllOnesValue(DstBitsPerElt);
34192 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
34196 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
34199 // Attempt to combine as shuffle.
34202 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34203 /*HasVarMask*/ false, DAG, Subtarget)) {
34204 DCI.CombineTo(N, Res);
34211 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
34212 TargetLowering::DAGCombinerInfo &DCI,
34213 const X86Subtarget &Subtarget) {
34214 unsigned Opcode = N->getOpcode();
34215 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
34216 X86ISD::VSRLI == Opcode) &&
34217 "Unexpected shift opcode");
34218 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
34219 EVT VT = N->getValueType(0);
34220 SDValue N0 = N->getOperand(0);
34221 SDValue N1 = N->getOperand(1);
34222 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
34223 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
34224 "Unexpected value type");
34226 // Out of range logical bit shifts are guaranteed to be zero.
34227 // Out of range arithmetic bit shifts splat the sign bit.
34228 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
34229 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
34231 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34233 ShiftVal = NumBitsPerElt - 1;
34236 // Shift N0 by zero -> N0.
34240 // Shift zero -> zero.
34241 if (ISD::isBuildVectorAllZeros(N0.getNode()))
34242 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
34244 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
34245 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
34246 // TODO - support other sra opcodes as needed.
34247 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
34248 N0.getOpcode() == X86ISD::VSRAI)
34249 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
34251 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
34252 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
34253 N1 == N0.getOperand(1)) {
34254 SDValue N00 = N0.getOperand(0);
34255 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
34256 if (ShiftVal.ult(NumSignBits))
34260 // We can decode 'whole byte' logical bit shifts as shuffles.
34261 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
34263 if (SDValue Res = combineX86ShufflesRecursively(
34264 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34265 /*HasVarMask*/ false, DAG, Subtarget)) {
34266 DCI.CombineTo(N, Res);
34271 // Constant Folding.
34273 SmallVector<APInt, 32> EltBits;
34274 if (N->isOnlyUserOf(N0.getNode()) &&
34275 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
34276 assert(EltBits.size() == VT.getVectorNumElements() &&
34277 "Unexpected shift value type");
34278 unsigned ShiftImm = ShiftVal.getZExtValue();
34279 for (APInt &Elt : EltBits) {
34280 if (X86ISD::VSHLI == Opcode)
34282 else if (X86ISD::VSRAI == Opcode)
34283 Elt.ashrInPlace(ShiftImm);
34285 Elt.lshrInPlace(ShiftImm);
34287 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
34293 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
34294 TargetLowering::DAGCombinerInfo &DCI,
34295 const X86Subtarget &Subtarget) {
34297 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
34298 (N->getOpcode() == X86ISD::PINSRW &&
34299 N->getValueType(0) == MVT::v8i16)) &&
34300 "Unexpected vector insertion");
34302 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
34305 combineX86ShufflesRecursively({Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34306 /*HasVarMask*/ false, DAG, Subtarget)) {
34307 DCI.CombineTo(N, Res);
34314 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
34315 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
34316 /// OR -> CMPNEQSS.
34317 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
34318 TargetLowering::DAGCombinerInfo &DCI,
34319 const X86Subtarget &Subtarget) {
34322 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
34323 // we're requiring SSE2 for both.
34324 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
34325 SDValue N0 = N->getOperand(0);
34326 SDValue N1 = N->getOperand(1);
34327 SDValue CMP0 = N0->getOperand(1);
34328 SDValue CMP1 = N1->getOperand(1);
34331 // The SETCCs should both refer to the same CMP.
34332 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
34335 SDValue CMP00 = CMP0->getOperand(0);
34336 SDValue CMP01 = CMP0->getOperand(1);
34337 EVT VT = CMP00.getValueType();
34339 if (VT == MVT::f32 || VT == MVT::f64) {
34340 bool ExpectingFlags = false;
34341 // Check for any users that want flags:
34342 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
34343 !ExpectingFlags && UI != UE; ++UI)
34344 switch (UI->getOpcode()) {
34349 ExpectingFlags = true;
34351 case ISD::CopyToReg:
34352 case ISD::SIGN_EXTEND:
34353 case ISD::ZERO_EXTEND:
34354 case ISD::ANY_EXTEND:
34358 if (!ExpectingFlags) {
34359 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
34360 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
34362 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
34363 X86::CondCode tmp = cc0;
34368 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
34369 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
34370 // FIXME: need symbolic constants for these magic numbers.
34371 // See X86ATTInstPrinter.cpp:printSSECC().
34372 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
34373 if (Subtarget.hasAVX512()) {
34375 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
34376 DAG.getConstant(x86cc, DL, MVT::i8));
34377 // Need to fill with zeros to ensure the bitcast will produce zeroes
34378 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
34379 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
34380 DAG.getConstant(0, DL, MVT::v16i1),
34381 FSetCC, DAG.getIntPtrConstant(0, DL));
34382 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
34383 N->getSimpleValueType(0));
34385 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
34386 CMP00.getValueType(), CMP00, CMP01,
34387 DAG.getConstant(x86cc, DL,
34390 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
34391 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
34393 if (is64BitFP && !Subtarget.is64Bit()) {
34394 // On a 32-bit target, we cannot bitcast the 64-bit float to a
34395 // 64-bit integer, since that's not a legal type. Since
34396 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
34397 // bits, but can do this little dance to extract the lowest 32 bits
34398 // and work with those going forward.
34399 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
34401 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
34402 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
34403 Vector32, DAG.getIntPtrConstant(0, DL));
34407 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
34408 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
34409 DAG.getConstant(1, DL, IntVT));
34410 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
34412 return OneBitOfTruth;
34420 // Try to match (and (xor X, -1), Y) logic pattern for (andnp X, Y) combines.
34421 static bool matchANDXORWithAllOnesAsANDNP(SDNode *N, SDValue &X, SDValue &Y) {
34422 if (N->getOpcode() != ISD::AND)
34425 SDValue N0 = N->getOperand(0);
34426 SDValue N1 = N->getOperand(1);
34427 if (N0.getOpcode() == ISD::XOR &&
34428 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode())) {
34429 X = N0.getOperand(0);
34433 if (N1.getOpcode() == ISD::XOR &&
34434 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode())) {
34435 X = N1.getOperand(0);
34443 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
34444 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
34445 assert(N->getOpcode() == ISD::AND);
34447 EVT VT = N->getValueType(0);
34448 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
34452 if (matchANDXORWithAllOnesAsANDNP(N, X, Y))
34453 return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
34458 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
34459 // register. In most cases we actually compare or select YMM-sized registers
34460 // and mixing the two types creates horrible code. This method optimizes
34461 // some of the transition sequences.
34462 // Even with AVX-512 this is still useful for removing casts around logical
34463 // operations on vXi1 mask types.
34464 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
34465 const X86Subtarget &Subtarget) {
34466 EVT VT = N->getValueType(0);
34467 assert(VT.isVector() && "Expected vector type");
34469 assert((N->getOpcode() == ISD::ANY_EXTEND ||
34470 N->getOpcode() == ISD::ZERO_EXTEND ||
34471 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
34473 SDValue Narrow = N->getOperand(0);
34474 EVT NarrowVT = Narrow.getValueType();
34476 if (Narrow->getOpcode() != ISD::XOR &&
34477 Narrow->getOpcode() != ISD::AND &&
34478 Narrow->getOpcode() != ISD::OR)
34481 SDValue N0 = Narrow->getOperand(0);
34482 SDValue N1 = Narrow->getOperand(1);
34485 // The Left side has to be a trunc.
34486 if (N0.getOpcode() != ISD::TRUNCATE)
34489 // The type of the truncated inputs.
34490 if (N0->getOperand(0).getValueType() != VT)
34493 // The right side has to be a 'trunc' or a constant vector.
34494 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
34495 N1.getOperand(0).getValueType() == VT;
34497 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
34500 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34502 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
34505 // Set N0 and N1 to hold the inputs to the new wide operation.
34506 N0 = N0->getOperand(0);
34508 N1 = N1->getOperand(0);
34510 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
34512 // Generate the wide operation.
34513 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
34514 unsigned Opcode = N->getOpcode();
34516 default: llvm_unreachable("Unexpected opcode");
34517 case ISD::ANY_EXTEND:
34519 case ISD::ZERO_EXTEND:
34520 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
34521 case ISD::SIGN_EXTEND:
34522 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
34523 Op, DAG.getValueType(NarrowVT));
34527 /// If both input operands of a logic op are being cast from floating point
34528 /// types, try to convert this into a floating point logic node to avoid
34529 /// unnecessary moves from SSE to integer registers.
34530 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
34531 const X86Subtarget &Subtarget) {
34532 unsigned FPOpcode = ISD::DELETED_NODE;
34533 if (N->getOpcode() == ISD::AND)
34534 FPOpcode = X86ISD::FAND;
34535 else if (N->getOpcode() == ISD::OR)
34536 FPOpcode = X86ISD::FOR;
34537 else if (N->getOpcode() == ISD::XOR)
34538 FPOpcode = X86ISD::FXOR;
34540 assert(FPOpcode != ISD::DELETED_NODE &&
34541 "Unexpected input node for FP logic conversion");
34543 EVT VT = N->getValueType(0);
34544 SDValue N0 = N->getOperand(0);
34545 SDValue N1 = N->getOperand(1);
34547 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
34548 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
34549 (Subtarget.hasSSE2() && VT == MVT::i64))) {
34550 SDValue N00 = N0.getOperand(0);
34551 SDValue N10 = N1.getOperand(0);
34552 EVT N00Type = N00.getValueType();
34553 EVT N10Type = N10.getValueType();
34554 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
34555 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
34556 return DAG.getBitcast(VT, FPLogic);
34562 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
34563 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
34564 /// with a shift-right to eliminate loading the vector constant mask value.
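/// For example, if every element of X is known to be all-ones or zero (such as
/// a PCMPGT result), then (and X, 1) can be rewritten as (vsrli X, EltBits-1),
/// avoiding a constant-pool load for the mask.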
34565 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
34566 const X86Subtarget &Subtarget) {
34567 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
34568 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
34569 EVT VT0 = Op0.getValueType();
34570 EVT VT1 = Op1.getValueType();
34572 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
34576 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
34577 !SplatVal.isMask())
34580 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
34583 unsigned EltBitWidth = VT0.getScalarSizeInBits();
34584 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
34588 unsigned ShiftVal = SplatVal.countTrailingOnes();
34589 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
34590 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
34591 return DAG.getBitcast(N->getValueType(0), Shift);
34594 // Get the index node from the lowered DAG of a GEP IR instruction with one
34595 // indexing dimension.
34596 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
34597 if (Ld->isIndexed())
34600 SDValue Base = Ld->getBasePtr();
34602 if (Base.getOpcode() != ISD::ADD)
34605 SDValue ShiftedIndex = Base.getOperand(0);
34607 if (ShiftedIndex.getOpcode() != ISD::SHL)
34610 return ShiftedIndex.getOperand(0);
34614 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
34615 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
34616 switch (VT.getSizeInBits()) {
34617 default: return false;
34618 case 64: return Subtarget.is64Bit();
34619 case 32: return true;
34625 // This function recognizes cases where the X86 bzhi instruction can replace an
34626 // 'and-load' sequence.
34627 // When an integer value is loaded from an array of constants defined as
34630 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
34632 // and a bitwise AND is then applied to the loaded value and another input,
34633 // the sequence is equivalent to performing bzhi (zero high bits) on that
34634 // input, using the same index as the load.
34635 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
34636 const X86Subtarget &Subtarget) {
34637 MVT VT = Node->getSimpleValueType(0);
34640 // Check if subtarget has BZHI instruction for the node's type
34641 if (!hasBZHI(Subtarget, VT))
34644 // Try matching the pattern for both operands.
34645 for (unsigned i = 0; i < 2; i++) {
34646 SDValue N = Node->getOperand(i);
34647 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
34649 // continue if the operand is not a load instruction
34653 const Value *MemOp = Ld->getMemOperand()->getValue();
34658 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
34659 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
34660 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
34662 Constant *Init = GV->getInitializer();
34663 Type *Ty = Init->getType();
34664 if (!isa<ConstantDataArray>(Init) ||
34665 !Ty->getArrayElementType()->isIntegerTy() ||
34666 Ty->getArrayElementType()->getScalarSizeInBits() !=
34667 VT.getSizeInBits() ||
34668 Ty->getArrayNumElements() >
34669 Ty->getArrayElementType()->getScalarSizeInBits())
34672 // Check if the array's constant elements are suitable to our case.
34673 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
34674 bool ConstantsMatch = true;
34675 for (uint64_t j = 0; j < ArrayElementCount; j++) {
34676 ConstantInt *Elem =
34677 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
34678 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
34679 ConstantsMatch = false;
34683 if (!ConstantsMatch)
34686 // Do the transformation (For 32-bit type):
34687 // -> (and (load arr[idx]), inp)
34688 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
34689 // that will be replaced with one bzhi instruction.
34690 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
34691 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32);
34693 // Get the Node which indexes into the array.
34694 SDValue Index = getIndexFromUnindexedLoad(Ld);
34697 Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32);
34699 SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index);
34700 Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub);
34702 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
34703 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
34705 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
34713 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
34714 TargetLowering::DAGCombinerInfo &DCI,
34715 const X86Subtarget &Subtarget) {
34716 EVT VT = N->getValueType(0);
34718 // If this is SSE1 only convert to FAND to avoid scalarization.
34719 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34720 return DAG.getBitcast(
34721 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
34722 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34723 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34726 // Use a 32-bit and+zext if upper bits known zero.
34727 if (VT == MVT::i64 && Subtarget.is64Bit() &&
34728 !isa<ConstantSDNode>(N->getOperand(1))) {
34729 APInt HiMask = APInt::getHighBitsSet(64, 32);
34730 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
34731 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
34733 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
34734 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
34735 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
34736 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
34740 if (DCI.isBeforeLegalizeOps())
34743 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34746 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34749 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
34752 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
34755 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
34758 // Attempt to recursively combine a bitmask AND with shuffles.
34759 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34761 if (SDValue Res = combineX86ShufflesRecursively(
34762 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34763 /*HasVarMask*/ false, DAG, Subtarget)) {
34764 DCI.CombineTo(N, Res);
34769 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
34770 if ((VT.getScalarSizeInBits() % 8) == 0 &&
34771 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34772 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
34773 SDValue BitMask = N->getOperand(1);
34774 SDValue SrcVec = N->getOperand(0).getOperand(0);
34775 EVT SrcVecVT = SrcVec.getValueType();
34777 // Check that the constant bitmask masks whole bytes.
34779 SmallVector<APInt, 64> EltBits;
34780 if (VT == SrcVecVT.getScalarType() &&
34781 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
34782 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
34783 llvm::all_of(EltBits, [](APInt M) {
34784 return M.isNullValue() || M.isAllOnesValue();
34786 unsigned NumElts = SrcVecVT.getVectorNumElements();
34787 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
34788 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
34790 // Create a root shuffle mask from the byte mask and the extracted index.
34791 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
34792 for (unsigned i = 0; i != Scale; ++i) {
34795 int VecIdx = Scale * Idx + i;
34796 ShuffleMask[VecIdx] =
34797 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
34800 if (SDValue Shuffle = combineX86ShufflesRecursively(
34801 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
34802 /*HasVarMask*/ false, DAG, Subtarget))
34803 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
34804 N->getOperand(0).getOperand(1));
34811 // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern.
34812 static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
34813 if (N->getOpcode() != ISD::OR)
34816 SDValue N0 = N->getOperand(0);
34817 SDValue N1 = N->getOperand(1);
34819 // Canonicalize AND to LHS.
34820 if (N1.getOpcode() == ISD::AND)
34823 // Attempt to match OR(AND(M,Y),ANDNP(M,X)).
34824 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
34827 Mask = N1.getOperand(0);
34828 X = N1.getOperand(1);
34830 // Check to see if the mask appeared in both the AND and ANDNP.
34831 if (N0.getOperand(0) == Mask)
34832 Y = N0.getOperand(1);
34833 else if (N0.getOperand(1) == Mask)
34834 Y = N0.getOperand(0);
34838 // TODO: Attempt to match against AND(XOR(-1,M),Y) as well, waiting for
34839 // ANDNP combine allows other combines to happen that prevent matching.
34844 // (or (and (m, y), (pandn m, x)))
34846 // (vselect m, x, y)
34847 // As a special case, try to fold:
34848 // (or (and (m, (sub 0, x)), (pandn m, x)))
34850 // (sub (xor X, M), M)
34851 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
34852 const X86Subtarget &Subtarget) {
34853 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
34855 EVT VT = N->getValueType(0);
34856 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
34857 (VT.is256BitVector() && Subtarget.hasInt256())))
34860 SDValue X, Y, Mask;
34861 if (!matchLogicBlend(N, X, Y, Mask))
34864 // Validate that X, Y, and Mask are bitcasts, and see through them.
34865 Mask = peekThroughBitcasts(Mask);
34866 X = peekThroughBitcasts(X);
34867 Y = peekThroughBitcasts(Y);
34869 EVT MaskVT = Mask.getValueType();
34870 unsigned EltBits = MaskVT.getScalarSizeInBits();
34872 // TODO: Attempt to handle floating point cases as well?
34873 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
34879 // (or (and (M, (sub 0, X)), (pandn M, X)))
34880 // which is a special case of vselect:
34881 // (vselect M, (sub 0, X), X)
34883 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
34884 // We know that, if fNegate is 0 or 1:
34885 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
34887 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
34888 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
34889 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
34890 // This lets us transform our vselect to:
34891 // (add (xor X, M), (and M, 1))
34893 // (sub (xor X, M), M)
34894 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
34895 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
34896 auto IsNegV = [](SDNode *N, SDValue V) {
34897 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
34898 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
34901 if (IsNegV(Y.getNode(), X))
34903 else if (IsNegV(X.getNode(), Y))
34907 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
34908 SDValue SubOp2 = Mask;
34910 // If the negate was on the false side of the select, then
34911 // the operands of the SUB need to be swapped. PR 27251.
34912 // This is because the pattern being matched above is
34913 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
34914 // but if the pattern matched was
34915 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
34916 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
34917 // pattern also needs to be a negation of the replacement pattern above.
34918 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
34919 // sub accomplishes the negation of the replacement pattern.
34921 std::swap(SubOp1, SubOp2);
34923 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
34924 return DAG.getBitcast(VT, Res);
34928 // PBLENDVB is only available on SSE 4.1.
34929 if (!Subtarget.hasSSE41())
34932 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
34934 X = DAG.getBitcast(BlendVT, X);
34935 Y = DAG.getBitcast(BlendVT, Y);
34936 Mask = DAG.getBitcast(BlendVT, Mask);
34937 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
34938 return DAG.getBitcast(VT, Mask);
34941 // Helper function for combineOrCmpEqZeroToCtlzSrl
34945 // Transforms a single setcc(eq, (cmp x, 0)) operand into srl(ctlz x), log2(bitsize(x)).
34946 // Input pattern is checked by caller.
34947 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
34948 SelectionDAG &DAG) {
34949 SDValue Cmp = Op.getOperand(1);
34950 EVT VT = Cmp.getOperand(0).getValueType();
34951 unsigned Log2b = Log2_32(VT.getSizeInBits());
34953 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
34954 // The result of the shift is true or false, and on X86, the 32-bit
34955 // encoding of shr and lzcnt is more desirable.
34956 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
34957 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
34958 DAG.getConstant(Log2b, dl, MVT::i8));
34959 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
34962 // Try to transform:
34963 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
34965 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
34966 // Will also attempt to match more generic cases, eg:
34967 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
34968 // Only applies if the target supports the FastLZCNT feature.
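// For i32 inputs this becomes srl(or(ctlz(x), ctlz(y)), 5): ctlz(v) == 32 only
// when v == 0, and 32 is the only result with bit 5 set, so the shifted value
// is 1 exactly when either input is zero.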
34969 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
34970 TargetLowering::DAGCombinerInfo &DCI,
34971 const X86Subtarget &Subtarget) {
34972 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
34975 auto isORCandidate = [](SDValue N) {
34976 return (N->getOpcode() == ISD::OR && N->hasOneUse());
34979 // Check that the zero extend is extending to 32 bits or more. The code generated by
34980 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
34981 // instructions to clear the upper bits.
34982 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
34983 !isORCandidate(N->getOperand(0)))
34986 // Check the node matches: setcc(eq, cmp 0)
34987 auto isSetCCCandidate = [](SDValue N) {
34988 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
34989 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
34990 N->getOperand(1).getOpcode() == X86ISD::CMP &&
34991 isNullConstant(N->getOperand(1).getOperand(1)) &&
34992 N->getOperand(1).getValueType().bitsGE(MVT::i32);
34995 SDNode *OR = N->getOperand(0).getNode();
34996 SDValue LHS = OR->getOperand(0);
34997 SDValue RHS = OR->getOperand(1);
34999 // Save nodes matching or(or, setcc(eq, cmp 0)).
35000 SmallVector<SDNode *, 2> ORNodes;
35001 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
35002 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
35003 ORNodes.push_back(OR);
35004 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
35005 LHS = OR->getOperand(0);
35006 RHS = OR->getOperand(1);
35009 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
35010 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
35011 !isORCandidate(SDValue(OR, 0)))
35014 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
35016 // or(srl(ctlz),srl(ctlz)).
35017 // The dag combiner can then fold it into:
35018 // srl(or(ctlz, ctlz)).
35019 EVT VT = OR->getValueType(0);
35020 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
35021 SDValue Ret, NewRHS;
35022 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
35023 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
35028 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
35029 while (ORNodes.size() > 0) {
35030 OR = ORNodes.pop_back_val();
35031 LHS = OR->getOperand(0);
35032 RHS = OR->getOperand(1);
35033 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
35034 if (RHS->getOpcode() == ISD::OR)
35035 std::swap(LHS, RHS);
35036 EVT VT = OR->getValueType(0);
35037 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
35040 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
35044 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
35049 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
35050 TargetLowering::DAGCombinerInfo &DCI,
35051 const X86Subtarget &Subtarget) {
35052 SDValue N0 = N->getOperand(0);
35053 SDValue N1 = N->getOperand(1);
35054 EVT VT = N->getValueType(0);
35056 // If this is SSE1 only convert to FOR to avoid scalarization.
35057 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
35058 return DAG.getBitcast(MVT::v4i32,
35059 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
35060 DAG.getBitcast(MVT::v4f32, N0),
35061 DAG.getBitcast(MVT::v4f32, N1)));
35064 if (DCI.isBeforeLegalizeOps())
35067 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
35070 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35073 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
35076 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
35079 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
35080 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
35082 // SHLD/SHRD instructions have lower register pressure, but on some
35083 // platforms they have higher latency than the equivalent
35084 // series of shifts/or that would otherwise be generated.
35085 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
35086 // have higher latencies and we are not optimizing for size.
35087 if (!OptForSize && Subtarget.isSHLDSlow())
35090 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
35092 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
35094 if (!N0.hasOneUse() || !N1.hasOneUse())
35097 SDValue ShAmt0 = N0.getOperand(1);
35098 if (ShAmt0.getValueType() != MVT::i8)
35100 SDValue ShAmt1 = N1.getOperand(1);
35101 if (ShAmt1.getValueType() != MVT::i8)
35103 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
35104 ShAmt0 = ShAmt0.getOperand(0);
35105 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
35106 ShAmt1 = ShAmt1.getOperand(0);
35109 unsigned Opc = X86ISD::SHLD;
35110 SDValue Op0 = N0.getOperand(0);
35111 SDValue Op1 = N1.getOperand(0);
35112 if (ShAmt0.getOpcode() == ISD::SUB ||
35113 ShAmt0.getOpcode() == ISD::XOR) {
35114 Opc = X86ISD::SHRD;
35115 std::swap(Op0, Op1);
35116 std::swap(ShAmt0, ShAmt1);
35119 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
35120 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
35121 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
35122 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
35123 unsigned Bits = VT.getSizeInBits();
35124 if (ShAmt1.getOpcode() == ISD::SUB) {
35125 SDValue Sum = ShAmt1.getOperand(0);
35126 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
35127 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
35128 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
35129 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
35130 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
35131 return DAG.getNode(Opc, DL, VT,
35133 DAG.getNode(ISD::TRUNCATE, DL,
35136 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
35137 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
35138 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
35139 return DAG.getNode(Opc, DL, VT,
35140 N0.getOperand(0), N1.getOperand(0),
35141 DAG.getNode(ISD::TRUNCATE, DL,
35143 } else if (ShAmt1.getOpcode() == ISD::XOR) {
35144 SDValue Mask = ShAmt1.getOperand(1);
35145 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
35146 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
35147 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
35148 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
35149 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
35150 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
35151 if (Op1.getOpcode() == InnerShift &&
35152 isa<ConstantSDNode>(Op1.getOperand(1)) &&
35153 Op1.getConstantOperandVal(1) == 1) {
35154 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35155 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35157 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
35158 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
35159 Op1.getOperand(0) == Op1.getOperand(1)) {
35160 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
35161 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
35170 /// Try to turn tests against the signbit in the form of:
35171 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) into a SETGT(X, -1) comparison.
35174 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
35175 // This is only worth doing if the output type is i8 or i1.
35176 EVT ResultType = N->getValueType(0);
35177 if (ResultType != MVT::i8 && ResultType != MVT::i1)
35180 SDValue N0 = N->getOperand(0);
35181 SDValue N1 = N->getOperand(1);
35183 // We should be performing an xor against a truncated shift.
35184 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
35187 // Make sure we are performing an xor against one.
35188 if (!isOneConstant(N1))
35191 // SetCC on x86 zero extends so only act on this if it's a logical shift.
35192 SDValue Shift = N0.getOperand(0);
35193 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
35196 // Make sure we are truncating from one of i16, i32 or i64.
35197 EVT ShiftTy = Shift.getValueType();
35198 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
35201 // Make sure the shift amount extracts the sign bit.
35202 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
35203 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
35206 // Create a greater-than comparison against -1.
35207 // N.B. Using SETGE against 0 works but we want a canonical looking
35208 // comparison; using SETGT matches up with what TranslateX86CC does.
35210 SDValue ShiftOp = Shift.getOperand(0);
35211 EVT ShiftOpTy = ShiftOp.getValueType();
35212 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35213 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
35214 *DAG.getContext(), ResultType);
35215 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
35216 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
35217 if (SetCCResultType != ResultType)
35218 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
35222 /// Turn vector tests of the signbit in the form of:
35223 /// xor (sra X, elt_size(X)-1), -1 into a pcmpgt X, -1 comparison.
35227 /// This should be called before type legalization because the pattern may not
35228 /// persist after that.
35229 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
35230 const X86Subtarget &Subtarget) {
35231 EVT VT = N->getValueType(0);
35232 if (!VT.isSimple())
35235 switch (VT.getSimpleVT().SimpleTy) {
35236 default: return SDValue();
35239 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
35240 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
35244 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
35247 // There must be a shift right algebraic before the xor, and the xor must be a
35248 // 'not' operation.
35249 SDValue Shift = N->getOperand(0);
35250 SDValue Ones = N->getOperand(1);
35251 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
35252 !ISD::isBuildVectorAllOnes(Ones.getNode()))
35255 // The shift should be smearing the sign bit across each vector element.
35256 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
35260 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
35261 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
35262 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
35265 // Create a greater-than comparison against -1. We don't use the more obvious
35266 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
35267 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
35270 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
35271 /// is valid for the given \p Subtarget.
35272 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
35273 const X86Subtarget &Subtarget) {
35274 if (!Subtarget.hasAVX512())
35277 // FIXME: Scalar type may be supported if we move it to vector register.
35278 if (!SrcVT.isVector())
35281 EVT SrcElVT = SrcVT.getScalarType();
35282 EVT DstElVT = DstVT.getScalarType();
35283 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
35285 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
35286 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
35290 /// Detect patterns of truncation with unsigned saturation:
35292 /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
35293 /// Return the source value x to be truncated or SDValue() if the pattern was not matched.
35296 /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type),
35297 /// where C1 >= 0 and C2 is unsigned max of destination type.
35299 /// (truncate (smax (smin (x, C2), C1)) to dest_type)
35300 /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2.
35302 /// These two patterns are equivalent to:
35303 /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type)
35304 /// So return the smax(x, C1) value to be truncated or SDValue() if the
35305 /// pattern was not matched.
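/// For example, for a vXi32 to vXi8 truncate, pattern 1 is
/// (truncate (umin X, 255)) and the value returned is X.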
static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
                                 const SDLoc &DL) {
  EVT InVT = In.getValueType();

  // Saturation with truncation. We truncate from InVT to VT.
  assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
         "Unexpected types for truncate operation");

  // Match min/max and return limit value as a parameter.
  auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue {
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit))
      return V.getOperand(0);
    return SDValue();
  };

  APInt C1, C2;
  if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2))
    // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
    // to the element size of the destination type.
    if (C2.isMask(VT.getScalarSizeInBits()))
      return UMin;

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2))
    if (MatchMinMax(SMin, ISD::SMAX, C1))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()))
        return SMin;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2))
      if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) &&
          C2.uge(C1))
        return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1));

  return SDValue();
}
35344 /// Detect patterns of truncation with signed saturation:
35345 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
///                  signed_max_of_dest_type)) to dest_type)
/// or:
/// (truncate (smax ((smin (x, signed_max_of_dest_type)),
///                  signed_min_of_dest_type)) to dest_type).
/// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
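///
/// Illustrative example (values for exposition only): when truncating
/// i16 -> i8, the signed form matches (trunc (smin (smax x, -128), 127));
/// with MatchPackUS the clamp range becomes [0, 255] instead.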
35353 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
35354 unsigned NumDstBits = VT.getScalarSizeInBits();
  unsigned NumSrcBits = In.getScalarValueSizeInBits();
  assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");

  auto MatchMinMax = [](SDValue V, unsigned Opcode,
                        const APInt &Limit) -> SDValue {
    APInt C;
    if (V.getOpcode() == Opcode &&
        ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
      return V.getOperand(0);
    return SDValue();
  };

  APInt SignedMax, SignedMin;
  if (MatchPackUS) {
    SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
    SignedMin = APInt(NumSrcBits, 0);
  } else {
    SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
    SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
  }

  if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
    if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
      return SMax;

  if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
    if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
      return SMin;

  return SDValue();
}
35387 /// Detect a pattern of truncation with signed saturation.
/// The types should allow the use of a VPMOVSS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
35391 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
35392 const X86Subtarget &Subtarget,
35393 const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectSSatPattern(In, VT);
}
35401 /// Detect a pattern of truncation with saturation:
35402 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
/// The types should allow the use of a VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
35406 static SDValue detectAVX512USatPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35408 const X86Subtarget &Subtarget,
35409 const TargetLowering &TLI) {
  if (!TLI.isTypeLegal(In.getValueType()))
    return SDValue();
  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
    return SDValue();
  return detectUSatPattern(In, VT, DAG, DL);
}
35417 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
35419 const X86Subtarget &Subtarget) {
35420 EVT SVT = VT.getScalarType();
35421 EVT InVT = In.getValueType();
35422 EVT InSVT = InVT.getScalarType();
35423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35424 if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
35425 isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
35426 if (auto SSatVal = detectSSatPattern(In, VT))
35427 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
35428 if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
35429 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
35431 if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
35432 (SVT == MVT::i8 || SVT == MVT::i16) &&
35433 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
35434 if (auto USatVal = detectSSatPattern(In, VT, true)) {
35435 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
35436 if (SVT == MVT::i8 && InSVT == MVT::i32) {
35437 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35438 VT.getVectorNumElements());
35439 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
35442 return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
35444 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
35445 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
35448 if (auto SSatVal = detectSSatPattern(In, VT))
35449 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
/// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the
/// efficient X86ISD::AVG instruction.
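///
/// Illustrative example (for exposition only), with unsigned i8 elements:
///   c = trunc ((zext(a) + zext(b) + 1) >> 1)
/// averages a and b with rounding up, which maps directly to PAVGB/PAVGW.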
35458 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
35459 const X86Subtarget &Subtarget,
35461 if (!VT.isVector())
35463 EVT InVT = In.getValueType();
35464 unsigned NumElems = VT.getVectorNumElements();
35466 EVT ScalarVT = VT.getVectorElementType();
35467 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
35468 isPowerOf2_32(NumElems)))
35471 // InScalarVT is the intermediate type in AVG pattern and it should be greater
35472 // than the original input type (i8/i16).
35473 EVT InScalarVT = InVT.getVectorElementType();
35474 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
35477 if (!Subtarget.hasSSE2())
35480 // Detect the following pattern:
35482 // %1 = zext <N x i8> %a to <N x i32>
35483 // %2 = zext <N x i8> %b to <N x i32>
35484 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
35485 // %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
35487 // %6 = trunc <N x i32> %5 to <N x i8>
35489 // In AVX512, the last instruction can also be a trunc store.
35490 if (In.getOpcode() != ISD::SRL)
35493 // A lambda checking the given SDValue is a constant vector and each element
35494 // is in the range [Min, Max].
35495 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
35496 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
35497 if (!BV || !BV->isConstant())
35499 for (SDValue Op : V->ops()) {
35500 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
35503 const APInt &Val = C->getAPIntValue();
35504 if (Val.ult(Min) || Val.ugt(Max))
35510 // Check if each element of the vector is left-shifted by one.
35511 auto LHS = In.getOperand(0);
35512 auto RHS = In.getOperand(1);
35513 if (!IsConstVectorInRange(RHS, 1, 1))
35515 if (LHS.getOpcode() != ISD::ADD)
35518 // Detect a pattern of a + b + 1 where the order doesn't matter.
35519 SDValue Operands[3];
35520 Operands[0] = LHS.getOperand(0);
35521 Operands[1] = LHS.getOperand(1);
35523 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
35524 ArrayRef<SDValue> Ops) {
35525 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
35528 // Take care of the case when one of the operands is a constant vector whose
35529 // element is in the range [1, 256].
35530 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
35531 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
35532 Operands[0].getOperand(0).getValueType() == VT) {
35533 // The pattern is detected. Subtract one from the constant vector, then
35534 // demote it and emit X86ISD::AVG instruction.
35535 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
35536 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
35537 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
35538 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35539 { Operands[0].getOperand(0), Operands[1] },
35543 if (Operands[0].getOpcode() == ISD::ADD)
35544 std::swap(Operands[0], Operands[1]);
35545 else if (Operands[1].getOpcode() != ISD::ADD)
35547 Operands[2] = Operands[1].getOperand(0);
35548 Operands[1] = Operands[1].getOperand(1);
35550 // Now we have three operands of two additions. Check that one of them is a
35551 // constant vector with ones, and the other two are promoted from i8/i16.
35552 for (int i = 0; i < 3; ++i) {
35553 if (!IsConstVectorInRange(Operands[i], 1, 1))
35555 std::swap(Operands[i], Operands[2]);
35557 // Check if Operands[0] and Operands[1] are results of type promotion.
35558 for (int j = 0; j < 2; ++j)
35559 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
35560 Operands[j].getOperand(0).getValueType() != VT)
35563 // The pattern is detected, emit X86ISD::AVG instruction(s).
35564 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
35565 { Operands[0].getOperand(0),
35566 Operands[1].getOperand(0) }, AVGBuilder);
35572 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
35573 TargetLowering::DAGCombinerInfo &DCI,
35574 const X86Subtarget &Subtarget) {
35575 LoadSDNode *Ld = cast<LoadSDNode>(N);
35576 EVT RegVT = Ld->getValueType(0);
35577 EVT MemVT = Ld->getMemoryVT();
35579 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35581 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
35582 // into two 16-byte operations. Also split non-temporal aligned loads on
35583 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
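  // For illustration (values for exposition only): a 32-byte v8f32 load from
  // Ptr becomes a v4f32 load from Ptr, a v4f32 load from Ptr+16, and a
  // CONCAT_VECTORS of the two halves, with the load chains joined by a
  // TokenFactor.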
35584 ISD::LoadExtType Ext = Ld->getExtensionType();
35586 unsigned AddressSpace = Ld->getAddressSpace();
35587 unsigned Alignment = Ld->getAlignment();
35588 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
35589 Ext == ISD::NON_EXTLOAD &&
35590 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
35591 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
35592 AddressSpace, Alignment, &Fast) && !Fast))) {
35593 unsigned NumElems = RegVT.getVectorNumElements();
35597 SDValue Ptr = Ld->getBasePtr();
35599 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
35602 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
35603 Alignment, Ld->getMemOperand()->getFlags());
35605 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
35607 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
35608 Ld->getPointerInfo().getWithOffset(16),
35609 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
35610 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
35612 Load2.getValue(1));
35614 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
35615 return DCI.CombineTo(N, NewVec, TF, true);
35621 /// If V is a build vector of boolean constants and exactly one of those
35622 /// constants is true, return the operand index of that true element.
35623 /// Otherwise, return -1.
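/// For example (illustrative only), <i1 0, i1 0, i1 1, i1 0> returns 2, while
/// <i1 1, i1 1, i1 0, i1 0> and <i1 0, i1 0, i1 0, i1 0> both return -1.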
35624 static int getOneTrueElt(SDValue V) {
35625 // This needs to be a build vector of booleans.
35626 // TODO: Checking for the i1 type matches the IR definition for the mask,
35627 // but the mask check could be loosened to i8 or other types. That might
35628 // also require checking more than 'allOnesValue'; eg, the x86 HW
35629 // instructions only require that the MSB is set for each mask element.
35630 // The ISD::MSTORE comments/definition do not specify how the mask operand
35632 auto *BV = dyn_cast<BuildVectorSDNode>(V);
35633 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
35636 int TrueIndex = -1;
35637 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
35638 for (unsigned i = 0; i < NumElts; ++i) {
35639 const SDValue &Op = BV->getOperand(i);
35642 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
35645 if (ConstNode->getAPIntValue().isAllOnesValue()) {
35646 // If we already found a one, this is too many.
35647 if (TrueIndex >= 0)
35655 /// Given a masked memory load/store operation, return true if it has one mask
35656 /// bit set. If it has one mask bit set, then also return the memory address of
35657 /// the scalar element to load/store, the vector index to insert/extract that
35658 /// scalar element, and the alignment for the scalar memory access.
35659 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
35660 SelectionDAG &DAG, SDValue &Addr,
35661 SDValue &Index, unsigned &Alignment) {
35662 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
35663 if (TrueMaskElt < 0)
35666 // Get the address of the one scalar element that is specified by the mask
35667 // using the appropriate offset from the base pointer.
35668 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
35669 Addr = MaskedOp->getBasePtr();
35670 if (TrueMaskElt != 0) {
35671 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
35672 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
35675 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
35676 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
35680 /// If exactly one element of the mask is set for a non-extending masked load,
35681 /// it is a scalar load and vector insert.
35682 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35683 /// mask have already been optimized in IR, so we don't bother with those here.
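///
/// For illustration (values for exposition only): a masked v4f32 load with
/// mask <0,0,1,0> becomes a scalar f32 load from BasePtr+8 followed by an
/// INSERT_VECTOR_ELT of that value into the pass-through vector at index 2.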
35685 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35686 TargetLowering::DAGCombinerInfo &DCI) {
35687 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35688 // However, some target hooks may need to be added to know when the transform
35689 // is profitable. Endianness would also have to be considered.
35691 SDValue Addr, VecIndex;
35692 unsigned Alignment;
35693 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
35696 // Load the one scalar element that is specified by the mask using the
35697 // appropriate offset from the base pointer.
35699 EVT VT = ML->getValueType(0);
35700 EVT EltVT = VT.getVectorElementType();
35702 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
35703 Alignment, ML->getMemOperand()->getFlags());
35705 // Insert the loaded element into the appropriate place in the vector.
35706 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
35708 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
35712 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
35713 TargetLowering::DAGCombinerInfo &DCI) {
35714 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
35718 EVT VT = ML->getValueType(0);
35720 // If we are loading the first and last elements of a vector, it is safe and
35721 // always faster to load the whole vector. Replace the masked load with a
35722 // vector load and select.
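  // For illustration (values for exposition only): a masked v4i32 load whose
  // constant mask is <1,0,0,1> becomes a plain v4i32 load plus a vselect
  // against the pass-through operand.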
35723 unsigned NumElts = VT.getVectorNumElements();
35724 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
35725 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
35726 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
35727 if (LoadFirstElt && LoadLastElt) {
35728 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35729 ML->getMemOperand());
35730 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
35731 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
35734 // Convert a masked load with a constant mask into a masked load and a select.
35735 // This allows the select operation to use a faster kind of select instruction
35736 // (for example, vblendvps -> vblendps).
35738 // Don't try this if the pass-through operand is already undefined. That would
35739 // cause an infinite loop because that's what we're about to create.
35740 if (ML->getSrc0().isUndef())
35743 // The new masked load has an undef pass-through operand. The select uses the
35744 // original pass-through operand.
35745 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35746 ML->getMask(), DAG.getUNDEF(VT),
35747 ML->getMemoryVT(), ML->getMemOperand(),
35748 ML->getExtensionType());
35749 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
35751 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
35754 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
35755 TargetLowering::DAGCombinerInfo &DCI,
35756 const X86Subtarget &Subtarget) {
35757 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
35759 // TODO: Expanding load with constant mask may be optimized as well.
35760 if (Mld->isExpandingLoad())
35763 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
35764 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
35766 // TODO: Do some AVX512 subsets benefit from this transform?
35767 if (!Subtarget.hasAVX512())
35768 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
35772 if (Mld->getExtensionType() != ISD::SEXTLOAD)
35775 // Resolve extending loads.
35776 EVT VT = Mld->getValueType(0);
35777 unsigned NumElems = VT.getVectorNumElements();
35778 EVT LdVT = Mld->getMemoryVT();
35781 assert(LdVT != VT && "Cannot extend to the same type");
35782 unsigned ToSz = VT.getScalarSizeInBits();
35783 unsigned FromSz = LdVT.getScalarSizeInBits();
35784 // From/To sizes and ElemCount must be pow of two.
35785 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35786 "Unexpected size for extending masked load");
35788 unsigned SizeRatio = ToSz / FromSz;
35789 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
35791 // Create a type on which we perform the shuffle.
35792 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35793 LdVT.getScalarType(), NumElems*SizeRatio);
35794 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35796 // Convert Src0 value.
35797 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
35798 if (!Mld->getSrc0().isUndef()) {
35799 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35800 for (unsigned i = 0; i != NumElems; ++i)
35801 ShuffleVec[i] = i * SizeRatio;
35803 // Can't shuffle using an illegal type.
35804 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35805 "WideVecVT should be legal");
35806 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
35807 DAG.getUNDEF(WideVecVT), ShuffleVec);
35810 // Prepare the new mask.
35812 SDValue Mask = Mld->getMask();
35813 if (Mask.getValueType() == VT) {
35814 // Mask and original value have the same type.
35815 NewMask = DAG.getBitcast(WideVecVT, Mask);
35816 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35817 for (unsigned i = 0; i != NumElems; ++i)
35818 ShuffleVec[i] = i * SizeRatio;
35819 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
35820 ShuffleVec[i] = NumElems * SizeRatio;
35821 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35822 DAG.getConstant(0, dl, WideVecVT),
35825 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35826 unsigned WidenNumElts = NumElems*SizeRatio;
35827 unsigned MaskNumElts = VT.getVectorNumElements();
35828 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35831 unsigned NumConcat = WidenNumElts / MaskNumElts;
35832 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35833 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35835 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35838 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
35839 Mld->getBasePtr(), NewMask, WideSrc0,
35840 Mld->getMemoryVT(), Mld->getMemOperand(),
35842 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
35843 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
35846 /// If exactly one element of the mask is set for a non-truncating masked store,
35847 /// it is a vector extract and scalar store.
35848 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35849 /// mask have already been optimized in IR, so we don't bother with those here.
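///
/// For illustration (values for exposition only): a masked v4f32 store with
/// mask <0,1,0,0> becomes an EXTRACT_VECTOR_ELT of element 1 followed by a
/// scalar f32 store to BasePtr+4.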
35850 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
35851 SelectionDAG &DAG) {
35852 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35853 // However, some target hooks may need to be added to know when the transform
35854 // is profitable. Endianness would also have to be considered.
35856 SDValue Addr, VecIndex;
35857 unsigned Alignment;
35858 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
35861 // Extract the one scalar element that is actually being stored.
35863 EVT VT = MS->getValue().getValueType();
35864 EVT EltVT = VT.getVectorElementType();
35865 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
35866 MS->getValue(), VecIndex);
35868 // Store that element at the appropriate offset from the base pointer.
35869 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
35870 Alignment, MS->getMemOperand()->getFlags());
35873 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
35874 const X86Subtarget &Subtarget) {
35875 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
35877 if (Mst->isCompressingStore())
35880 if (!Mst->isTruncatingStore()) {
35881 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
35882 return ScalarStore;
35884 // If the mask is checking (0 > X), we're creating a vector with all-zeros
35885 // or all-ones elements based on the sign bits of X. AVX1 masked store only
35886 // cares about the sign bit of each mask element, so eliminate the compare:
35887 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
35888 // Note that by waiting to match an x86-specific PCMPGT node, we're
35889 // eliminating potentially more complex matching of a setcc node which has
35890 // a full range of predicates.
35891 SDValue Mask = Mst->getMask();
35892 if (Mask.getOpcode() == X86ISD::PCMPGT &&
35893 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
35894 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
35895 "Unexpected type for PCMPGT");
35896 return DAG.getMaskedStore(
35897 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
35898 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
35901 // TODO: AVX512 targets should also be able to simplify something like the
35902 // pattern above, but that pattern will be different. It will either need to
35903 // match setcc more generally or match PCMPGTM later (in tablegen?).
35908 // Resolve truncating stores.
35909 EVT VT = Mst->getValue().getValueType();
35910 unsigned NumElems = VT.getVectorNumElements();
35911 EVT StVT = Mst->getMemoryVT();
35914 assert(StVT != VT && "Cannot truncate to the same type");
35915 unsigned FromSz = VT.getScalarSizeInBits();
35916 unsigned ToSz = StVT.getScalarSizeInBits();
35918 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35920 // The truncating store is legal in some cases. For example
35921 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
35922 // are designated for truncate store.
35923 // In this case we don't need any further transformations.
35924 if (TLI.isTruncStoreLegal(VT, StVT))
35927 // From/To sizes and ElemCount must be pow of two.
35928 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35929 "Unexpected size for truncating masked store");
35930 // We are going to use the original vector elt for storing.
35931 // Accumulated smaller vector elements must be a multiple of the store size.
35932 assert (((NumElems * FromSz) % ToSz) == 0 &&
35933 "Unexpected ratio for truncating masked store");
35935 unsigned SizeRatio = FromSz / ToSz;
35936 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
35938 // Create a type on which we perform the shuffle.
35939 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35940 StVT.getScalarType(), NumElems*SizeRatio);
35942 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35944 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
35945 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35946 for (unsigned i = 0; i != NumElems; ++i)
35947 ShuffleVec[i] = i * SizeRatio;
35949 // Can't shuffle using an illegal type.
35950 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35951 "WideVecVT should be legal");
35953 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
35954 DAG.getUNDEF(WideVecVT),
35958 SDValue Mask = Mst->getMask();
35959 if (Mask.getValueType() == VT) {
35960 // Mask and original value have the same type.
35961 NewMask = DAG.getBitcast(WideVecVT, Mask);
35962 for (unsigned i = 0; i != NumElems; ++i)
35963 ShuffleVec[i] = i * SizeRatio;
35964 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
35965 ShuffleVec[i] = NumElems*SizeRatio;
35966 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35967 DAG.getConstant(0, dl, WideVecVT),
35970 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35971 unsigned WidenNumElts = NumElems*SizeRatio;
35972 unsigned MaskNumElts = VT.getVectorNumElements();
35973 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35976 unsigned NumConcat = WidenNumElts / MaskNumElts;
35977 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35978 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35980 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35983 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
35984 Mst->getBasePtr(), NewMask, StVT,
35985 Mst->getMemOperand(), false);
35988 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
35989 const X86Subtarget &Subtarget) {
35990 StoreSDNode *St = cast<StoreSDNode>(N);
35991 EVT VT = St->getValue().getValueType();
35992 EVT StVT = St->getMemoryVT();
35994 SDValue StoredVal = St->getOperand(1);
35995 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35997 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
35998 // This will avoid a copy to k-register.
35999 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
36000 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
36001 StoredVal.getOperand(0).getValueType() == MVT::i8) {
36002 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
36003 St->getBasePtr(), St->getPointerInfo(),
36004 St->getAlignment(), St->getMemOperand()->getFlags());
36007 // Widen v2i1/v4i1 stores to v8i1.
36008 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
36009 Subtarget.hasAVX512()) {
36010 unsigned NumConcats = 8 / VT.getVectorNumElements();
36011 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
36012 Ops[0] = StoredVal;
36013 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
36014 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
36015 St->getPointerInfo(), St->getAlignment(),
36016 St->getMemOperand()->getFlags());
36019 // Turn vXi1 stores of constants into a scalar store.
36020 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
36021 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
36022 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
36024 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
36025 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
36026 StoredVal->ops().slice(0, 32));
36027 Lo = combinevXi1ConstantToInteger(Lo, DAG);
36028 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
36029 StoredVal->ops().slice(32, 32));
36030 Hi = combinevXi1ConstantToInteger(Hi, DAG);
36032 unsigned Alignment = St->getAlignment();
36034 SDValue Ptr0 = St->getBasePtr();
36035 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
36038 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
36039 Alignment, St->getMemOperand()->getFlags());
36041 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
36042 St->getPointerInfo().getWithOffset(4),
36043 MinAlign(Alignment, 4U),
36044 St->getMemOperand()->getFlags());
36045 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
36048 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
36049 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
36050 St->getPointerInfo(), St->getAlignment(),
36051 St->getMemOperand()->getFlags());
36054 // If we are saving a concatenation of two XMM registers and 32-byte stores
36055 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
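  // For illustration (values for exposition only): a 32-byte v8f32 store on
  // such a target becomes two v4f32 stores, to Ptr and Ptr+16, joined by a
  // TokenFactor.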
36057 unsigned AddressSpace = St->getAddressSpace();
36058 unsigned Alignment = St->getAlignment();
36059 if (VT.is256BitVector() && StVT == VT &&
36060 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
36061 AddressSpace, Alignment, &Fast) &&
36063 unsigned NumElems = VT.getVectorNumElements();
36067 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
36068 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
36070 SDValue Ptr0 = St->getBasePtr();
36071 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
36074 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
36075 Alignment, St->getMemOperand()->getFlags());
36077 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
36078 St->getPointerInfo().getWithOffset(16),
36079 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
36080 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
36083 // Optimize trunc store (of multiple scalars) to shuffle and store.
36084 // First, pack all of the elements in one place. Next, store to memory
36085 // in fewer chunks.
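  // For illustration (values for exposition only): a truncating store of
  // v8i32 as v8i16 bitcasts the value to v16i16, shuffles the eight narrowed
  // elements into the low half, and then writes that half with one or more
  // wide integer stores.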
36086 if (St->isTruncatingStore() && VT.isVector()) {
36087 // Check if we can detect an AVG pattern from the truncation. If yes,
36088 // replace the trunc store by a normal store with the result of X86ISD::AVG
36090 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
36092 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
36093 St->getPointerInfo(), St->getAlignment(),
36094 St->getMemOperand()->getFlags());
36096 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36098 detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
36100 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
36101 dl, Val, St->getBasePtr(),
36102 St->getMemoryVT(), St->getMemOperand(), DAG);
36103 if (SDValue Val = detectAVX512USatPattern(St->getValue(), St->getMemoryVT(),
36104 DAG, dl, Subtarget, TLI))
36105 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
36106 dl, Val, St->getBasePtr(),
36107 St->getMemoryVT(), St->getMemOperand(), DAG);
36109 unsigned NumElems = VT.getVectorNumElements();
36110 assert(StVT != VT && "Cannot truncate to the same type");
36111 unsigned FromSz = VT.getScalarSizeInBits();
36112 unsigned ToSz = StVT.getScalarSizeInBits();
36114 // The truncating store is legal in some cases. For example
36115 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
36116 // are designated for truncate store.
36117 // In this case we don't need any further transformations.
36118 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
36121 // From, To sizes and ElemCount must be pow of two
36122 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
36123 // We are going to use the original vector elt for storing.
36124 // Accumulated smaller vector elements must be a multiple of the store size.
36125 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
36127 unsigned SizeRatio = FromSz / ToSz;
36129 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
36131 // Create a type on which we perform the shuffle
36132 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
36133 StVT.getScalarType(), NumElems*SizeRatio);
36135 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
36137 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
36138 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
36139 for (unsigned i = 0; i != NumElems; ++i)
36140 ShuffleVec[i] = i * SizeRatio;
36142 // Can't shuffle using an illegal type.
36143 if (!TLI.isTypeLegal(WideVecVT))
36146 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
36147 DAG.getUNDEF(WideVecVT),
36149 // At this point all of the data is stored at the bottom of the
36150 // register. We now need to save it to mem.
36152 // Find the largest store unit
36153 MVT StoreType = MVT::i8;
36154 for (MVT Tp : MVT::integer_valuetypes()) {
36155 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
36159 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
36160 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
36161 (64 <= NumElems * ToSz))
36162 StoreType = MVT::f64;
36164 // Bitcast the original vector into a vector of store-size units
36165 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
36166 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
36167 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
36168 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
36169 SmallVector<SDValue, 8> Chains;
36170 SDValue Ptr = St->getBasePtr();
36172 // Perform one or more big stores into memory.
36173 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
36174 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
36175 StoreType, ShuffWide,
36176 DAG.getIntPtrConstant(i, dl));
36178 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
36179 St->getAlignment(), St->getMemOperand()->getFlags());
36180 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
36181 Chains.push_back(Ch);
36184 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
36187 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
36188 // the FP state in cases where an emms may be missing.
36189 // A preferable solution to the general problem is to figure out the right
36190 // places to insert EMMS. This qualifies as a quick hack.
36192 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
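  // For illustration (values for exposition only): on a 32-bit SSE2 target,
  // "store i64 (load i64 @p), @q" can use one f64 load/store pair instead of
  // two i32 pairs, avoiding extra GPR traffic.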
36193 if (VT.getSizeInBits() != 64)
36196 const Function &F = DAG.getMachineFunction().getFunction();
36197 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
36199 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
36200 if ((VT.isVector() ||
36201 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
36202 isa<LoadSDNode>(St->getValue()) &&
36203 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
36204 St->getChain().hasOneUse() && !St->isVolatile()) {
36205 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
36206 SmallVector<SDValue, 8> Ops;
36208 if (!ISD::isNormalLoad(Ld))
36211 // If this is not the MMX case, i.e. we are just turning i64 load/store
36212 // into f64 load/store, avoid the transformation if there are multiple
36213 // uses of the loaded value.
36214 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
36219 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
36220 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
36222 if (Subtarget.is64Bit() || F64IsLegal) {
36223 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
36224 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
36225 Ld->getMemOperand());
36227 // Make sure new load is placed in same chain order.
36228 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
36229 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
36230 St->getMemOperand());
36233 // Otherwise, lower to two pairs of 32-bit loads / stores.
36234 SDValue LoAddr = Ld->getBasePtr();
36235 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
36237 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
36238 Ld->getPointerInfo(), Ld->getAlignment(),
36239 Ld->getMemOperand()->getFlags());
36240 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
36241 Ld->getPointerInfo().getWithOffset(4),
36242 MinAlign(Ld->getAlignment(), 4),
36243 Ld->getMemOperand()->getFlags());
36244 // Make sure new loads are placed in same chain order.
36245 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
36246 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
36248 LoAddr = St->getBasePtr();
36249 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
36252 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
36253 St->getAlignment(), St->getMemOperand()->getFlags());
36254 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
36255 St->getPointerInfo().getWithOffset(4),
36256 MinAlign(St->getAlignment(), 4),
36257 St->getMemOperand()->getFlags());
36258 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
36261 // This is similar to the above case, but here we handle a scalar 64-bit
36262 // integer store that is extracted from a vector on a 32-bit target.
36263 // If we have SSE2, then we can treat it like a floating-point double
36264 // to get past legalization. The execution dependencies fixup pass will
36265 // choose the optimal machine instruction for the store if this really is
36266 // an integer or v2f32 rather than an f64.
36267 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
36268 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
36269 SDValue OldExtract = St->getOperand(1);
36270 SDValue ExtOp0 = OldExtract.getOperand(0);
36271 unsigned VecSize = ExtOp0.getValueSizeInBits();
36272 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
36273 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
36274 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
36275 BitCast, OldExtract.getOperand(1));
36276 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
36277 St->getPointerInfo(), St->getAlignment(),
36278 St->getMemOperand()->getFlags());
36284 /// Return 'true' if this vector operation is "horizontal"
36285 /// and return the operands for the horizontal operation in LHS and RHS. A
36286 /// horizontal operation performs the binary operation on successive elements
36287 /// of its first operand, then on successive elements of its second operand,
36288 /// returning the resulting values in a vector. For example, if
36289 /// A = < float a0, float a1, float a2, float a3 >
36291 /// B = < float b0, float b1, float b2, float b3 >
36292 /// then the result of doing a horizontal operation on A and B is
36293 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
36294 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
36295 /// A horizontal-op B, for some already available A and B, and if so then LHS is
36296 /// set to A, RHS to B, and the routine returns 'true'.
36297 /// Note that the binary operation should have the property that if one of the
36298 /// operands is UNDEF then the result is UNDEF.
36299 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
36300 // Look for the following pattern: if
36301 // A = < float a0, float a1, float a2, float a3 >
36302 // B = < float b0, float b1, float b2, float b3 >
36304 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
36305 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
36306 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
36307 // which is A horizontal-op B.
36309 // At least one of the operands should be a vector shuffle.
36310 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
36311 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
36314 MVT VT = LHS.getSimpleValueType();
36316 assert((VT.is128BitVector() || VT.is256BitVector()) &&
36317 "Unsupported vector type for horizontal add/sub");
36319 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
36320 // operate independently on 128-bit lanes.
36321 unsigned NumElts = VT.getVectorNumElements();
36322 unsigned NumLanes = VT.getSizeInBits()/128;
36323 unsigned NumLaneElts = NumElts / NumLanes;
36324 assert((NumLaneElts % 2 == 0) &&
36325 "Vector type should have an even number of elements in each lane");
36326 unsigned HalfLaneElts = NumLaneElts/2;
36328 // View LHS in the form
36329 // LHS = VECTOR_SHUFFLE A, B, LMask
36330 // If LHS is not a shuffle then pretend it is the shuffle
36331 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
  // NOTE: in what follows a default initialized SDValue represents an UNDEF of
  // type VT.
  SDValue A, B;
  SmallVector<int, 16> LMask(NumElts);
36336 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
36337 if (!LHS.getOperand(0).isUndef())
36338 A = LHS.getOperand(0);
36339 if (!LHS.getOperand(1).isUndef())
36340 B = LHS.getOperand(1);
36341 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
36342 std::copy(Mask.begin(), Mask.end(), LMask.begin());
36344 if (!LHS.isUndef())
36346 for (unsigned i = 0; i != NumElts; ++i)
36350 // Likewise, view RHS in the form
36351 // RHS = VECTOR_SHUFFLE C, D, RMask
  SDValue C, D;
  SmallVector<int, 16> RMask(NumElts);
36354 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
36355 if (!RHS.getOperand(0).isUndef())
36356 C = RHS.getOperand(0);
36357 if (!RHS.getOperand(1).isUndef())
36358 D = RHS.getOperand(1);
36359 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
36360 std::copy(Mask.begin(), Mask.end(), RMask.begin());
36362 if (!RHS.isUndef())
36364 for (unsigned i = 0; i != NumElts; ++i)
36368 // Check that the shuffles are both shuffling the same vectors.
36369 if (!(A == C && B == D) && !(A == D && B == C))
36372 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
36373 if (!A.getNode() && !B.getNode())
36376 // If A and B occur in reverse order in RHS, then "swap" them (which means
36377 // rewriting the mask).
36379 ShuffleVectorSDNode::commuteMask(RMask);
36381 // At this point LHS and RHS are equivalent to
36382 // LHS = VECTOR_SHUFFLE A, B, LMask
36383 // RHS = VECTOR_SHUFFLE A, B, RMask
36384 // Check that the masks correspond to performing a horizontal operation.
36385 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
36386 for (unsigned i = 0; i != NumLaneElts; ++i) {
36387 int LIdx = LMask[i+l], RIdx = RMask[i+l];
36389 // Ignore any UNDEF components.
36390 if (LIdx < 0 || RIdx < 0 ||
36391 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
36392 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
36395 // Check that successive elements are being operated on. If not, this is
36396 // not a horizontal operation.
36397 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
36398 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
36399 if (!(LIdx == Index && RIdx == Index + 1) &&
36400 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
36405 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
36406 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
36410 /// Do target-specific dag combines on floating-point adds/subs.
36411 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
36412 const X86Subtarget &Subtarget) {
36413 EVT VT = N->getValueType(0);
36414 SDValue LHS = N->getOperand(0);
36415 SDValue RHS = N->getOperand(1);
36416 bool IsFadd = N->getOpcode() == ISD::FADD;
36417 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
36419 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
36420 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
36421 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
36422 isHorizontalBinOp(LHS, RHS, IsFadd)) {
36423 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
36424 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
36429 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
36431 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
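///
/// Illustrative example (types chosen for exposition only):
///   trunc (mul (sext v4i32 a to v4i64), (sext v4i32 b to v4i64)) to v4i32
/// can instead multiply in v4i32 directly, since truncating each extended
/// operand is free and the wide i64 multiply is avoided.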
36432 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
36433 const X86Subtarget &Subtarget,
36435 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
36436 SDValue Src = N->getOperand(0);
36437 unsigned Opcode = Src.getOpcode();
36438 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36440 EVT VT = N->getValueType(0);
36441 EVT SrcVT = Src.getValueType();
36443 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
36444 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
36446 // Repeated operand, so we are only trading one output truncation for
36447 // one input truncation.
36451 // See if either operand has been extended from a smaller/equal size to
36452 // the truncation size, allowing a truncation to combine with the extend.
36453 unsigned Opcode0 = Op0.getOpcode();
36454 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
36455 Opcode0 == ISD::ZERO_EXTEND) &&
36456 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36459 unsigned Opcode1 = Op1.getOpcode();
36460 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
36461 Opcode1 == ISD::ZERO_EXTEND) &&
36462 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
36465 // See if either operand is a single use constant which can be constant
36467 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
36468 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
36469 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
36470 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
36473 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
36474 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
36475 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
36476 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
36479 // Don't combine if the operation has other uses.
36480 if (!N->isOnlyUserOf(Src.getNode()))
36483 // Only support vector truncation for now.
36484 // TODO: i64 scalar math would benefit as well.
36485 if (!VT.isVector())
36488 // In most cases its only worth pre-truncating if we're only facing the cost
36489 // of one truncation.
36490 // i.e. if one of the inputs will constant fold or the input is repeated.
36495 SDValue Op0 = Src.getOperand(0);
36496 SDValue Op1 = Src.getOperand(1);
36497 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
36498 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36499 return TruncateArithmetic(Op0, Op1);
  // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
  // better to truncate if we have the chance.
36506 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
36507 !TLI.isOperationLegal(Opcode, SrcVT))
36508 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
36511 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
36512 SDValue Op0 = Src.getOperand(0);
36513 SDValue Op1 = Src.getOperand(1);
36514 if (TLI.isOperationLegal(Opcode, VT) &&
36515 IsRepeatedOpOrFreeTruncation(Op0, Op1))
36516 return TruncateArithmetic(Op0, Op1);
36524 /// Truncate using ISD::AND mask and X86ISD::PACKUS.
36525 static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL,
36526 const X86Subtarget &Subtarget,
36527 SelectionDAG &DAG) {
36528 SDValue In = N->getOperand(0);
36529 EVT InVT = In.getValueType();
36530 EVT InSVT = InVT.getVectorElementType();
36531 EVT OutVT = N->getValueType(0);
36532 EVT OutSVT = OutVT.getVectorElementType();
36534 // Split a long vector into vectors of legal type and mask to unset all bits
36535 // that won't appear in the result to prevent saturation.
36536 // TODO - we should be doing this at the maximum legal size but this is
36537 // causing regressions where we're concatenating back to max width just to
36538 // perform the AND and then extracting back again.....
36539 unsigned NumSubRegs = InVT.getSizeInBits() / 128;
36540 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
36541 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
36542 SmallVector<SDValue, 8> SubVecs(NumSubRegs);
36545 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
36546 SDValue MaskVal = DAG.getConstant(Mask, DL, SubRegVT);
36548 for (unsigned i = 0; i < NumSubRegs; i++) {
36549 SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
36550 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
36551 SubVecs[i] = DAG.getNode(ISD::AND, DL, SubRegVT, Sub, MaskVal);
36553 In = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, SubVecs);
36555 return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget);
36558 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
36559 static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL,
36560 const X86Subtarget &Subtarget,
36561 SelectionDAG &DAG) {
36562 SDValue In = N->getOperand(0);
36563 EVT InVT = In.getValueType();
36564 EVT OutVT = N->getValueType(0);
36565 In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In,
36566 DAG.getValueType(OutVT));
36567 return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget);
36570 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
36571 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
36572 /// legalization the truncation will be translated into a BUILD_VECTOR with each
36573 /// element that is extracted from a vector and then truncated, and it is
36574 /// difficult to do this optimization based on them.
36575 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
36576 const X86Subtarget &Subtarget) {
36577 EVT OutVT = N->getValueType(0);
36578 if (!OutVT.isVector())
36581 SDValue In = N->getOperand(0);
36582 if (!In.getValueType().isSimple())
36585 EVT InVT = In.getValueType();
36586 unsigned NumElems = OutVT.getVectorNumElements();
36588 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
36589 // SSE2, and we need to take care of it specially.
36590 // AVX512 provides vpmovdb.
36591 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
36594 EVT OutSVT = OutVT.getVectorElementType();
36595 EVT InSVT = InVT.getVectorElementType();
36596 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
36597 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
  // SSSE3's pshufb results in fewer instructions in the cases below.
36602 if (Subtarget.hasSSSE3() && NumElems == 8 &&
36603 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
36604 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
36608 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
36609 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
36610 // truncate 2 x v4i32 to v8i16.
36611 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
36612 return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG);
36613 if (InSVT == MVT::i32)
36614 return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG);
/// This function transforms vector truncation of 'extended sign-bits' or
/// 'extended zero-bits' values; it turns vXi16/vXi32/vXi64 truncations to
/// vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations.
36622 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
36624 const X86Subtarget &Subtarget) {
36625 // Requires SSE2 but AVX512 has fast truncate.
36626 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36629 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
36632 SDValue In = N->getOperand(0);
36633 if (!In.getValueType().isSimple())
36636 MVT VT = N->getValueType(0).getSimpleVT();
36637 MVT SVT = VT.getScalarType();
36639 MVT InVT = In.getValueType().getSimpleVT();
36640 MVT InSVT = InVT.getScalarType();
36642 // Check we have a truncation suited for PACKSS/PACKUS.
36643 if (!VT.is128BitVector() && !VT.is256BitVector())
36645 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
36647 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
36650 unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
36651 unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8;
36653 // Use PACKUS if the input has zero-bits that extend all the way to the
36654 // packed/truncated value. e.g. masks, zext_in_reg, etc.
36656 DAG.computeKnownBits(In, Known);
36657 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
36658 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits))
36659 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
36661 // Use PACKSS if the input has sign-bits that extend all the way to the
36662 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
36663 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
36664 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
36665 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
36670 // Try to form a MULHU or MULHS node by looking for
36671 // (trunc (srl (mul ext, ext), 16))
36672 // TODO: This is X86 specific because we want to be able to handle wide types
36673 // before type legalization. But we can only do it if the vector will be
36674 // legalized via widening/splitting. Type legalization can't handle promotion
// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
// combiner.
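//
// Illustrative example (types chosen for exposition only):
//   trunc (srl (mul (zext v8i16 a to v8i32), (zext v8i16 b to v8i32)), 16)
// is the high half of an unsigned 16x16 multiply, i.e. (mulhu a, b), which
// selects to PMULHUW; the sign-extended form maps to MULHS/PMULHW.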
36677 static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
36678 SelectionDAG &DAG, const X86Subtarget &Subtarget) {
36679 // First instruction should be a right shift of a multiply.
36680 if (Src.getOpcode() != ISD::SRL ||
36681 Src.getOperand(0).getOpcode() != ISD::MUL)
36684 if (!Subtarget.hasSSE2())
36687 // Only handle vXi16 types that are at least 128-bits.
36688 if (!VT.isVector() || VT.getVectorElementType() != MVT::i16 ||
36689 VT.getVectorNumElements() < 8)
36692 // Input type should be vXi32.
36693 EVT InVT = Src.getValueType();
36694 if (InVT.getVectorElementType() != MVT::i32)
36697 // Need a shift by 16.
36699 if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) ||
36703 SDValue LHS = Src.getOperand(0).getOperand(0);
36704 SDValue RHS = Src.getOperand(0).getOperand(1);
36706 unsigned ExtOpc = LHS.getOpcode();
36707 if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
36708 RHS.getOpcode() != ExtOpc)
36711 // Peek through the extends.
36712 LHS = LHS.getOperand(0);
36713 RHS = RHS.getOperand(0);
36715 // Ensure the input types match.
36716 if (LHS.getValueType() != VT || RHS.getValueType() != VT)
36719 unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
36720 return DAG.getNode(Opc, DL, VT, LHS, RHS);
36723 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
36724 const X86Subtarget &Subtarget) {
36725 EVT VT = N->getValueType(0);
36726 SDValue Src = N->getOperand(0);
36729 // Attempt to pre-truncate inputs to arithmetic ops instead.
36730 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
36733 // Try to detect AVG pattern first.
36734 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
36737 // Try to combine truncation with signed/unsigned saturation.
36738 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
36741 // Try to combine PMULHUW/PMULHW for vXi16.
36742 if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget))
36745 // The bitcast source is a direct mmx result.
36746 // Detect bitcasts between i32 to x86mmx
36747 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
36748 SDValue BCSrc = Src.getOperand(0);
36749 if (BCSrc.getValueType() == MVT::x86mmx)
36750 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
36753 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
36754 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
36757 return combineVectorTruncation(N, DAG, Subtarget);
36760 /// Returns the negated value if the node \p N flips sign of FP value.
36762 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
36763 /// AVX512F does not have FXOR, so FNEG is lowered as
36764 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
/// In this case we go through all bitcasts.
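///
/// For illustration (for exposition only): a scalar f32 negation may appear as
///   bitcast (xor (bitcast f32 x to i32), 0x80000000) to f32
/// and this helper looks through those bitcasts to recover x.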
36766 static SDValue isFNEG(SDNode *N) {
36767 if (N->getOpcode() == ISD::FNEG)
36768 return N->getOperand(0);
36770 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
36771 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
36774 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
36775 if (!Op1.getValueType().isFloatingPoint())
36778 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
36780 unsigned EltBits = Op1.getScalarValueSizeInBits();
36781 auto isSignMask = [&](const ConstantFP *C) {
36782 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
36785 // There is more than one way to represent the same constant on
36786 // the different X86 targets. The type of the node may also depend on size.
36787 // - load scalar value and broadcast
36788 // - BUILD_VECTOR node
36789 // - load from a constant pool.
36790 // We check all variants here.
36791 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
36792 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
36793 if (isSignMask(cast<ConstantFP>(C)))
36796 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
36797 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
36798 if (isSignMask(CN->getConstantFPValue()))
36801 } else if (auto *C = getTargetConstantFromNode(Op1)) {
36802 if (C->getType()->isVectorTy()) {
36803 if (auto *SplatV = C->getSplatValue())
36804 if (isSignMask(cast<ConstantFP>(SplatV)))
36806 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
36807 if (isSignMask(FPConst))
36813 /// Do target-specific dag combines on floating point negations.
36814 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
36815 const X86Subtarget &Subtarget) {
36816 EVT OrigVT = N->getValueType(0);
36817 SDValue Arg = isFNEG(N);
36818 assert(Arg.getNode() && "N is expected to be an FNEG node");
36820 EVT VT = Arg.getValueType();
36821 EVT SVT = VT.getScalarType();
36824 // Let legalize expand this if it isn't a legal type yet.
36825 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36828 // If we're negating an FMUL node on a target with FMA, then we can avoid the
36829 // use of a constant by performing (-0 - A*B) instead.
36830 // FIXME: Check rounding control flags as well once it becomes available.
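// i.e. (fneg (fmul A, B)) --> (X86ISD::FNMSUB A, B, 0.0), which computes -(A*B) - 0.0.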
36831 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
36832 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
36833 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
36834 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
36835 Arg.getOperand(1), Zero);
36836 return DAG.getBitcast(OrigVT, NewNode);
36839 // If we're negating an FMA node, then we can adjust the
36840 // instruction to include the extra negation.
36841 unsigned NewOpcode = 0;
36842 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
36843 switch (Arg.getOpcode()) {
36844 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
36845 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
36846 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
36847 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
36848 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
36849 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
36850 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
36851 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
36852 // We can't handle scalar intrinsic node here because it would only
36853 // invert one element and not the whole vector. But we could try to handle
36854 // a negation of the lower element only.
36858 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
36859 Arg.getNode()->ops()));
36864 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
36865 const X86Subtarget &Subtarget) {
36866 MVT VT = N->getSimpleValueType(0);
36867 // If we have integer vector types available, use the integer opcodes.
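// e.g. (v4f32 FAND X, Y) --> (v4f32 bitcast (v2i64 and (bitcast X), (bitcast Y)))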
36868 if (VT.isVector() && Subtarget.hasSSE2()) {
36871 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
36873 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
36874 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
36875 unsigned IntOpcode;
36876 switch (N->getOpcode()) {
36877 default: llvm_unreachable("Unexpected FP logic op");
36878 case X86ISD::FOR: IntOpcode = ISD::OR; break;
36879 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
36880 case X86ISD::FAND: IntOpcode = ISD::AND; break;
36881 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
36883 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
36884 return DAG.getBitcast(VT, IntOp);
36890 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
36891 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
36892 if (N->getOpcode() != ISD::XOR)
36895 SDValue LHS = N->getOperand(0);
36896 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
36897 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
36900 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
36901 X86::CondCode(LHS->getConstantOperandVal(0)));
36903 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
36906 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
36907 TargetLowering::DAGCombinerInfo &DCI,
36908 const X86Subtarget &Subtarget) {
36909 // If this is an SSE1-only target, convert to FXOR to avoid scalarization.
36910 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
36911 N->getValueType(0) == MVT::v4i32) {
36912 return DAG.getBitcast(
36913 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
36914 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
36915 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
36918 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
36921 if (DCI.isBeforeLegalizeOps())
36924 if (SDValue SetCC = foldXor1SetCC(N, DAG))
36927 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
36930 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36934 return combineFneg(N, DAG, Subtarget);
36938 static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
36939 TargetLowering::DAGCombinerInfo &DCI,
36940 const X86Subtarget &Subtarget) {
36941 SDValue Op0 = N->getOperand(0);
36942 SDValue Op1 = N->getOperand(1);
36943 EVT VT = N->getValueType(0);
36944 unsigned NumBits = VT.getSizeInBits();
36946 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
36947 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
36948 !DCI.isBeforeLegalizeOps());
36950 // TODO - Constant Folding.
36951 if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
36952 // Reduce Cst1 to the bottom 16-bits.
36953 // NOTE: SimplifyDemandedBits won't do this for constants.
36954 const APInt &Val1 = Cst1->getAPIntValue();
36955 APInt MaskedVal1 = Val1 & 0xFFFF;
36956 if (MaskedVal1 != Val1)
36957 return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
36958 DAG.getConstant(MaskedVal1, SDLoc(N), VT));
36961 // Only the bottom 16 bits of the control operand are required.
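// (The BEXTR control encodes the start position in bits [7:0] and the extract
// length in bits [15:8]; the instruction ignores any higher control bits.)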
36963 APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
36964 if (TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) {
36965 DCI.CommitTargetLoweringOpt(TLO);
36966 return SDValue(N, 0);
36972 static bool isNullFPScalarOrVectorConst(SDValue V) {
36973 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
36976 /// If a value is a scalar FP zero or a vector FP zero (potentially including
36977 /// undefined elements), return a zero constant that may be used to fold away
36978 /// that value. In the case of a vector, the returned constant will not contain
36979 /// undefined elements even if the input parameter does. This makes it suitable
36980 /// to be used as a replacement operand with operations (eg, bitwise-and) where
36981 /// an undef should not propagate.
36982 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
36983 const X86Subtarget &Subtarget) {
36984 if (!isNullFPScalarOrVectorConst(V))
36987 if (V.getValueType().isVector())
36988 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
36993 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
36994 const X86Subtarget &Subtarget) {
36995 SDValue N0 = N->getOperand(0);
36996 SDValue N1 = N->getOperand(1);
36997 EVT VT = N->getValueType(0);
37000 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
37001 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
37002 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
37003 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
37006 auto isAllOnesConstantFP = [](SDValue V) {
37007 if (V.getSimpleValueType().isVector())
37008 return ISD::isBuildVectorAllOnes(V.getNode());
37009 auto *C = dyn_cast<ConstantFPSDNode>(V);
37010 return C && C->getConstantFPValue()->isAllOnesValue();
37013 // fand (fxor X, -1), Y --> fandn X, Y
37014 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
37015 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
37017 // fand X, (fxor Y, -1) --> fandn Y, X
37018 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
37019 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
37024 /// Do target-specific dag combines on X86ISD::FAND nodes.
37025 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
37026 const X86Subtarget &Subtarget) {
37027 // FAND(0.0, x) -> 0.0
37028 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
37031 // FAND(x, 0.0) -> 0.0
37032 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37035 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
37038 return lowerX86FPLogicOp(N, DAG, Subtarget);
37041 /// Do target-specific dag combines on X86ISD::FANDN nodes.
37042 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
37043 const X86Subtarget &Subtarget) {
37044 // FANDN(0.0, x) -> x
37045 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37046 return N->getOperand(1);
37048 // FANDN(x, 0.0) -> 0.0
37049 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
37052 return lowerX86FPLogicOp(N, DAG, Subtarget);
37055 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
37056 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
37057 const X86Subtarget &Subtarget) {
37058 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
37060 // F[X]OR(0.0, x) -> x
37061 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
37062 return N->getOperand(1);
37064 // F[X]OR(x, 0.0) -> x
37065 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
37066 return N->getOperand(0);
37069 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
37072 return lowerX86FPLogicOp(N, DAG, Subtarget);
37075 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
37076 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
37077 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
37079 // Only perform optimizations if UnsafeMath is used.
37080 if (!DAG.getTarget().Options.UnsafeFPMath)
37083 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
37084 // into FMINC and FMAXC, which are Commutative operations.
37085 unsigned NewOp = 0;
37086 switch (N->getOpcode()) {
37087 default: llvm_unreachable("unknown opcode");
37088 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
37089 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
37092 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
37093 N->getOperand(0), N->getOperand(1));
37096 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
37097 const X86Subtarget &Subtarget) {
37098 if (Subtarget.useSoftFloat())
37101 // TODO: If an operand is already known to be a NaN or not a NaN, this
37102 // should be an optional swap and FMAX/FMIN.
37104 EVT VT = N->getValueType(0);
37105 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
37106 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
37107 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
37110 SDValue Op0 = N->getOperand(0);
37111 SDValue Op1 = N->getOperand(1);
37113 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
37115 // If we don't have to respect NaN inputs, this is a direct translation to x86
37116 // min/max instructions.
37117 if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs())
37118 return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags());
37120 // If we have to respect NaN inputs, this takes at least 3 instructions.
37121 // Favor a library call when operating on a scalar and minimizing code size.
37122 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
37125 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
37126 DAG.getDataLayout(), *DAG.getContext(), VT);
37128 // There are 4 possibilities involving NaN inputs, and these are the required
37129 // outputs:
37130 //                   Op1
37131 //               Num     NaN
37132 //            ----------------
37133 //     Num    |  Max  |  Op0 |
37134 // Op0        ----------------
37135 //     NaN    |  Op1  |  NaN |
37136 //            ----------------
37138 // The SSE FP max/min instructions were not designed for this case; rather, they implement:
37140 // Min = Op1 < Op0 ? Op1 : Op0
37141 // Max = Op1 > Op0 ? Op1 : Op0
37143 // So they always return Op0 if either input is a NaN. However, we can still
37144 // use those instructions for fmaxnum by selecting away a NaN input.
37146 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
37147 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
37148 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO);
37150 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
37151 // are NaN, the NaN value of Op1 is the result.
37152 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
37155 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
37156 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
37157 TargetLowering::DAGCombinerInfo &DCI,
37158 const X86Subtarget &Subtarget) {
37159 // ANDNP(0, x) -> x
37160 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
37161 return N->getOperand(1);
37163 // ANDNP(x, 0) -> 0
37164 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
37165 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
37167 EVT VT = N->getValueType(0);
37169 // Attempt to recursively combine a bitmask ANDNP with shuffles.
37170 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
37172 if (SDValue Res = combineX86ShufflesRecursively(
37173 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
37174 /*HasVarMask*/ false, DAG, Subtarget)) {
37175 DCI.CombineTo(N, Res);
37183 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
37184 TargetLowering::DAGCombinerInfo &DCI) {
37185 SDValue N0 = N->getOperand(0);
37186 SDValue N1 = N->getOperand(1);
37188 // BT ignores high bits in the bit index operand.
37189 unsigned BitWidth = N1.getValueSizeInBits();
37190 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
37191 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
37192 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
37197 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
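// e.g. (sext_inreg (cmov C0, C1, cc, flags), i16)
//        --> (cmov (sext_inreg C0, i16), (sext_inreg C1, i16), cc, flags)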
37198 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
37199 EVT VT = N->getValueType(0);
37201 SDValue N0 = N->getOperand(0);
37202 SDValue N1 = N->getOperand(1);
37203 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37205 if (ExtraVT != MVT::i16)
37208 // Look through single use any_extends.
37209 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
37210 N0 = N0.getOperand(0);
37212 // See if we have a single use cmov.
37213 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
37216 SDValue CMovOp0 = N0.getOperand(0);
37217 SDValue CMovOp1 = N0.getOperand(1);
37219 // Make sure both operands are constants.
37220 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37221 !isa<ConstantSDNode>(CMovOp1.getNode()))
37226 // If we looked through an any_extend above, any_extend the constants to match.
37227 if (N0.getValueType() != VT) {
37228 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
37229 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
37232 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
37233 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
37235 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
37236 N0.getOperand(2), N0.getOperand(3));
37239 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
37240 const X86Subtarget &Subtarget) {
37241 if (SDValue V = combineSextInRegCmov(N, DAG))
37244 EVT VT = N->getValueType(0);
37245 SDValue N0 = N->getOperand(0);
37246 SDValue N1 = N->getOperand(1);
37247 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
37250 // A SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and AVX2
37251 // since there is no sign-extended shift right operation on a vector with
37252 // 64-bit elements.
37253 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
37254 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
37255 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
37256 N0.getOpcode() == ISD::SIGN_EXTEND)) {
37257 SDValue N00 = N0.getOperand(0);
37259 // EXTLOAD has a better solution on AVX2:
37260 // it may be replaced with an X86ISD::VSEXT node.
37261 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
37262 if (!ISD::isNormalLoad(N00.getNode()))
37265 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
37266 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
37268 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
37274 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
37275 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
37276 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
37277 /// opportunities to combine math ops, use an LEA, or use a complex addressing
37278 /// mode. This can eliminate extend, add, and shift instructions.
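/// For example, (i64 sext (add nsw (i32 x), 42)) --> (add nsw (i64 sext x), 42),
/// but only when the extended value feeds another add or shl (LEA potential).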
37279 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
37280 const X86Subtarget &Subtarget) {
37281 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
37282 Ext->getOpcode() != ISD::ZERO_EXTEND)
37285 // TODO: This should be valid for other integer types.
37286 EVT VT = Ext->getValueType(0);
37287 if (VT != MVT::i64)
37290 SDValue Add = Ext->getOperand(0);
37291 if (Add.getOpcode() != ISD::ADD)
37294 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
37295 bool NSW = Add->getFlags().hasNoSignedWrap();
37296 bool NUW = Add->getFlags().hasNoUnsignedWrap();
37298 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into the 'zext'.
37300 if ((Sext && !NSW) || (!Sext && !NUW))
37303 // Having a constant operand to the 'add' ensures that we are not increasing
37304 // the instruction count because the constant is extended for free below.
37305 // A constant operand can also become the displacement field of an LEA.
37306 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
37310 // Don't make the 'add' bigger if there's no hope of combining it with some
37311 // other 'add' or 'shl' instruction.
37312 // TODO: It may be profitable to generate simpler LEA instructions in place
37313 // of single 'add' instructions, but the cost model for selecting an LEA
37314 // currently has a high threshold.
37315 bool HasLEAPotential = false;
37316 for (auto *User : Ext->uses()) {
37317 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
37318 HasLEAPotential = true;
37322 if (!HasLEAPotential)
37325 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
37326 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
37327 SDValue AddOp0 = Add.getOperand(0);
37328 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
37329 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
37331 // The wider add is guaranteed to not wrap because both operands are sign- or zero-extended from narrower values.
37334 Flags.setNoSignedWrap(NSW);
37335 Flags.setNoUnsignedWrap(NUW);
37336 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
37339 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
37340 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
37341 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
37342 /// extends from AH (which we otherwise need to do contortions to access).
37343 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
37344 SDValue N0 = N->getOperand(0);
37345 auto OpcodeN = N->getOpcode();
37346 auto OpcodeN0 = N0.getOpcode();
37347 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
37348 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
37351 EVT VT = N->getValueType(0);
37352 EVT InVT = N0.getValueType();
37353 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
37354 !(VT == MVT::i32 || VT == MVT::i64))
37357 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
37358 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
37359 : X86ISD::UDIVREM8_ZEXT_HREG;
37360 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
37362 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
37363 // If this was a 64-bit extend, complete it.
37364 if (VT == MVT::i64)
37365 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
37366 return R.getValue(1);
37369 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
37370 // operands and the result of CMOV is not used anywhere else - promote CMOV
37371 // itself instead of promoting its result. This could be beneficial, because:
37372 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
37373 // (or more) pseudo-CMOVs only when they go one-after-another and
37374 // getting rid of result extension code after CMOV will help that.
37375 // 2) Promotion of constant CMOV arguments is free, hence the
37376 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
37377 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
37378 // promotion is also good in terms of code-size.
37379 // (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit promotion.)
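// e.g. (i32 zext (i16 cmov C0, C1, cc, flags))
//        --> (i32 cmov (zext C0), (zext C1), cc, flags)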
37381 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
37382 SDValue CMovN = Extend->getOperand(0);
37383 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
37386 EVT TargetVT = Extend->getValueType(0);
37387 unsigned ExtendOpcode = Extend->getOpcode();
37390 EVT VT = CMovN.getValueType();
37391 SDValue CMovOp0 = CMovN.getOperand(0);
37392 SDValue CMovOp1 = CMovN.getOperand(1);
37394 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
37395 !isa<ConstantSDNode>(CMovOp1.getNode()))
37398 // Only extend to i32 or i64.
37399 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
37402 // Only extend from i16 unless it's a sign_extend from i32 (a zext/aext from i32 to i64 is already free).
37404 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
37407 // If this is a zero extend to i64, we should only extend to i32 and use a free
37408 // zero extend to finish.
37409 EVT ExtendVT = TargetVT;
37410 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
37411 ExtendVT = MVT::i32;
37413 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
37414 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
37416 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
37417 CMovN.getOperand(2), CMovN.getOperand(3));
37419 // Finish extending if needed.
37420 if (ExtendVT != TargetVT)
37421 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
37426 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
37427 // This is more or less the reverse of combineBitcastvxi1.
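// e.g. for (v16i8 sext (v16i1 bitcast (i16 x))): broadcast x to every lane,
// AND each lane with its own bit, then compare against that bit mask.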
37429 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
37430 TargetLowering::DAGCombinerInfo &DCI,
37431 const X86Subtarget &Subtarget) {
37432 unsigned Opcode = N->getOpcode();
37433 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
37434 Opcode != ISD::ANY_EXTEND)
37436 if (!DCI.isBeforeLegalizeOps())
37438 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
37441 SDValue N0 = N->getOperand(0);
37442 EVT VT = N->getValueType(0);
37443 EVT SVT = VT.getScalarType();
37444 EVT InSVT = N0.getValueType().getScalarType();
37445 unsigned EltSizeInBits = SVT.getSizeInBits();
37447 // Input type must be extending a bool vector (bit-casted from a scalar
37448 // integer) to legal integer types.
37449 if (!VT.isVector())
37451 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
37453 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
37456 SDValue N00 = N0.getOperand(0);
37457 EVT SclVT = N0.getOperand(0).getValueType();
37458 if (!SclVT.isScalarInteger())
37463 SmallVector<int, 32> ShuffleMask;
37464 unsigned NumElts = VT.getVectorNumElements();
37465 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
37467 // Broadcast the scalar integer to the vector elements.
37468 if (NumElts > EltSizeInBits) {
37469 // If the scalar integer is greater than the vector element size, then we
37470 // must split it down into sub-sections for broadcasting. For example:
37471 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
37472 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
37473 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
37474 unsigned Scale = NumElts / EltSizeInBits;
37476 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
37477 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
37478 Vec = DAG.getBitcast(VT, Vec);
37480 for (unsigned i = 0; i != Scale; ++i)
37481 ShuffleMask.append(EltSizeInBits, i);
37483 // For smaller scalar integers, we can simply any-extend it to the vector
37484 // element size (we don't care about the upper bits) and broadcast it to all
37486 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
37487 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
37488 ShuffleMask.append(NumElts, 0);
37490 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
37492 // Now, mask the relevant bit in each element.
37493 SmallVector<SDValue, 32> Bits;
37494 for (unsigned i = 0; i != NumElts; ++i) {
37495 int BitIdx = (i % EltSizeInBits);
37496 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
37497 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
37499 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
37500 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
37502 // Compare against the bitmask and extend the result.
37503 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
37504 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
37505 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
37507 // For SEXT, this is now done; otherwise shift the result down for zero extension.
37509 if (Opcode == ISD::SIGN_EXTEND)
37511 return DAG.getNode(ISD::SRL, DL, VT, Vec,
37512 DAG.getConstant(EltSizeInBits - 1, DL, VT));
37515 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
37516 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting the input (or concatenating
37517 /// it with UNDEFs) into vectors of the same size as the target type; the
37518 /// *_EXTEND_VECTOR_INREG node then extends the lowest elements.
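/// e.g. on a pre-AVX2 target, (v4i64 sext (v4i32 x)) becomes two 128-bit
/// SIGN_EXTEND_VECTOR_INREG nodes on the low and high v2i32 halves of x,
/// concatenated back into a v4i64.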
37519 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
37520 TargetLowering::DAGCombinerInfo &DCI,
37521 const X86Subtarget &Subtarget) {
37522 unsigned Opcode = N->getOpcode();
37523 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
37525 if (!DCI.isBeforeLegalizeOps())
37527 if (!Subtarget.hasSSE2())
37530 SDValue N0 = N->getOperand(0);
37531 EVT VT = N->getValueType(0);
37532 EVT SVT = VT.getScalarType();
37533 EVT InVT = N0.getValueType();
37534 EVT InSVT = InVT.getScalarType();
37536 // Input type must be a vector and we must be extending legal integer types.
37537 if (!VT.isVector())
37539 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
37541 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
37544 // On AVX2+ targets, if the input/output types are both legal then we will be
37545 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
37546 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
37547 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
37552 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
37553 EVT InVT = N.getValueType();
37554 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
37555 Size / InVT.getScalarSizeInBits());
37556 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
37557 DAG.getUNDEF(InVT));
37559 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
37562 // If target-size is less than 128-bits, extend to a type that would extend
37563 // to 128 bits, extend that and extract the original target vector.
37564 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
37565 unsigned Scale = 128 / VT.getSizeInBits();
37567 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
37568 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
37569 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
37570 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
37571 DAG.getIntPtrConstant(0, DL));
37574 // If the target size is 128 bits (or 256 bits on AVX2 targets, or 512 bits
37575 // with AVX512), convert to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering
37576 // to X86ISD::V*EXT. Also use this if we don't have SSE41, to let the legalizer do its job.
37577 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
37578 (VT.is256BitVector() && Subtarget.hasInt256()) ||
37579 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
37580 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
37581 return Opcode == ISD::SIGN_EXTEND
37582 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
37583 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
37586 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
37587 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
37588 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
37589 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
37590 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
37592 SmallVector<SDValue, 8> Opnds;
37593 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
37594 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
37595 DAG.getIntPtrConstant(Offset, DL));
37596 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
37597 SrcVec = Opcode == ISD::SIGN_EXTEND
37598 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
37599 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
37600 Opnds.push_back(SrcVec);
37602 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
37605 // On pre-AVX2 targets, split into 128-bit nodes of
37606 // ISD::*_EXTEND_VECTOR_INREG.
37607 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
37608 return SplitAndExtendInReg(128);
37610 // On pre-AVX512 targets, split into 256-bit nodes of
37611 // ISD::*_EXTEND_VECTOR_INREG.
37612 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
37613 return SplitAndExtendInReg(256);
37618 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm register.
37620 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
37621 const X86Subtarget &Subtarget) {
37622 SDValue N0 = N->getOperand(0);
37623 EVT VT = N->getValueType(0);
37626 // Only do this combine with AVX512 for vector extends.
37627 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
37630 // Only combine legal element types.
37631 EVT SVT = VT.getVectorElementType();
37632 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
37633 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
37636 // We can only do this if the vector size is 256 bits or less.
37637 unsigned Size = VT.getSizeInBits();
37641 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
37642 // those are the only integer compares we have.
37643 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
37644 if (ISD::isUnsignedIntSetCC(CC))
37647 // Only do this combine if the extension will be fully consumed by the setcc.
37648 EVT N00VT = N0.getOperand(0).getValueType();
37649 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
37650 if (Size != MatchingVecType.getSizeInBits())
37653 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
37655 if (N->getOpcode() == ISD::ZERO_EXTEND)
37656 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
37661 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
37662 TargetLowering::DAGCombinerInfo &DCI,
37663 const X86Subtarget &Subtarget) {
37664 SDValue N0 = N->getOperand(0);
37665 EVT VT = N->getValueType(0);
37666 EVT InVT = N0.getValueType();
37669 if (SDValue DivRem8 = getDivRem8(N, DAG))
37672 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37675 if (!DCI.isBeforeLegalizeOps())
37678 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37681 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
37682 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
37683 // Inverting and sign-extending a boolean is the same as zero-extending it and
37684 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
37685 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
37686 // sext (xor Bool, -1) --> sub (zext Bool), 1
37687 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
37688 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
37691 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37694 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37698 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37701 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
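/// Given an FMA-family opcode, return the equivalent opcode with the multiply
/// result and/or the accumulator negated, e.g. FMADD becomes FNMADD when NegMul
/// is set and FMSUB when NegAcc is set.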
37707 static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc) {
37710 default: llvm_unreachable("Unexpected opcode");
37711 case ISD::FMA: Opcode = X86ISD::FNMADD; break;
37712 case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
37713 case X86ISD::FMADDS1: Opcode = X86ISD::FNMADDS1; break;
37714 case X86ISD::FMADDS3: Opcode = X86ISD::FNMADDS3; break;
37715 case X86ISD::FMADDS1_RND: Opcode = X86ISD::FNMADDS1_RND; break;
37716 case X86ISD::FMADDS3_RND: Opcode = X86ISD::FNMADDS3_RND; break;
37717 case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
37718 case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
37719 case X86ISD::FMSUBS1: Opcode = X86ISD::FNMSUBS1; break;
37720 case X86ISD::FMSUBS3: Opcode = X86ISD::FNMSUBS3; break;
37721 case X86ISD::FMSUBS1_RND: Opcode = X86ISD::FNMSUBS1_RND; break;
37722 case X86ISD::FMSUBS3_RND: Opcode = X86ISD::FNMSUBS3_RND; break;
37723 case X86ISD::FNMADD: Opcode = ISD::FMA; break;
37724 case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
37725 case X86ISD::FNMADDS1: Opcode = X86ISD::FMADDS1; break;
37726 case X86ISD::FNMADDS3: Opcode = X86ISD::FMADDS3; break;
37727 case X86ISD::FNMADDS1_RND: Opcode = X86ISD::FMADDS1_RND; break;
37728 case X86ISD::FNMADDS3_RND: Opcode = X86ISD::FMADDS3_RND; break;
37729 case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
37730 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
37731 case X86ISD::FNMSUBS1: Opcode = X86ISD::FMSUBS1; break;
37732 case X86ISD::FNMSUBS3: Opcode = X86ISD::FMSUBS3; break;
37733 case X86ISD::FNMSUBS1_RND: Opcode = X86ISD::FMSUBS1_RND; break;
37734 case X86ISD::FNMSUBS3_RND: Opcode = X86ISD::FMSUBS3_RND; break;
37740 default: llvm_unreachable("Unexpected opcode");
37741 case ISD::FMA: Opcode = X86ISD::FMSUB; break;
37742 case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
37743 case X86ISD::FMADDS1: Opcode = X86ISD::FMSUBS1; break;
37744 case X86ISD::FMADDS3: Opcode = X86ISD::FMSUBS3; break;
37745 case X86ISD::FMADDS1_RND: Opcode = X86ISD::FMSUBS1_RND; break;
37746 case X86ISD::FMADDS3_RND: Opcode = X86ISD::FMSUBS3_RND; break;
37747 case X86ISD::FMSUB: Opcode = ISD::FMA; break;
37748 case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
37749 case X86ISD::FMSUBS1: Opcode = X86ISD::FMADDS1; break;
37750 case X86ISD::FMSUBS3: Opcode = X86ISD::FMADDS3; break;
37751 case X86ISD::FMSUBS1_RND: Opcode = X86ISD::FMADDS1_RND; break;
37752 case X86ISD::FMSUBS3_RND: Opcode = X86ISD::FMADDS3_RND; break;
37753 case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
37754 case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
37755 case X86ISD::FNMADDS1: Opcode = X86ISD::FNMSUBS1; break;
37756 case X86ISD::FNMADDS3: Opcode = X86ISD::FNMSUBS3; break;
37757 case X86ISD::FNMADDS1_RND: Opcode = X86ISD::FNMSUBS1_RND; break;
37758 case X86ISD::FNMADDS3_RND: Opcode = X86ISD::FNMSUBS3_RND; break;
37759 case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
37760 case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
37761 case X86ISD::FNMSUBS1: Opcode = X86ISD::FNMADDS1; break;
37762 case X86ISD::FNMSUBS3: Opcode = X86ISD::FNMADDS3; break;
37763 case X86ISD::FNMSUBS1_RND: Opcode = X86ISD::FNMADDS1_RND; break;
37764 case X86ISD::FNMSUBS3_RND: Opcode = X86ISD::FNMADDS3_RND; break;
37771 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
37772 const X86Subtarget &Subtarget) {
37774 EVT VT = N->getValueType(0);
37776 // Let legalize expand this if it isn't a legal type yet.
37777 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
37780 EVT ScalarVT = VT.getScalarType();
37781 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
37784 SDValue A = N->getOperand(0);
37785 SDValue B = N->getOperand(1);
37786 SDValue C = N->getOperand(2);
37788 auto invertIfNegative = [&DAG](SDValue &V) {
37789 if (SDValue NegVal = isFNEG(V.getNode())) {
37790 V = DAG.getBitcast(V.getValueType(), NegVal);
37793 // Look through extract_vector_elts. If it comes from an FNEG, create a
37794 // new extract from the FNEG input.
37795 if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
37796 isa<ConstantSDNode>(V.getOperand(1)) &&
37797 cast<ConstantSDNode>(V.getOperand(1))->getZExtValue() == 0) {
37798 if (SDValue NegVal = isFNEG(V.getOperand(0).getNode())) {
37799 NegVal = DAG.getBitcast(V.getOperand(0).getValueType(), NegVal);
37800 V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
37801 NegVal, V.getOperand(1));
37809 bool IsScalarS1 = N->getOpcode() == X86ISD::FMADDS1 ||
37810 N->getOpcode() == X86ISD::FMSUBS1 ||
37811 N->getOpcode() == X86ISD::FNMADDS1 ||
37812 N->getOpcode() == X86ISD::FNMSUBS1 ||
37813 N->getOpcode() == X86ISD::FMADDS1_RND ||
37814 N->getOpcode() == X86ISD::FMSUBS1_RND ||
37815 N->getOpcode() == X86ISD::FNMADDS1_RND ||
37816 N->getOpcode() == X86ISD::FNMSUBS1_RND;
37817 bool IsScalarS3 = N->getOpcode() == X86ISD::FMADDS3 ||
37818 N->getOpcode() == X86ISD::FMSUBS3 ||
37819 N->getOpcode() == X86ISD::FNMADDS3 ||
37820 N->getOpcode() == X86ISD::FNMSUBS3 ||
37821 N->getOpcode() == X86ISD::FMADDS3_RND ||
37822 N->getOpcode() == X86ISD::FMSUBS3_RND ||
37823 N->getOpcode() == X86ISD::FNMADDS3_RND ||
37824 N->getOpcode() == X86ISD::FNMSUBS3_RND;
37826 // Do not convert the passthru input of scalar intrinsics.
37827 // FIXME: We could allow negations of the lower element only.
37828 bool NegA = !IsScalarS1 && invertIfNegative(A);
37829 bool NegB = invertIfNegative(B);
37830 bool NegC = !IsScalarS3 && invertIfNegative(C);
37832 if (!NegA && !NegB && !NegC)
37835 unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC);
37837 if (N->getNumOperands() == 4)
37838 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
37839 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37842 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
37843 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
37844 const X86Subtarget &Subtarget) {
37846 EVT VT = N->getValueType(0);
37848 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
37852 unsigned NewOpcode;
37853 switch (N->getOpcode()) {
37854 default: llvm_unreachable("Unexpected opcode!");
37855 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
37856 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
37857 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
37858 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
37861 if (N->getNumOperands() == 4)
37862 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37863 NegVal, N->getOperand(3));
37864 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37868 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
37869 TargetLowering::DAGCombinerInfo &DCI,
37870 const X86Subtarget &Subtarget) {
37871 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
37872 // (and (i32 x86isd::setcc_carry), 1)
37873 // This eliminates the zext. This transformation is necessary because
37874 // ISD::SETCC is always legalized to i8.
37876 SDValue N0 = N->getOperand(0);
37877 EVT VT = N->getValueType(0);
37879 if (N0.getOpcode() == ISD::AND &&
37881 N0.getOperand(0).hasOneUse()) {
37882 SDValue N00 = N0.getOperand(0);
37883 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37884 if (!isOneConstant(N0.getOperand(1)))
37886 return DAG.getNode(ISD::AND, dl, VT,
37887 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37888 N00.getOperand(0), N00.getOperand(1)),
37889 DAG.getConstant(1, dl, VT));
37893 if (N0.getOpcode() == ISD::TRUNCATE &&
37895 N0.getOperand(0).hasOneUse()) {
37896 SDValue N00 = N0.getOperand(0);
37897 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37898 return DAG.getNode(ISD::AND, dl, VT,
37899 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37900 N00.getOperand(0), N00.getOperand(1)),
37901 DAG.getConstant(1, dl, VT));
37905 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37908 if (DCI.isBeforeLegalizeOps())
37909 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37912 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37915 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37919 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37922 if (SDValue DivRem8 = getDivRem8(N, DAG))
37925 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37928 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
37934 /// Try to map a 128-bit or larger integer comparison to vector instructions
37935 /// before type legalization splits it up into chunks.
37936 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
37937 const X86Subtarget &Subtarget) {
37938 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
37939 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
37941 // We're looking for an oversized integer equality comparison.
37942 SDValue X = SetCC->getOperand(0);
37943 SDValue Y = SetCC->getOperand(1);
37944 EVT OpVT = X.getValueType();
37945 unsigned OpSize = OpVT.getSizeInBits();
37946 if (!OpVT.isScalarInteger() || OpSize < 128)
37949 // Ignore a comparison with zero because that gets special treatment in
37950 // EmitTest(). But make an exception for the special case of a pair of
37951 // logically-combined vector-sized operands compared to zero. This pattern may
37952 // be generated by the memcmp expansion pass with oversized integer compares
37954 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
37955 X.getOperand(0).getOpcode() == ISD::XOR &&
37956 X.getOperand(1).getOpcode() == ISD::XOR;
37957 if (isNullConstant(Y) && !IsOrXorXorCCZero)
37960 // Bail out if we know that this is not really just an oversized integer.
37961 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
37962 peekThroughBitcasts(Y).getValueType() == MVT::f128)
37965 // TODO: Use PXOR + PTEST for SSE4.1 or later?
37966 // TODO: Add support for AVX-512.
37967 EVT VT = SetCC->getValueType(0);
37969 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
37970 (OpSize == 256 && Subtarget.hasAVX2())) {
37971 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
37973 if (IsOrXorXorCCZero) {
37974 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
37975 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
37976 // Use 2 vector equality compares and 'and' the results before doing a MOVMSK.
37978 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
37979 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
37980 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
37981 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
37982 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
37983 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
37984 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
37986 SDValue VecX = DAG.getBitcast(VecVT, X);
37987 SDValue VecY = DAG.getBitcast(VecVT, Y);
37988 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
37990 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
37991 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
37992 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
37993 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
37994 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
37995 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
37996 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
37998 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
38004 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
38005 const X86Subtarget &Subtarget) {
38006 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
38007 SDValue LHS = N->getOperand(0);
38008 SDValue RHS = N->getOperand(1);
38009 EVT VT = N->getValueType(0);
38010 EVT OpVT = LHS.getValueType();
38013 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
38014 // 0-x == y --> x+y == 0
38015 // 0-x != y --> x+y != 0
38016 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
38018 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
38019 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
38021 // x == 0-y --> x+y == 0
38022 // x != 0-y --> x+y != 0
38023 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
38025 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
38026 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
38029 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
38033 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
38034 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
38035 // Put build_vectors on the right.
38036 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
38037 std::swap(LHS, RHS);
38038 CC = ISD::getSetCCSwappedOperands(CC);
38042 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
38043 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
38044 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
38046 if (IsSEXT0 && IsVZero1) {
38047 assert(VT == LHS.getOperand(0).getValueType() &&
38048 "Uexpected operand type");
38049 if (CC == ISD::SETGT)
38050 return DAG.getConstant(0, DL, VT);
38051 if (CC == ISD::SETLE)
38052 return DAG.getConstant(1, DL, VT);
38053 if (CC == ISD::SETEQ || CC == ISD::SETGE)
38054 return DAG.getNOT(DL, LHS.getOperand(0), VT);
38056 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
38057 "Unexpected condition code!");
38058 return LHS.getOperand(0);
38062 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
38063 // pre-promote its result type since vXi1 vectors don't get promoted
38064 // during type legalization.
38065 // NOTE: The element count check is to ignore operand types that need to
38066 // go through type promotion to a 128-bit vector.
38067 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
38068 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
38069 (OpVT.getVectorElementType() == MVT::i8 ||
38070 OpVT.getVectorElementType() == MVT::i16)) {
38071 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
38073 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
38076 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
38077 // to avoid scalarization via legalization because v4i32 is not a legal type.
38078 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
38079 LHS.getValueType() == MVT::v4f32)
38080 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
38085 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
38086 TargetLowering::DAGCombinerInfo &DCI) {
38087 SDValue Src = N->getOperand(0);
38088 MVT SrcVT = Src.getSimpleValueType();
38090 // Perform constant folding.
38091 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
38092 assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
38094 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
38095 SDValue In = Src.getOperand(Idx);
38096 if (!In.isUndef() &&
38097 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
38100 return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
38103 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38104 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38105 !DCI.isBeforeLegalizeOps());
38107 // MOVMSK only uses the MSB from each vector element.
38109 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
38110 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
38111 DCI.AddToWorklist(Src.getNode());
38112 DCI.CommitTargetLoweringOpt(TLO);
38113 return SDValue(N, 0);
38119 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
38120 TargetLowering::DAGCombinerInfo &DCI,
38121 const X86Subtarget &Subtarget) {
38124 if (DCI.isBeforeLegalizeOps()) {
38125 SDValue Index = N->getOperand(4);
38126 // Remove any sign extends from 32 bits or smaller to larger than 32 bits.
38127 // Only do this before LegalizeOps in case we need the sign extend for legalization.
38129 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
38130 if (Index.getScalarValueSizeInBits() > 32 &&
38131 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
38132 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38133 NewOps[4] = Index.getOperand(0);
38134 DAG.UpdateNodeOperands(N, NewOps);
38135 // The original sign extend now has fewer uses; add it back to the worklist in
38136 // case it needs to be removed.
38137 DCI.AddToWorklist(Index.getNode());
38138 DCI.AddToWorklist(N);
38139 return SDValue(N, 0);
38143 // Make sure the index is either i32 or i64
38144 unsigned ScalarSize = Index.getScalarValueSizeInBits();
38145 if (ScalarSize != 32 && ScalarSize != 64) {
38146 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
38147 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
38148 Index.getValueType().getVectorNumElements());
38149 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
38150 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38152 DAG.UpdateNodeOperands(N, NewOps);
38153 DCI.AddToWorklist(N);
38154 return SDValue(N, 0);
38157 // Try to remove zero extends from 32->64 if we know the sign bit of
38158 // the input is zero.
38159 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
38160 Index.getScalarValueSizeInBits() == 64 &&
38161 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
38162 if (DAG.SignBitIsZero(Index.getOperand(0))) {
38163 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
38164 NewOps[4] = Index.getOperand(0);
38165 DAG.UpdateNodeOperands(N, NewOps);
38166 // The original zero extend now has fewer uses; add it back to the worklist in
38167 // case it needs to be removed.
38168 DCI.AddToWorklist(Index.getNode());
38169 DCI.AddToWorklist(N);
38170 return SDValue(N, 0);
38175 // With AVX2 we only demand the upper bit of the mask.
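// (The AVX2 gather instructions only test the most significant bit of each
// mask element, so the lower bits can be simplified away.)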
38176 if (!Subtarget.hasAVX512()) {
38177 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
38178 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
38179 !DCI.isBeforeLegalizeOps());
38180 SDValue Mask = N->getOperand(2);
38182 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
38183 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
38184 DCI.AddToWorklist(Mask.getNode());
38185 DCI.CommitTargetLoweringOpt(TLO);
38186 return SDValue(N, 0);
38193 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
38194 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
38195 const X86Subtarget &Subtarget) {
38197 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
38198 SDValue EFLAGS = N->getOperand(1);
38200 // Try to simplify the EFLAGS and condition code operands.
38201 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
38202 return getSETCC(CC, Flags, DL, DAG);
38207 /// Optimize branch condition evaluation.
38208 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
38209 const X86Subtarget &Subtarget) {
38211 SDValue EFLAGS = N->getOperand(3);
38212 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
38214 // Try to simplify the EFLAGS and condition code operands.
38215 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
38216 // RAUW them under us.
38217 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
38218 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
38219 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
38220 N->getOperand(1), Cond, Flags);
38226 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
38227 SelectionDAG &DAG) {
38228 // Take advantage of vector comparisons producing 0 or -1 in each lane to
38229 // optimize away the operation when it's fed from a constant.
38231 // The general transformation is:
38232 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
38233 // AND(VECTOR_CMP(x,y), constant2)
38234 // constant2 = UNARYOP(constant)
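// e.g. (v4f32 sint_to_fp (and (vsetcc x, y), <4 x i32> <1,1,1,1>))
//        --> (v4f32 bitcast (and (vsetcc x, y), (v4i32 bitcast <4 x float> <1.0,1.0,1.0,1.0>)))
// since each compare lane is either all-zeros or all-ones.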
38236 // Early exit if this isn't a vector operation, the operand of the
38237 // unary operation isn't a bitwise AND, or if the sizes of the operations
38238 // aren't the same.
38239 EVT VT = N->getValueType(0);
38240 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
38241 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
38242 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
38245 // Now check that the other operand of the AND is a constant. We could
38246 // make the transformation for non-constant splats as well, but it's unclear
38247 // that would be a benefit as it would not eliminate any operations, just
38248 // perform one more step in scalar code before moving to the vector unit.
38249 if (BuildVectorSDNode *BV =
38250 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
38251 // Bail out if the vector isn't a constant.
38252 if (!BV->isConstant())
38255 // Everything checks out. Build up the new and improved node.
38257 EVT IntVT = BV->getValueType(0);
38258 // Create a new constant of the appropriate type for the transformed op.
38260 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
38261 // The AND node needs bitcasts to/from an integer vector type around it.
38262 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
38263 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
38264 N->getOperand(0)->getOperand(0), MaskConst);
38265 SDValue Res = DAG.getBitcast(VT, NewAnd);
38272 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
38273 const X86Subtarget &Subtarget) {
38274 SDValue Op0 = N->getOperand(0);
38275 EVT VT = N->getValueType(0);
38276 EVT InVT = Op0.getValueType();
38278 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38279 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
38280 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
38281 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38283 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38284 InVT.getVectorNumElements());
38285 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
38287 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
38288 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38291 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
38292 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
38293 // the optimization here.
38294 if (DAG.SignBitIsZero(Op0))
38295 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
38300 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
38301 const X86Subtarget &Subtarget) {
38302 // First try to optimize away the conversion entirely when it's
38303 // conditionally from a constant. Vectors only.
38304 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
38307 // Now move on to more general possibilities.
38308 SDValue Op0 = N->getOperand(0);
38309 EVT VT = N->getValueType(0);
38310 EVT InVT = Op0.getValueType();
38312 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
38313 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
38314 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
38315 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
38317 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38318 InVT.getVectorNumElements());
38319 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
38320 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
38323 // Without AVX512DQ we only support i64 to float scalar conversion. For both
38324 // vectors and scalars, see if we know that the upper bits are all the sign
38325 // bit, in which case we can truncate the input to i32 and convert from that.
38326 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
38327 unsigned BitWidth = InVT.getScalarSizeInBits();
38328 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
38329 if (NumSignBits >= (BitWidth - 31)) {
38330 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
38331 if (InVT.isVector())
38332 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
38333 InVT.getVectorNumElements());
38335 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
38336 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
38340 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
38341 // a 32-bit target where SSE doesn't support i64->FP operations.
38342 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
38343 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
38344 EVT LdVT = Ld->getValueType(0);
38346 // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();
    // If we have AVX512DQ we can use packed conversion instructions unless
    // the VT is f80.
    if (Subtarget.hasDQI() && VT != MVT::f80)
      return SDValue();
38355 if (!Ld->isVolatile() && !VT.isVector() &&
38356 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
38357 !Subtarget.is64Bit() && LdVT == MVT::i64) {
38358 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
38359 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }

  return SDValue();
}
38367 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
38368 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38369 MVT VT = N->getSimpleValueType(0);
38370 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38371 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}
38379 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
38380 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
38381 TargetLowering::DAGCombinerInfo &DCI) {
38382 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
38383 // the result is either zero or one (depending on the input carry bit).
38384 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
38385 if (X86::isZeroNode(N->getOperand(0)) &&
38386 X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this when
      // it won't be used.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
38391 EVT VT = N->getValueType(0);
38392 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
38393 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
38394 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                         DAG.getConstant(X86::COND_B, DL,
                                                         MVT::i8),
                                         N->getOperand(2)),
                             DAG.getConstant(1, DL, VT));
38399 return DCI.CombineTo(N, Res1, CarryOut);
38402 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
38403 MVT VT = N->getSimpleValueType(0);
38404 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38405 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}
38413 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
38414 /// which is more useful than 0/1 in some cases.
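/// For example (illustrative): when CF is set, "sbb %eax, %eax" yields
/// 0xFFFFFFFF, and masking that with 1 recovers the 0/1 value that "setb %al"
/// would have produced.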
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);

38417 // "Condition code B" is also known as "the carry flag" (CF).
38418 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
38419 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}
38428 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
38429 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
38430 /// with CMP+{ADC, SBB}.
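/// Illustrative example: "x + (y != 0)" can be emitted as "cmp $1, y" followed
/// by "sbb $-1, x"; the borrow produced by the compare stands in for the
/// materialized setcc result.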
38431 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
38432 bool IsSub = N->getOpcode() == ISD::SUB;
38433 SDValue X = N->getOperand(0);
38434 SDValue Y = N->getOperand(1);
38436 // If this is an add, canonicalize a zext operand to the RHS.
38437 // TODO: Incomplete? What if both sides are zexts?
38438 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);
38442 // Look through a one-use zext.
38443 bool PeekedThroughZext = false;
38444 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
38445 Y = Y.getOperand(0);
38446 PeekedThroughZext = true;
38449 // If this is an add, canonicalize a setcc operand to the RHS.
38450 // TODO: Incomplete? What if both sides are setcc?
38451 // TODO: Should we allow peeking through a zext of the other operand?
38452 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);
  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
38460 EVT VT = N->getValueType(0);
38461 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
38463 // If X is -1 or 0, then we have an opportunity to avoid constants required in
38464 // the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
38467 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
38468 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
38469 // This is a complicated way to get -1 or 0 from the carry flag:
38470 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38471 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
38472 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }
38477 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
38478 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
38479 SDValue EFLAGS = Y->getOperand(1);
38480 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38481 EFLAGS.getValueType().isInteger() &&
38482 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38483 // Swap the operands of a SUB, and we have the same pattern as above.
38484 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
38485 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
38486 SDValue NewSub = DAG.getNode(
38487 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
38488 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38489 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38490 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }
38497 if (CC == X86::COND_B) {
38498 // X + SETB Z --> X + (mask SBB Z, Z)
38499 // X - SETB Z --> X - (mask SBB Z, Z)
38500 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
38501 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
38502 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38503 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }
38507 if (CC == X86::COND_A) {
38508 SDValue EFLAGS = Y->getOperand(1);
38509 // Try to convert COND_A into COND_B in an attempt to facilitate
38510 // materializing "setb reg".
38512 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
38513 // cannot take an immediate as its first operand.
38515 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
38516 EFLAGS.getValueType().isInteger() &&
38517 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
38518 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
38519 EFLAGS.getNode()->getVTList(),
38520 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
38521 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
38522 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
38523 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
38524 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();
38532 SDValue Cmp = Y.getOperand(1);
38533 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
38534 !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();
38538 SDValue Z = Cmp.getOperand(0);
38539 EVT ZVT = Z.getValueType();
38541 // If X is -1 or 0, then we have an opportunity to avoid constants required in
  // the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
    // fake operands:
38546 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
38547 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
38548 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
38549 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
38550 SDValue Zero = DAG.getConstant(0, DL, ZVT);
38551 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
38552 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
38553 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
38554 DAG.getConstant(X86::COND_B, DL, MVT::i8),
38555 SDValue(Neg.getNode(), 1));
38558 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
38559 // with fake operands:
38560 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
38561 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
38562 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
38563 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
38564 SDValue One = DAG.getConstant(1, DL, ZVT);
38565 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38566 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
    }
  }
38571 // (cmp Z, 1) sets the carry flag if Z is 0.
38572 SDValue One = DAG.getConstant(1, DL, ZVT);
38573 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
38575 // Add the flags type for ADC/SBB nodes.
38576 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
38578 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
38579 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
38580 if (CC == X86::COND_NE)
38581 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
38582 DAG.getConstant(-1ULL, DL, VT), Cmp1);
38584 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
38585 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
38586 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1);
}
38590 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
38591 const X86Subtarget &Subtarget) {
38592 if (!Subtarget.hasSSE2())
38595 SDValue MulOp = N->getOperand(0);
38596 SDValue Phi = N->getOperand(1);
38598 if (MulOp.getOpcode() != ISD::MUL)
38599 std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();
38607 EVT VT = N->getValueType(0);
38609 unsigned RegSize = 128;
  if (Subtarget.useBWIRegs())
    RegSize = 512;
  else if (Subtarget.hasAVX())
    RegSize = 256;
38614 unsigned VectorSize = VT.getVectorNumElements() * 16;
38615 // If the vector size is less than 128, or greater than the supported RegSize,
38616 // do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
38621 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38622 VT.getVectorNumElements());
38623 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38624 VT.getVectorNumElements() / 2);
38626 // Shrink the operands of mul.
38627 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
38628 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
38630 // Madd vector size is half of the original vector size
38631 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38632 ArrayRef<SDValue> Ops) {
38633 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
    return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
  };
  SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
                                  PMADDWDBuilder);
38638 // Fill the rest of the output with 0
38639 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
38640 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}
38644 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
38645 const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
38650 EVT VT = N->getValueType(0);
38651 SDValue Op0 = N->getOperand(0);
38652 SDValue Op1 = N->getOperand(1);
38654 // TODO: There's nothing special about i32, any integer type above i16 should
38655 // work just as well.
38656 if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();
38660 unsigned RegSize = 128;
  if (Subtarget.useBWIRegs())
    RegSize = 512;
  else if (Subtarget.hasAVX())
    RegSize = 256;
38666 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
38667 // TODO: We should be able to handle larger vectors by splitting them before
38668 // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();
38672 // We know N is a reduction add, which means one of its operands is a phi.
38673 // To match SAD, we need the other operand to be a vector select.
38674 SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();
38688 // SAD pattern detected. Now build a SAD instruction and an addition for
38689 // reduction. Note that the number of elements of the result of SAD is less
38690 // than the number of elements of its input. Therefore, we could only update
38691 // part of elements in the reduction vector.
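  // For example (illustrative): psadbw on two v16i8 inputs produces a v2i64
  // result, so a wider i32 reduction vector only has its low elements updated.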
38692 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
38694 // The output of PSADBW is a vector of i64.
38695 // We need to turn the vector of i64 into a vector of i32.
38696 // If the reduction vector is at least as wide as the psadbw result, just
38697 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
38699 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
38700 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
38705 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
38706 // Fill the upper elements with zero to match the add width.
38707 SDValue Zero = DAG.getConstant(0, DL, VT);
38708 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
                      DAG.getIntPtrConstant(0, DL));
  }

  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
38715 /// Convert vector increment or decrement to sub/add with an all-ones constant:
38716 /// add X, <1, 1...> --> sub X, <-1, -1...>
38717 /// sub X, <1, 1...> --> add X, <-1, -1...>
38718 /// The all-ones vector constant can be materialized using a pcmpeq instruction
38719 /// that is commonly recognized as an idiom (has no register dependency), so
38720 /// that's better/smaller than loading a splat 1 constant.
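/// For example (illustrative): (add X, <1,1,1,1>) becomes (sub X, <-1,-1,-1,-1>),
/// and the all-ones operand can be materialized with "pcmpeqd %xmm1, %xmm1"
/// instead of a constant-pool load.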
38721 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
38722 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
38723 "Unexpected opcode for increment/decrement transform");
38725 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
38726 // out and wait for legalization if we have an unsupported vector length.
38727 EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
      !SplatVal.isOneValue())
    return SDValue();
38737 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
38738 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}
38742 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
38743 const SDLoc &DL, EVT VT,
38744 const X86Subtarget &Subtarget) {
38745 // Example of pattern we try to detect:
38746 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
38747 //(add (build_vector (extract_elt t, 0),
38748 // (extract_elt t, 2),
38749 // (extract_elt t, 4),
38750 // (extract_elt t, 6)),
38751 // (build_vector (extract_elt t, 1),
38752 // (extract_elt t, 3),
38753 // (extract_elt t, 5),
38754 // (extract_elt t, 7)))
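  // The even/odd products are summed pairwise, which matches what
  // (v4i32 X86ISD::VPMADDWD (v8i16 x0), (v8i16 x1)) computes (illustrative
  // mapping; the exact index pattern is verified below).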
  if (!Subtarget.hasSSE2())
    return SDValue();

  if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
      Op1.getOpcode() != ISD::BUILD_VECTOR)
    return SDValue();
38763 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
38764 VT.getVectorNumElements() < 4 ||
      !isPowerOf2_32(VT.getVectorNumElements()))
    return SDValue();
38768 // Check if one of Op0,Op1 is of the form:
38769 // (build_vector (extract_elt Mul, 0),
38770 // (extract_elt Mul, 2),
38771 // (extract_elt Mul, 4),
38773 // the other is of the form:
38774 // (build_vector (extract_elt Mul, 1),
38775 // (extract_elt Mul, 3),
38776 // (extract_elt Mul, 5),
  // and identify Mul.
  SDValue Mul;
38780 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
38781 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
38782 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
38783 // TODO: Be more tolerant to undefs.
38784 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38785 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
38786 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();
38789 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
38790 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38791 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38792 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
    if (!Const0L || !Const1L || !Const0H || !Const1H)
      return SDValue();
38795 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38796 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38797 // Commutativity of mul allows factors of a product to reorder.
    if (Idx0L > Idx1L)
      std::swap(Idx0L, Idx1L);
    if (Idx0H > Idx1H)
      std::swap(Idx0H, Idx1H);
38802 // Commutativity of add allows pairs of factors to reorder.
38803 if (Idx0L > Idx0H) {
38804 std::swap(Idx0L, Idx0H);
38805 std::swap(Idx1L, Idx1H);
38807 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
        Idx1H != 2 * i + 3)
      return SDValue();
    if (!Mul) {
38811 // First time an extract_elt's source vector is visited. Must be a MUL
38812 // with 2X number of vector elements than the BUILD_VECTOR.
38813 // Both extracts must be from same MUL.
38814 Mul = Op0L->getOperand(0);
38815 if (Mul->getOpcode() != ISD::MUL ||
          Mul.getValueType().getVectorNumElements() != 2 * e)
        return SDValue();
    } else {
38819 // Check that the extract is from the same MUL previously seen.
38820 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
          Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
        return SDValue();
    }
  }
  // Check if the Mul source can be safely shrunk.
  ShrinkMode Mode;
  if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();
38830 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38831 ArrayRef<SDValue> Ops) {
38832 // Shrink by adding truncate nodes and let DAGCombine fold with the
38834 EVT InVT = Ops[0].getValueType();
38835 assert(InVT.getScalarType() == MVT::i32 &&
38836 "Unexpected scalar element type");
38837 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38838 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38839 InVT.getVectorNumElements() / 2);
38840 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38841 InVT.getVectorNumElements());
38842 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38843 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
                       DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
  };
  return SplitOpsAndApply(DAG, Subtarget, DL, VT,
                          { Mul.getOperand(0), Mul.getOperand(1) },
                          PMADDBuilder);
}
38851 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
38852 const X86Subtarget &Subtarget) {
38853 const SDNodeFlags Flags = N->getFlags();
38854 if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
38860 EVT VT = N->getValueType(0);
38861 SDValue Op0 = N->getOperand(0);
38862 SDValue Op1 = N->getOperand(1);
  if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
    return MAdd;
38867 // Try to synthesize horizontal adds from adds of shuffles.
38868 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38869 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38870 isHorizontalBinOp(Op0, Op1, true))
38871 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
38879 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
38880 const X86Subtarget &Subtarget) {
38881 SDValue Op0 = N->getOperand(0);
38882 SDValue Op1 = N->getOperand(1);
38883 EVT VT = N->getValueType(0);
38885 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
38886 // is only worth it with SSSE3 (PSHUFB).
38887 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
38888 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
38889 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
38890 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
                                 VT == MVT::v16i32 || VT == MVT::v8i64)))
    return SDValue();
38894 SDValue SubusLHS, SubusRHS;
38895 // Try to find umax(a,b) - b or a - umin(a,b) patterns
38896 // they may be converted to subus(a,b).
38897 // TODO: Need to add IR canonicalization for this code.
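  // For example (illustrative): (sub (umax a, b), b) and (sub a, (umin a, b))
  // both compute the unsigned saturating difference, so either form can be
  // lowered to (X86ISD::SUBUS a, b).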
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else
    return SDValue();
38921 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38922 ArrayRef<SDValue> Ops) {
38923 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
38926 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
38927 // special preprocessing in some cases.
38928 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
38929 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
38930 { SubusLHS, SubusRHS }, SUBUSBuilder);
38932 // Special preprocessing case can be only applied
38933 // if the value was zero extended from 16 bit,
38934 // so we require first 16 bits to be zeros for 32 bit
38935 // values, or first 48 bits for 64 bit values.
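  // For example (illustrative): a v8i32 subtraction whose operands were
  // zero-extended from v8i16 can instead be performed as a v8i16 SUBUS on the
  // truncated operands and then zero-extended back.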
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
  unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();
  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zeroextended - truncate SubusRHS to its
  // size SubusRHS = umin(0xFFF.., SubusRHS).
38951 SDValue SaturationConst =
38952 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
38953 ShrinkedType.getScalarSizeInBits()),
38954 SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
38957 SDValue NewSubusLHS =
38958 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
38959 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus =
      SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
38962 { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
38963 // Zero extend the result, it may be used somewhere as 32 bit,
38964 // if not zext and following trunc will shrink.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}
38968 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
38969 const X86Subtarget &Subtarget) {
38970 SDValue Op0 = N->getOperand(0);
38971 SDValue Op1 = N->getOperand(1);
38973 // X86 can't encode an immediate LHS of a sub. See if we can push the
38974 // negation into a preceding instruction.
38975 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
38976 // If the RHS of the sub is a XOR with one use and a constant, invert the
38977 // immediate. Then add one to the LHS of the sub so we can turn
38978 // X-Y -> X+~Y+1, saving one register.
38979 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
38980 isa<ConstantSDNode>(Op1.getOperand(1))) {
38981 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
38982 EVT VT = Op0.getValueType();
38983 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
38986 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }
38991 // Try to synthesize horizontal subs from subs of shuffles.
38992 EVT VT = N->getValueType(0);
38993 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38994 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38995 isHorizontalBinOp(Op0, Op1, false))
38996 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  // Try to create PSUBUS if SUB's argument is max/min
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}
39008 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
39009 TargetLowering::DAGCombinerInfo &DCI,
39010 const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
39015 unsigned Opcode = N->getOpcode();
39016 MVT VT = N->getSimpleValueType(0);
39017 MVT SVT = VT.getVectorElementType();
39018 unsigned NumElts = VT.getVectorNumElements();
39019 unsigned EltSizeInBits = SVT.getSizeInBits();
39021 SDValue Op = N->getOperand(0);
39022 MVT OpVT = Op.getSimpleValueType();
39023 MVT OpEltVT = OpVT.getVectorElementType();
39024 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
39025 unsigned InputBits = OpEltSizeInBits * NumElts;
39027 // Perform any constant folding.
39028 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
39031 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
39032 APInt Undefs(NumElts, 0);
39033 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
39036 for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }
  // (vzext (bitcast (vzext x))) -> (vzext x)
39048 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
39049 SDValue V = peekThroughBitcasts(Op);
39050 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
39051 MVT InnerVT = V.getSimpleValueType();
39052 MVT InnerEltVT = InnerVT.getVectorElementType();
39054 // If the element sizes match exactly, we can just do one larger vzext. This
39055 // is always an exact type match as vzext operates on integer types.
39056 if (OpEltVT == InnerEltVT) {
39057 assert(OpVT == InnerVT && "Types must match for vzext!");
39058 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
39061 // The only other way we can combine them is if only a single element of the
39062 // inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();
39066 // In this case, the inner vzext is completely dead because we're going to
39067 // only look at bits inside of the low element. Just do the outer vzext on
39068 // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }
39072 // Check if we can bypass extracting and re-inserting an element of an input
39073 // vector. Essentially:
39074 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
39075 // TODO: Add X86ISD::VSEXT support
39076 if (Opcode == X86ISD::VZEXT &&
39077 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
39078 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
39079 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
39080 SDValue ExtractedV = V.getOperand(0);
39081 SDValue OrigV = ExtractedV.getOperand(0);
39082 if (isNullConstant(ExtractedV.getOperand(1))) {
39083 MVT OrigVT = OrigV.getSimpleValueType();
39084 // Extract a subvector if necessary...
39085 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
39086 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
39087 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
39088 OrigVT.getVectorNumElements() / Ratio);
39089 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
39090 DAG.getIntPtrConstant(0, DL));
39092 Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}
39100 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
39101 const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

39105 if (N->getOperand(0) == N->getOperand(1)) {
39106 if (N->getOpcode() == X86ISD::PCMPEQ)
39107 return getOnesVector(VT, DAG, DL);
39108 if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
39115 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
39116 TargetLowering::DAGCombinerInfo &DCI,
39117 const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
39121 MVT OpVT = N->getSimpleValueType(0);
  bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;

  SDLoc dl(N);
39126 SDValue Vec = N->getOperand(0);
39127 SDValue SubVec = N->getOperand(1);
39129 unsigned IdxVal = N->getConstantOperandVal(2);
39130 MVT SubVecVT = SubVec.getSimpleValueType();
39132 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
39133 // Inserting zeros into zeros is a nop.
39134 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39135 return getZeroVector(OpVT, Subtarget, DAG, dl);
39137 // If we're inserting into a zero vector and then into a larger zero vector,
39138 // just insert into the larger zero vector directly.
39139 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39140 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
39141 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
39142 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39143 getZeroVector(OpVT, Subtarget, DAG, dl),
39144 SubVec.getOperand(1),
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    }
39148 // If we're inserting into a zero vector and our input was extracted from an
39149 // insert into a zero vector of the same type and the extraction was at
39150 // least as large as the original insertion. Just insert the original
39151 // subvector into a zero vector.
39152 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
39153 SubVec.getConstantOperandVal(1) == 0 &&
39154 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
39155 SDValue Ins = SubVec.getOperand(0);
39156 if (Ins.getConstantOperandVal(2) == 0 &&
39157 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
39158 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
39159 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39160 getZeroVector(OpVT, Subtarget, DAG, dl),
                           Ins.getOperand(1), N->getOperand(2));
    }
39164 // If we're inserting a bitcast into zeros, rewrite the insert and move the
39165 // bitcast to the other side. This helps with detecting zero extending
39167 // TODO: Is this useful for other indices than 0?
39168 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
39169 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
39170 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
39171 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
39172 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
39173 DAG.getBitcast(NewVT, Vec),
39174 SubVec.getOperand(0), N->getOperand(2));
      return DAG.getBitcast(OpVT, Insert);
    }
  }

  // Stop here if this is an i1 vector.
  if (IsI1Vector)
    return SDValue();
39183 // If this is an insert of an extract, combine to a shuffle. Don't do this
39184 // if the insert or extract can be represented with a subregister operation.
39185 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
39186 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
39187 (IdxVal != 0 || !Vec.isUndef())) {
39188 int ExtIdxVal = SubVec.getConstantOperandVal(1);
39189 if (ExtIdxVal != 0) {
39190 int VecNumElts = OpVT.getVectorNumElements();
39191 int SubVecNumElts = SubVecVT.getVectorNumElements();
39192 SmallVector<int, 64> Mask(VecNumElts);
39193 // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
39196 // Now insert the extracted portion.
39197 for (int i = 0; i != SubVecNumElts; ++i)
39198 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }
39204 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
39206 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39207 // (load16 addr + 16), Elts/2)
39210 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39211 // (load32 addr + 32), Elts/2)
39213 // or a 16-byte or 32-byte broadcast:
39214 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
39215 // (load16 addr), Elts/2)
39216 // --> X86SubVBroadcast(load16 addr)
39218 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
39219 // (load32 addr), Elts/2)
39220 // --> X86SubVBroadcast(load32 addr)
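  // For example (illustrative): building a v8f32 from (load v4f32 @p) in the
  // low half and (load v4f32 @p+16) in the high half becomes one 32-byte load.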
39221 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
39222 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
39223 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
39224 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
39225 if (Idx2 && Idx2->getZExtValue() == 0) {
39226 SDValue SubVec2 = Vec.getOperand(1);
39227 // If needed, look through bitcasts to get to the load.
39228 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
39231 unsigned AS = FirstLd->getAddressSpace();
39232 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
39233 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
39234 OpVT, AS, Alignment, &Fast) && Fast) {
39235 SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
                                                    Subtarget, false))
            return Ld;
        }
      }
39241 // If lower/upper loads are the same and the only users of the load, then
39242 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
39243 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
39244 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
39245 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
39246 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
39248 // If this is subv_broadcast insert into both halves, use a larger
39250 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
39251 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
39252 SubVec.getOperand(0));
39254 // If we're inserting all zeros into the upper half, change this to
39255 // an insert into an all zeros vector. We will match this to a move
39256 // with implicit upper bit zeroing during isel.
39257 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
39258 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
39259 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
39260 Vec.getOperand(2));
39262 // If we are inserting into both halves of the vector, the starting
39263 // vector should be undef. If it isn't, make it so. Only do this if the
39264 // the early insert has no other uses.
39265 // TODO: Should this be a generic DAG combine?
39266 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
39267 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
39268 SubVec2, Vec.getOperand(2));
39269 DCI.AddToWorklist(Vec.getNode());
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
                           N->getOperand(2));
      }
    }
  }

  return SDValue();
}
39280 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
39281 TargetLowering::DAGCombinerInfo &DCI,
39282 const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();
39286 MVT OpVT = N->getSimpleValueType(0);
39287 SDValue InVec = N->getOperand(0);
39288 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
39290 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
39291 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
39293 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
39294 if (OpVT.getScalarType() == MVT::i1)
39295 return DAG.getConstant(1, SDLoc(N), OpVT);
    return getOnesVector(OpVT, DAG, SDLoc(N));
  }
39299 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        OpVT, SDLoc(N),
        InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));

  return SDValue();
}
39307 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
39308 EVT VT = N->getValueType(0);
39309 SDValue Src = N->getOperand(0);
39311 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
39312 // This occurs frequently in our masked scalar intrinsic code and our
39313 // floating point select lowering with AVX512.
39314 // TODO: SimplifyDemandedBits instead?
39315 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
39316 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
39317 if (C->getAPIntValue().isOneValue())
39318 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
                           Src.getOperand(0));

  return SDValue();
}
39324 // Simplify PMULDQ and PMULUDQ operations.
39325 static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG,
39326 TargetLowering::DAGCombinerInfo &DCI) {
39327 SDValue LHS = N->getOperand(0);
39328 SDValue RHS = N->getOperand(1);
39330 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
39331 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
39332 !DCI.isBeforeLegalizeOps());
39333 APInt DemandedMask(APInt::getLowBitsSet(64, 32));
39335 // PMULQDQ/PMULUDQ only uses lower 32 bits from each vector element.
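  // For example (illustrative): if an operand is (and x, 0xffffffff), the mask
  // is redundant for PMULUDQ and SimplifyDemandedBits below can remove it.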
39336 KnownBits LHSKnown;
39337 if (TLI.SimplifyDemandedBits(LHS, DemandedMask, LHSKnown, TLO)) {
39338 DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }
39342 KnownBits RHSKnown;
39343 if (TLI.SimplifyDemandedBits(RHS, DemandedMask, RHSKnown, TLO)) {
39344 DCI.CommitTargetLoweringOpt(TLO);
    return SDValue(N, 0);
  }

  return SDValue();
}
39351 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
39352 DAGCombinerInfo &DCI) const {
39353 SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
39356 case ISD::SCALAR_TO_VECTOR:
39357 return combineScalarToVector(N, DAG);
39358 case ISD::EXTRACT_VECTOR_ELT:
39359 case X86ISD::PEXTRW:
39360 case X86ISD::PEXTRB:
39361 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
39362 case ISD::INSERT_SUBVECTOR:
39363 return combineInsertSubvector(N, DAG, DCI, Subtarget);
39364 case ISD::EXTRACT_SUBVECTOR:
39365 return combineExtractSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
39369 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
39370 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
39371 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
39372 case ISD::SUB: return combineSub(N, DAG, Subtarget);
39373 case X86ISD::SBB: return combineSBB(N, DAG);
39374 case X86ISD::ADC: return combineADC(N, DAG, DCI);
39375 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
39379 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
39380 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
39381 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
39382 case X86ISD::BEXTR: return combineBEXTR(N, DAG, DCI, Subtarget);
39383 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
39384 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
39385 case ISD::STORE: return combineStore(N, DAG, Subtarget);
39386 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
39387 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
39388 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
39391 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
39392 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
39393 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
39394 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
39395 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
39402 case X86ISD::BT: return combineBT(N, DAG, DCI);
39403 case ISD::ANY_EXTEND:
39404 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
39405 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
39406 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
39407 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
39408 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
39409 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
39410 case X86ISD::PACKSS:
39411 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
39412 case X86ISD::VSHLI:
39413 case X86ISD::VSRAI:
39414 case X86ISD::VSRLI:
39415 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
39416 case ISD::SIGN_EXTEND_VECTOR_INREG:
39417 case ISD::ZERO_EXTEND_VECTOR_INREG:
39418 case X86ISD::VSEXT:
39419 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
39420 case X86ISD::PINSRB:
39421 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
39422 case X86ISD::SHUFP: // Handle all target specific shuffles
39423 case X86ISD::INSERTPS:
39424 case X86ISD::EXTRQI:
39425 case X86ISD::INSERTQI:
39426 case X86ISD::PALIGNR:
39427 case X86ISD::VSHLDQ:
39428 case X86ISD::VSRLDQ:
39429 case X86ISD::BLENDI:
39430 case X86ISD::UNPCKH:
39431 case X86ISD::UNPCKL:
39432 case X86ISD::MOVHLPS:
39433 case X86ISD::MOVLHPS:
39434 case X86ISD::PSHUFB:
39435 case X86ISD::PSHUFD:
39436 case X86ISD::PSHUFHW:
39437 case X86ISD::PSHUFLW:
39438 case X86ISD::MOVSHDUP:
39439 case X86ISD::MOVSLDUP:
39440 case X86ISD::MOVDDUP:
39441 case X86ISD::MOVSS:
39442 case X86ISD::MOVSD:
39443 case X86ISD::VBROADCAST:
39444 case X86ISD::VPPERM:
39445 case X86ISD::VPERMI:
39446 case X86ISD::VPERMV:
39447 case X86ISD::VPERMV3:
39448 case X86ISD::VPERMIL2:
39449 case X86ISD::VPERMILPI:
39450 case X86ISD::VPERMILPV:
39451 case X86ISD::VPERM2X128:
39452 case X86ISD::SHUF128:
39453 case X86ISD::VZEXT_MOVL:
39454 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
39455 case X86ISD::FMADD_RND:
39456 case X86ISD::FMADDS1_RND:
39457 case X86ISD::FMADDS3_RND:
39458 case X86ISD::FMADDS1:
39459 case X86ISD::FMADDS3:
39460 case X86ISD::FMSUB:
39461 case X86ISD::FMSUB_RND:
39462 case X86ISD::FMSUBS1_RND:
39463 case X86ISD::FMSUBS3_RND:
39464 case X86ISD::FMSUBS1:
39465 case X86ISD::FMSUBS3:
39466 case X86ISD::FNMADD:
39467 case X86ISD::FNMADD_RND:
39468 case X86ISD::FNMADDS1_RND:
39469 case X86ISD::FNMADDS3_RND:
39470 case X86ISD::FNMADDS1:
39471 case X86ISD::FNMADDS3:
39472 case X86ISD::FNMSUB:
39473 case X86ISD::FNMSUB_RND:
39474 case X86ISD::FNMSUBS1_RND:
39475 case X86ISD::FNMSUBS3_RND:
39476 case X86ISD::FNMSUBS1:
39477 case X86ISD::FNMSUBS3:
39478 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
39479 case X86ISD::FMADDSUB_RND:
39480 case X86ISD::FMSUBADD_RND:
39481 case X86ISD::FMADDSUB:
39482 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
39483 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
39484 case X86ISD::MGATHER:
39485 case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG, DCI, Subtarget);
39488 case X86ISD::PCMPEQ:
39489 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
39490 case X86ISD::PMULDQ:
  case X86ISD::PMULUDQ:     return combinePMULDQ(N, DAG, DCI);
  }

  return SDValue();
}
39497 /// Return true if the target has native support for the specified value type
39498 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
39499 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
39500 /// some i16 instructions are slow.
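/// For example (illustrative): 16-bit ALU forms such as "addw" need a 0x66
/// operand-size prefix in 32/64-bit mode, making them longer than the
/// equivalent 32-bit instructions.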
39501 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;

  // There are no vXi8 shifts.
  if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
    return false;

  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
39516 case ISD::SIGN_EXTEND:
39517 case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
39531 SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl,
39532 SDValue Value, SDValue Addr,
39533 SelectionDAG &DAG) const {
39534 const Module *M = DAG.getMachineFunction().getMMI().getModule();
39535 Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
39536 if (IsCFProtectionSupported) {
39537 // In case control-flow branch protection is enabled, we need to add
39538 // notrack prefix to the indirect branch.
39539 // In order to do that we create NT_BRIND SDNode.
39540 // Upon ISEL, the pattern will convert it to jmp with NoTrack prefix.
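    // For example (illustrative): the jump-table dispatch is then emitted as
    // "notrack jmp *%rax" rather than a plain indirect "jmp *%rax".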
    return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr);
  }

  return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG);
}
/// This method queries the target whether it is beneficial for dag combiner to
39548 /// promote the specified node. If true, it should return the desired promotion
39549 /// type by reference.
39550 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
39551 EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;
39555 auto IsFoldableRMW = [](SDValue Load, SDValue Op) {
    if (!Op.hasOneUse())
      return false;
    SDNode *User = *Op->use_begin();
    if (!ISD::isNormalStore(User))
      return false;
39561 auto *Ld = cast<LoadSDNode>(Load);
39562 auto *St = cast<StoreSDNode>(User);
    return Ld->getBasePtr() == St->getBasePtr();
  };
39566 bool Commute = false;
39567 switch (Op.getOpcode()) {
39568 default: return false;
39569 case ISD::SIGN_EXTEND:
39570 case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    break;
  case ISD::SHL:
  case ISD::SRL: {
39575 SDValue N0 = Op.getOperand(0);
39576 // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
      return false;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
39589 SDValue N0 = Op.getOperand(0);
39590 SDValue N1 = Op.getOperand(1);
39591 // Avoid disabling potential load folding opportunities.
39592 if (MayFoldLoad(N1) &&
39593 (!Commute || !isa<ConstantSDNode>(N0) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
      return false;
    if (MayFoldLoad(N0) &&
        ((Commute && !isa<ConstantSDNode>(N1)) ||
         (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
      return false;
    break;
  }
  }

  PVT = MVT::i32;
  return true;
}
39607 bool X86TargetLowering::
39608 isDesirableToCombineBuildVectorToShuffleTruncate(
39609 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
39611 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
39612 "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
39615 "Shuffle Mask expected to be legal");
39617 // For 32-bit elements VPERMD is better than shuffle+truncate.
  // TODO: After we improve lowerBuildVector, add exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}
39628 //===----------------------------------------------------------------------===//
39629 // X86 Inline Assembly Support
39630 //===----------------------------------------------------------------------===//
39632 // Helper to match a string separated by whitespace.
39633 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
39634 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
39636 for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;
39640 S = S.substr(Piece.size());
39641 StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}
39651 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
39653 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
39654 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
39655 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
39656 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}
39667 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
39668 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
39670 const std::string &AsmStr = IA->getAsmString();
39672 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;
39676 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
39677 SmallVector<StringRef, 4> AsmPieces;
39678 SplitString(AsmStr, AsmPieces, ";\n");
39680 switch (AsmPieces.size()) {
  default: return false;
  case 1:
39683 // FIXME: this should verify that we are targeting a 486 or better. If not,
39684 // we will turn this bswap into something that will be lowered to logical
39685 // ops instead of emitting the bswap asm. For now, we don't support 486 or
39686 // lower so don't worry about this.
39688 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
39689 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
39690 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
39691 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
39692 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
39693 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
39694 // No need to check constraints, nothing other than the equivalent of
39695 // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }
39699 // rorw $$8, ${0:w} --> llvm.bswap.i16
39700 if (CI->getType()->isIntegerTy(16) &&
39701 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39702 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
39703 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
39705 StringRef ConstraintsStr = IA->getConstraintString();
39706 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39707 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39708 if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
39713 if (CI->getType()->isIntegerTy(32) &&
39714 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
39715 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
39716 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
39717 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
39719 StringRef ConstraintsStr = IA->getConstraintString();
39720 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
39721 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
39722 if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
39726 if (CI->getType()->isIntegerTy(64)) {
39727 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
39728 if (Constraints.size() >= 2 &&
39729 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
39730 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
39731 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
39732 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
39733 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
39734 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
39743 /// Given a constraint letter, return the type of constraint for this target.
39744 X86TargetLowering::ConstraintType
39745 X86TargetLowering::getConstraintType(StringRef Constraint) const {
39746 if (Constraint.size() == 1) {
39747 switch (Constraint[0]) {
39759 case 'k': // AVX512 masking registers.
39760 return C_RegisterClass;
39784 else if (Constraint.size() == 2) {
39785 switch (Constraint[0]) {
39789 switch (Constraint[1]) {
39800 return C_RegisterClass;
39804 return TargetLowering::getConstraintType(Constraint);
39807 /// Examine constraint type and operand type and determine a weight value.
39808 /// This object must already have been set up with the operand type
39809 /// and the current alternative constraint selected.
39810 TargetLowering::ConstraintWeight
39811 X86TargetLowering::getSingleConstraintMatchWeight(
39812 AsmOperandInfo &info, const char *constraint) const {
39813 ConstraintWeight weight = CW_Invalid;
39814 Value *CallOperandVal = info.CallOperandVal;
39815 // If we don't have a value, we can't do a match,
39816 // but allow it at the lowest weight.
39817 if (!CallOperandVal)
39819 Type *type = CallOperandVal->getType();
39820 // Look at the constraint type.
39821 switch (*constraint) {
39823 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
39835 if (CallOperandVal->getType()->isIntegerTy())
39836 weight = CW_SpecificReg;
39841 if (type->isFloatingPointTy())
39842 weight = CW_SpecificReg;
39845 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39846 weight = CW_SpecificReg;
39849 unsigned Size = StringRef(constraint).size();
39850 // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
39851 char NextChar = Size == 2 ? constraint[1] : 'i';
39854 switch (NextChar) {
39860 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
39861 return CW_SpecificReg;
39863 // Conditional OpMask regs (AVX512)
39865 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39866 return CW_Register;
39870 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39873 // Any SSE reg when ISA >= SSE2, same as 'Y'
39877 if (!Subtarget.hasSSE2())
39881 // Fall through (handle "Y" constraint).
39885 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
39886 weight = CW_Register;
39889 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
39890 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
39891 weight = CW_Register;
39894 // Enable conditional vector operations using %k<#> registers.
39895 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39896 weight = CW_Register;
39899 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
39900 if (C->getZExtValue() <= 31)
39901 weight = CW_Constant;
39905 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39906 if (C->getZExtValue() <= 63)
39907 weight = CW_Constant;
39911 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39912 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
39913 weight = CW_Constant;
39917 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39918 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
39919 weight = CW_Constant;
39923 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39924 if (C->getZExtValue() <= 3)
39925 weight = CW_Constant;
39929 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39930 if (C->getZExtValue() <= 0xff)
39931 weight = CW_Constant;
39936 if (isa<ConstantFP>(CallOperandVal)) {
39937 weight = CW_Constant;
39941 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39942 if ((C->getSExtValue() >= -0x80000000LL) &&
39943 (C->getSExtValue() <= 0x7fffffffLL))
39944 weight = CW_Constant;
39948 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39949 if (C->getZExtValue() <= 0xffffffff)
39950 weight = CW_Constant;
39957 /// Try to replace an X constraint, which matches anything, with another that
39958 /// has more specific requirements based on the type of the corresponding
39960 const char *X86TargetLowering::
39961 LowerXConstraint(EVT ConstraintVT) const {
39962 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
39963 // 'f' like normal targets.
39964 if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
39974 /// Lower the specified operand into the Ops vector.
39975 /// If it is invalid, don't add anything to Ops.
39976 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
39977 std::string &Constraint,
39978 std::vector<SDValue>&Ops,
39979 SelectionDAG &DAG) const {
39982 // Only support length 1 constraints for now.
39983 if (Constraint.length() > 1) return;
39985 char ConstraintLetter = Constraint[0];
39986 switch (ConstraintLetter) {
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }
    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}
/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
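// Illustrative examples of the mapping below (not an exhaustive list): an "r"
// constraint with an i32 operand maps to GR32, "x" with v4f32 maps to VR128
// (VR128X for the 'v' variant when VLX is available), and an explicit
// register such as "{xmm0}" is resolved by the generic TargetLowering code
// and then fixed up further down.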
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'i':
    case 't':
    case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z':
    case '0':
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }
  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0+Constraint[4]-'0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }
    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }
  // Make sure it isn't a register that requires 64-bit mode.
  if (!Subtarget.is64Bit() &&
      (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
      TRI->getEncodingValue(Res.first) >= 8) {
    // Register requires REX prefix, but we're in 32-bit mode.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }

  // Make sure it isn't a register that requires AVX512.
  if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
      TRI->getEncodingValue(Res.first) & 0x10) {
    // Register requires EVEX prefix.
    Res.first = 0;
    Res.second = nullptr;
    return Res;
  }
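  // E.g. (illustrative): an explicit "{r8d}" operand in 32-bit mode, or an
  // "{xmm16}" operand without AVX-512, is rejected by the two checks above
  // because those registers need a REX or EVEX encoding the subtarget cannot
  // use.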
  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.
  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
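  // For example (illustrative): when optimizing for minimum size, an i32
  // 'sdiv' by a constant keeps the single divide instruction, whereas a v4i32
  // 'sdiv' is still expanded, since there is no vector integer divide to fall
  // back on.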
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction().hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction().hasFnAttribute("probe-stack"))
    return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
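  // For example (illustrative), a function carrying the IR attribute
  // "probe-stack"="__my_probe" gets its probes emitted against the
  // hypothetical symbol __my_probe instead of the defaults chosen below.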
  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
      MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}