//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// emitting code.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
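// Usage sketch (illustrative only; the call below is hypothetical): lowering
// code reports the diagnostic and then recovers with a best-effort value, e.g.
//   if (!Subtarget.hasSSE1())
//     errorUnsupported(DAG, dl, "SSE register return with SSE disabled");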
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
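  // What the bypass above buys us (sketch): addBypassSlowDiv(32, 8) lets the
  // generated code guard a slow 32-bit divide with a cheap test and use an
  // 8-bit DIV when both operands fit in 8 bits, roughly:
  //   if ((a | b) >>u 8 == 0)  result = (u8)a / (u8)b;   // short-latency path
  //   else                     result = a / b;           // full 32-bit divide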
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
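  // Illustrative note (no additional configuration): with the names above, a
  // 64-bit "a / b" in 32-bit code on these targets is emitted as a call to the
  // _alldiv runtime helper rather than being expanded inline, and likewise for
  // the other i64 multiply/divide/remainder libcalls registered here.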
  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
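  // Why the two-result form matters (illustrative): x86's DIV/IDIV produce the
  // quotient and the remainder together (e.g. EAX and EDX for 32-bit), so once
  // "x / y" and "x % y" are both expressed as ISD::SDIVREM, CSE folds them
  // into a single divide instruction.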
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
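  // Rough shape of the Custom lowering above when LZCNT is unavailable
  // (sketch): an i32 ctlz is built from BSR, which yields the index of the
  // highest set bit, so the result is (31 ^ bsr(x)), plus explicit handling of
  // x == 0, which BSR leaves undefined (hence CTLZ vs CTLZ_ZERO_UNDEF).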
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
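  // With the Expand entries above, half<->float conversions become runtime
  // calls chosen by the generic legalizer (for example, an f16->f32 extension
  // typically ends up as a compiler-rt helper such as __gnu_h2f_ieee); the
  // exact symbol names are not configured in this file.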
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }
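  // Without POPCNT, the Expand entries above leave CTPOP to the generic
  // legalizer, which emits the usual shift-and-mask bit counting (sketch for
  // i32):
  //   x = x - ((x >> 1) & 0x55555555);
  //   x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
  //   x = (((x + (x >> 4)) & 0x0F0F0F0F) * 0x01010101) >> 24;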
  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
  if (Subtarget.hasSSEPrefetch() || Subtarget.has3DNow())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);

  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
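    // How the Custom FABS/FNEG above end up being emitted, roughly: both
    // become a bitwise op against a sign-bit mask from the constant pool, e.g.
    // for f32
    //   fabs(x): andps <0x7fffffff...>, %xmm0   ; clear the sign bit
    //   fneg(x): xorps <0x80000000...>, %xmm0   ; flip the sign bit
    // which is what the ANDPS/ANDPD and XORPS/XORPD comments above refer to.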
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);

  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }

  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom);
    }

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    // Provide custom widening for v2f32 setcc. This is really for VLX when
    // setcc result type returns v2i1/v4i1 vector for v2f32/v4f32 leading to
    // type legalization changing the result type to v4i1 during widening.
    // It works fine for SSE2 and is probably faster so no need to qualify with
    // VLX support.
    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
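    // Example of the Custom entries above (sketch): a sign-extending load of
    // v4i8 into v4i32 can be a single 32-bit scalar load followed by an
    // in-register extend (PMOVSXBD with SSE4.1, otherwise unpack/shift
    // sequences) instead of four separate byte loads.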
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
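    // Effect of the promotions above (illustrative): a v4i32 "and" is
    // legalized as
    //   v4i32 and A, B  ->  bitcast (v2i64 and (bitcast A), (bitcast B))
    // so only the v2i64 form needs isel patterns for the bitwise ops.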
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
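  // The Custom v16i8 BITREVERSE above is typically emitted as a nibble lookup
  // (sketch): split each byte into its low and high 4-bit halves, use PSHUFB
  // as a 16-entry table of bit-reversed nibbles, and recombine:
  //   rev(b) = table[b & 0xF] << 4 | table[b >> 4]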
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    if (!Subtarget.hasAVX512())
      setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    setOperationAction(ISD::SMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::UMAX, MVT::v4i64, Custom);
    setOperationAction(ISD::SMIN, MVT::v4i64, Custom);
    setOperationAction(ISD::UMIN, MVT::v4i64, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
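      // One common way the Custom UINT_TO_FP above is done (sketch, not
      // necessarily the exact sequence used here): build the result from two
      // halves that are each exactly representable,
      //   lo = (float)(v & 0xFFFF),  hi = (float)(v >> 16) * 65536.0f
      // and the per-lane selects this needs can be folded into a single
      // 256-bit blend with an immediate, which is what the comment above is
      // alluding to.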
      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }

    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }
  // This block controls legalization of the mask vector sizes that are
  // available with AVX512. 512-bit vectors are in a separate block controlled
  // by useAVX512Regs.
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);

    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);

    // There is no byte sized k-register load or store without AVX512DQ.
    if (!Subtarget.hasDQI()) {
      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);

      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
    }

    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v2i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
  }
  // This block controls legalization for 512-bit operations with 32/64 bit
  // elements. 512-bits can be disabled based on prefer-vector-width and
  // required-vector-width function attributes.
  if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
    setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);

    if (!Subtarget.hasVLX()) {
      // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
      // to 512-bit rather than use the AVX2 instructions so that we can use
      // k-registers and masking.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }

    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1320 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1321 setOperationAction(ISD::SMAX, VT, Legal);
1322 setOperationAction(ISD::UMAX, VT, Legal);
1323 setOperationAction(ISD::SMIN, VT, Legal);
1324 setOperationAction(ISD::UMIN, VT, Legal);
1325 setOperationAction(ISD::ABS, VT, Legal);
1326 setOperationAction(ISD::SRL, VT, Custom);
1327 setOperationAction(ISD::SHL, VT, Custom);
1328 setOperationAction(ISD::SRA, VT, Custom);
1329 setOperationAction(ISD::CTPOP, VT, Custom);
1330 setOperationAction(ISD::CTTZ, VT, Custom);
1331 setOperationAction(ISD::ROTL, VT, Custom);
1332 setOperationAction(ISD::ROTR, VT, Custom);
1335 // Need to promote to 64-bit even though we have 32-bit masked instructions
1336 // because the IR optimizers rearrange bitcasts around logic ops leaving
1337 // too many variations to handle if we don't promote them.
1338 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1339 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1340 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
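// Illustrative sketch of what this promotion does (schematic DAG form, not the
// literal builder output): a logic op such as
//   (v16i32 and X, Y)
// is rewritten as
//   (v16i32 bitcast (v8i64 and (v8i64 bitcast X), (v8i64 bitcast Y)))
// so every vector logic op is seen at a single canonical element width.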
1342 if (Subtarget.hasDQI()) {
1343 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1344 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1345 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1346 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1349 if (Subtarget.hasCDI()) {
1350 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version.
1351 for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
1352 setOperationAction(ISD::CTLZ, VT, Legal);
1353 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1355 } // Subtarget.hasCDI()
1357 if (Subtarget.hasVPOPCNTDQ()) {
1358 for (auto VT : { MVT::v16i32, MVT::v8i64 })
1359 setOperationAction(ISD::CTPOP, VT, Legal);
1362 // Extract subvector is special because the value type
1363 // (result) is 256-bit but the source is 512-bit wide.
1364 // 128-bit was made Legal under AVX1.
1365 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1366 MVT::v8f32, MVT::v4f64 })
1367 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1369 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1370 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1371 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1372 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1373 setOperationAction(ISD::VSELECT, VT, Custom);
1374 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1375 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1376 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1377 setOperationAction(ISD::MLOAD, VT, Legal);
1378 setOperationAction(ISD::MSTORE, VT, Legal);
1379 setOperationAction(ISD::MGATHER, VT, Custom);
1380 setOperationAction(ISD::MSCATTER, VT, Custom);
1382 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1383 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1384 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1387 // Need to custom split v32i16/v64i8 bitcasts.
1388 if (!Subtarget.hasBWI()) {
1389 setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
1390 setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
1394 // This block controls legalization for operations that don't have
1395 // pre-AVX512 equivalents. Without VLX we use 512-bit operations for the narrower 128/256-bit types.
1397 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1398 // These operations are handled on non-VLX by artificially widening in isel patterns.
1400 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1402 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1403 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1404 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1405 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1406 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1408 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1409 setOperationAction(ISD::SMAX, VT, Legal);
1410 setOperationAction(ISD::UMAX, VT, Legal);
1411 setOperationAction(ISD::SMIN, VT, Legal);
1412 setOperationAction(ISD::UMIN, VT, Legal);
1413 setOperationAction(ISD::ABS, VT, Legal);
1416 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1417 setOperationAction(ISD::ROTL, VT, Custom);
1418 setOperationAction(ISD::ROTR, VT, Custom);
1421 // Custom legalize 2x32 to get a little better code.
1422 setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom);
1423 setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom);
1425 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1426 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
1427 setOperationAction(ISD::MSCATTER, VT, Custom);
1429 if (Subtarget.hasDQI()) {
1430 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1431 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1432 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1433 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
1434 setOperationAction(ISD::FP_TO_UINT, VT, Legal);
1438 if (Subtarget.hasCDI()) {
1439 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
1440 setOperationAction(ISD::CTLZ, VT, Legal);
1441 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1443 } // Subtarget.hasCDI()
1445 if (Subtarget.hasVPOPCNTDQ()) {
1446 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
1447 setOperationAction(ISD::CTPOP, VT, Legal);
1451 // This block controls legalization of v32i1/v64i1, which are available with
1452 // AVX512BW. 512-bit v32i16 and v64i8 vector legalization is controlled with the separate useBWIRegs() block below.
1454 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1455 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1456 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1458 for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
1459 setOperationAction(ISD::ADD, VT, Custom);
1460 setOperationAction(ISD::SUB, VT, Custom);
1461 setOperationAction(ISD::MUL, VT, Custom);
1462 setOperationAction(ISD::VSELECT, VT, Expand);
1464 setOperationAction(ISD::TRUNCATE, VT, Custom);
1465 setOperationAction(ISD::SETCC, VT, Custom);
1466 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1467 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1468 setOperationAction(ISD::SELECT, VT, Custom);
1469 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1470 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1473 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1474 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1475 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1476 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1477 for (auto VT : { MVT::v16i1, MVT::v32i1 })
1478 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1480 // Extends from v32i1 masks to 256-bit vectors.
1481 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1482 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1483 setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
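// Illustrative example (assuming AVX512BW): extending a mask such as
//   (v32i8 sign_extend (v32i1 M))
// yields 0xFF in every byte whose mask bit is set and 0x00 otherwise, which maps
// naturally onto the mask-to-vector moves (e.g. VPMOVM2B) when available.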
1486 // This block controls legalization for v32i16 and v64i8. 512-bits can be
1487 // disabled based on prefer-vector-width and required-vector-width function attributes.
1489 if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
1490 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1491 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1493 // Extends from v64i1 masks to 512-bit vectors.
1494 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1495 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1496 setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
1498 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1499 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1500 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1501 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1502 setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
1503 setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
1504 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1505 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1506 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1507 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1508 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1509 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1510 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1511 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1512 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1513 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1514 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1515 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1516 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1517 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1518 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1519 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1520 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1522 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1524 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1526 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1527 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1528 setOperationAction(ISD::VSELECT, VT, Custom);
1529 setOperationAction(ISD::ABS, VT, Legal);
1530 setOperationAction(ISD::SRL, VT, Custom);
1531 setOperationAction(ISD::SHL, VT, Custom);
1532 setOperationAction(ISD::SRA, VT, Custom);
1533 setOperationAction(ISD::MLOAD, VT, Legal);
1534 setOperationAction(ISD::MSTORE, VT, Legal);
1535 setOperationAction(ISD::CTPOP, VT, Custom);
1536 setOperationAction(ISD::CTTZ, VT, Custom);
1537 setOperationAction(ISD::CTLZ, VT, Custom);
1538 setOperationAction(ISD::SMAX, VT, Legal);
1539 setOperationAction(ISD::UMAX, VT, Legal);
1540 setOperationAction(ISD::SMIN, VT, Legal);
1541 setOperationAction(ISD::UMIN, VT, Legal);
1543 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1544 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1545 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1548 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
1549 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1552 if (Subtarget.hasBITALG()) {
1553 for (auto VT : { MVT::v64i8, MVT::v32i16 })
1554 setOperationAction(ISD::CTPOP, VT, Legal);
1558 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1559 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1560 setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
1561 setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
1564 // These operations are handled on non-VLX by artificially widening in isel patterns.
1566 // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
1568 if (Subtarget.hasBITALG()) {
1569 for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
1570 setOperationAction(ISD::CTPOP, VT, Legal);
1574 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1575 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1576 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1577 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1578 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1579 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1581 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1582 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1583 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1584 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1585 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1587 if (Subtarget.hasDQI()) {
1588 // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
1589 // v2f32 UINT_TO_FP is already custom under SSE2.
1590 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1591 assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
1592 "Unexpected operation action!");
1593 // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
1594 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1595 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1598 if (Subtarget.hasBWI()) {
1599 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1600 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1604 // We want to custom lower some of our intrinsics.
1605 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1606 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1608 if (!Subtarget.is64Bit()) {
1609 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1610 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1613 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1614 // handle type legalization for these operations here.
1616 // FIXME: We really should do custom legalization for addition and
1617 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1618 // than generic legalization for 64-bit multiplication-with-overflow, though.
1619 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1620 if (VT == MVT::i64 && !Subtarget.is64Bit())
1622 // Add/Sub/Mul with overflow operations are custom lowered.
1623 setOperationAction(ISD::SADDO, VT, Custom);
1624 setOperationAction(ISD::UADDO, VT, Custom);
1625 setOperationAction(ISD::SSUBO, VT, Custom);
1626 setOperationAction(ISD::USUBO, VT, Custom);
1627 setOperationAction(ISD::SMULO, VT, Custom);
1628 setOperationAction(ISD::UMULO, VT, Custom);
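// Rough sketch of the custom lowering (illustrative only): an overflow op such as
//   (i32 (saddo X, Y))
// becomes an X86 add that also defines EFLAGS, with the second result taken from
// a setcc of the overflow condition.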
1630 // Support carry in as value rather than glue.
1631 setOperationAction(ISD::ADDCARRY, VT, Custom);
1632 setOperationAction(ISD::SUBCARRY, VT, Custom);
1633 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1636 if (!Subtarget.is64Bit()) {
1637 // These libcalls are not available in 32-bit.
1638 setLibcallName(RTLIB::SHL_I128, nullptr);
1639 setLibcallName(RTLIB::SRL_I128, nullptr);
1640 setLibcallName(RTLIB::SRA_I128, nullptr);
1641 setLibcallName(RTLIB::MUL_I128, nullptr);
1644 // Combine sin / cos into _sincos_stret if it is available.
1645 if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr &&
1646 getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) {
1647 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1648 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
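// Illustrative effect (assuming the libcall is available): a pair like
//   %s = call float @sinf(float %x)
//   %c = call float @cosf(float %x)
// can be folded into a single __sincos_stret call that returns both results.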
1651 if (Subtarget.isTargetWin64()) {
1652 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1654 setOperationAction(ISD::SREM, MVT::i128, Custom);
1655 setOperationAction(ISD::UREM, MVT::i128, Custom);
1656 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1657 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1660 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1661 // is. We should promote the value to 64-bits to solve this.
1662 // This is what the CRT headers do - `fmodf` is an inline header
1663 // function casting to f64 and calling `fmod`.
1664 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1665 Subtarget.isTargetWindowsItanium()))
1666 for (ISD::NodeType Op :
1667 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1668 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1669 if (isOperationExpand(Op, MVT::f32))
1670 setOperationAction(Op, MVT::f32, Promote);
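// Schematically (illustrative), promoting f32 FREM means an f32 frem is lowered
// roughly as
//   fmodf(x, y)  ~>  (float)fmod((double)x, (double)y)
// mirroring what the MSVC CRT inline headers do.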
1672 // We have target-specific dag combine patterns for the following nodes:
1673 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1674 setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
1675 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1676 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1677 setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
1678 setTargetDAGCombine(ISD::BITCAST);
1679 setTargetDAGCombine(ISD::VSELECT);
1680 setTargetDAGCombine(ISD::SELECT);
1681 setTargetDAGCombine(ISD::SHL);
1682 setTargetDAGCombine(ISD::SRA);
1683 setTargetDAGCombine(ISD::SRL);
1684 setTargetDAGCombine(ISD::OR);
1685 setTargetDAGCombine(ISD::AND);
1686 setTargetDAGCombine(ISD::ADD);
1687 setTargetDAGCombine(ISD::FADD);
1688 setTargetDAGCombine(ISD::FSUB);
1689 setTargetDAGCombine(ISD::FNEG);
1690 setTargetDAGCombine(ISD::FMA);
1691 setTargetDAGCombine(ISD::FMINNUM);
1692 setTargetDAGCombine(ISD::FMAXNUM);
1693 setTargetDAGCombine(ISD::SUB);
1694 setTargetDAGCombine(ISD::LOAD);
1695 setTargetDAGCombine(ISD::MLOAD);
1696 setTargetDAGCombine(ISD::STORE);
1697 setTargetDAGCombine(ISD::MSTORE);
1698 setTargetDAGCombine(ISD::TRUNCATE);
1699 setTargetDAGCombine(ISD::ZERO_EXTEND);
1700 setTargetDAGCombine(ISD::ANY_EXTEND);
1701 setTargetDAGCombine(ISD::SIGN_EXTEND);
1702 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1703 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1704 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1705 setTargetDAGCombine(ISD::SINT_TO_FP);
1706 setTargetDAGCombine(ISD::UINT_TO_FP);
1707 setTargetDAGCombine(ISD::SETCC);
1708 setTargetDAGCombine(ISD::MUL);
1709 setTargetDAGCombine(ISD::XOR);
1710 setTargetDAGCombine(ISD::MSCATTER);
1711 setTargetDAGCombine(ISD::MGATHER);
1713 computeRegisterProperties(Subtarget.getRegisterInfo());
1715 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1716 MaxStoresPerMemsetOptSize = 8;
1717 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1718 MaxStoresPerMemcpyOptSize = 4;
1719 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1720 MaxStoresPerMemmoveOptSize = 4;
1722 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1723 // that needs to be benchmarked and balanced with the potential use of vector
1724 // load/store types (PR33329, PR33914).
1725 MaxLoadsPerMemcmp = 2;
1726 MaxLoadsPerMemcmpOptSize = 2;
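// Illustrative consequence (not a literal transcript): with a budget of two loads,
// a call like memcmp(a, b, 16) on a 64-bit target can be expanded in CGP into two
// 8-byte loads from each buffer plus compares, instead of a libcall.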
1728 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1729 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
1731 // An out-of-order CPU can speculatively execute past a predictable branch,
1732 // but a conditional move could be stalled by an expensive earlier operation.
1733 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1734 EnableExtLdPromotion = true;
1735 setPrefFunctionAlignment(4); // 2^4 bytes.
1737 verifyIntrinsicTables();
1740 // This has so far only been implemented for 64-bit MachO.
1741 bool X86TargetLowering::useLoadStackGuardNode() const {
1742 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1745 bool X86TargetLowering::useStackGuardXorFP() const {
1746 // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
1747 return Subtarget.getTargetTriple().isOSMSVCRT();
1750 SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
1751 const SDLoc &DL) const {
1752 EVT PtrTy = getPointerTy(DAG.getDataLayout());
1753 unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
1754 MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
1755 return SDValue(Node, 0);
1758 TargetLoweringBase::LegalizeTypeAction
1759 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1760 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1761 return TypeSplitVector;
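// Illustrative example of the experimental widening below: an illegal v2i32 is
// widened to v4i32 (padding with undef lanes) rather than having its elements
// promoted to produce v2i64.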
1763 if (ExperimentalVectorWideningLegalization &&
1764 VT.getVectorNumElements() != 1 &&
1765 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1766 return TypeWidenVector;
1768 return TargetLoweringBase::getPreferredVectorAction(VT);
1771 MVT X86TargetLowering::getRegisterTypeForCallingConv(MVT VT) const {
1772 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1774 return TargetLowering::getRegisterTypeForCallingConv(VT);
1777 MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
1779 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1781 return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
1784 unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
1786 if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
1788 return TargetLowering::getNumRegistersForCallingConv(Context, VT);
1791 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1792 LLVMContext& Context,
1797 if (Subtarget.hasAVX512()) {
1798 const unsigned NumElts = VT.getVectorNumElements();
1800 // Figure out what this type will be legalized to.
1802 while (getTypeAction(Context, LegalVT) != TypeLegal)
1803 LegalVT = getTypeToTransformTo(Context, LegalVT);
1805 // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
1806 if (LegalVT.getSimpleVT().is512BitVector())
1807 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1809 if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
1810 // If we legalized to less than a 512-bit vector, then we will use a vXi1
1811 // compare for vXi32/vXi64 for sure. If we have BWI we will also support a vXi1 compare for vXi8/vXi16.
1813 MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
1814 if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
1815 return EVT::getVectorVT(Context, MVT::i1, NumElts);
1819 return VT.changeVectorElementTypeToInteger();
1822 /// Helper for getByValTypeAlignment to determine
1823 /// the desired ByVal argument alignment.
1824 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1827 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1828 if (VTy->getBitWidth() == 128)
1830 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1831 unsigned EltAlign = 0;
1832 getMaxByValAlign(ATy->getElementType(), EltAlign);
1833 if (EltAlign > MaxAlign)
1834 MaxAlign = EltAlign;
1835 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1836 for (auto *EltTy : STy->elements()) {
1837 unsigned EltAlign = 0;
1838 getMaxByValAlign(EltTy, EltAlign);
1839 if (EltAlign > MaxAlign)
1840 MaxAlign = EltAlign;
1847 /// Return the desired alignment for ByVal aggregate
1848 /// function arguments in the caller parameter area. For X86, aggregates
1849 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1850 /// are at 4-byte boundaries.
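/// For example (illustrative), a byval struct containing a 128-bit SSE vector
/// member gets a 16-byte-aligned parameter slot on 32-bit x86, while a struct of
/// plain ints stays at 4-byte alignment.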
1851 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1852 const DataLayout &DL) const {
1853 if (Subtarget.is64Bit()) {
1854 // Max of 8 and alignment of type.
1855 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1862 if (Subtarget.hasSSE1())
1863 getMaxByValAlign(Ty, Align);
1867 /// Returns the target specific optimal type for load
1868 /// and store operations as a result of memset, memcpy, and memmove
1869 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1870 /// constraint, so any type is safe to pick. Similarly, if SrcAlign is zero there
1871 /// is no need to check it against an alignment requirement,
1872 /// probably because the source does not need to be loaded. If 'IsMemset' is
1873 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1874 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1875 /// source is constant so it does not need to be loaded.
1876 /// It returns EVT::Other if the type should be determined using generic
1877 /// target-independent logic.
1879 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1880 unsigned DstAlign, unsigned SrcAlign,
1881 bool IsMemset, bool ZeroMemset,
1883 MachineFunction &MF) const {
1884 const Function &F = MF.getFunction();
1885 if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
1887 (!Subtarget.isUnalignedMem16Slow() ||
1888 ((DstAlign == 0 || DstAlign >= 16) &&
1889 (SrcAlign == 0 || SrcAlign >= 16)))) {
1890 // FIXME: Check if unaligned 32-byte accesses are slow.
1891 if (Size >= 32 && Subtarget.hasAVX()) {
1892 // Although this isn't a well-supported type for AVX1, we'll let
1893 // legalization and shuffle lowering produce the optimal codegen. If we
1894 // choose an optimal type with a vector element larger than a byte,
1895 // getMemsetStores() may create an intermediate splat (using an integer
1896 // multiply) before we splat as a vector.
1899 if (Subtarget.hasSSE2())
1901 // TODO: Can SSE1 handle a byte vector?
1902 if (Subtarget.hasSSE1())
1904 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1905 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1906 // Do not use f64 to lower memcpy if source is string constant. It's
1907 // better to use i32 to avoid the loads.
1908 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1909 // The gymnastics of splatting a byte value into an XMM register and then
1910 // only using 8-byte stores (because this is a CPU with slow unaligned
1911 // 16-byte accesses) makes that a loser.
1915 // This is a compromise. If we reach here, unaligned accesses may be slow on
1916 // this target. However, creating smaller, aligned accesses could be even
1917 // slower and would certainly be a lot more code.
1918 if (Subtarget.is64Bit() && Size >= 8)
1923 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1925 return X86ScalarSSEf32;
1926 else if (VT == MVT::f64)
1927 return X86ScalarSSEf64;
1932 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1937 switch (VT.getSizeInBits()) {
1939 // 8-byte and under are always assumed to be fast.
1943 *Fast = !Subtarget.isUnalignedMem16Slow();
1946 *Fast = !Subtarget.isUnalignedMem32Slow();
1948 // TODO: What about AVX-512 (512-bit) accesses?
1951 // Misaligned accesses of any size are always allowed.
1955 /// Return the entry encoding for a jump table in the
1956 /// current function. The returned value is a member of the
1957 /// MachineJumpTableInfo::JTEntryKind enum.
1958 unsigned X86TargetLowering::getJumpTableEncoding() const {
1959 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol reference.
1961 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1962 return MachineJumpTableInfo::EK_Custom32;
1964 // Otherwise, use the normal jump table encoding heuristics.
1965 return TargetLowering::getJumpTableEncoding();
1968 bool X86TargetLowering::useSoftFloat() const {
1969 return Subtarget.useSoftFloat();
1972 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1973 ArgListTy &Args) const {
1975 // Only relabel X86-32 for C / Stdcall CCs.
1976 if (Subtarget.is64Bit())
1978 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1980 unsigned ParamRegs = 0;
1981 if (auto *M = MF->getFunction().getParent())
1982 ParamRegs = M->getNumberRegisterParameters();
1984 // Mark the first N integer arguments as being passed in registers.
1985 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1986 Type *T = Args[Idx].Ty;
1987 if (T->isPointerTy() || T->isIntegerTy())
1988 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1989 unsigned numRegs = 1;
1990 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1992 if (ParamRegs < numRegs)
1994 ParamRegs -= numRegs;
1995 Args[Idx].IsInReg = true;
2001 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
2002 const MachineBasicBlock *MBB,
2003 unsigned uid,MCContext &Ctx) const{
2004 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
2005 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF references.
2007 return MCSymbolRefExpr::create(MBB->getSymbol(),
2008 MCSymbolRefExpr::VK_GOTOFF, Ctx);
2011 /// Returns relocation base for the given PIC jumptable.
2012 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
2013 SelectionDAG &DAG) const {
2014 if (!Subtarget.is64Bit())
2015 // This doesn't have SDLoc associated with it, but is not really the
2016 // same as a Register.
2017 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
2018 getPointerTy(DAG.getDataLayout()));
2022 /// This returns the relocation base for the given PIC jumptable,
2023 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
2024 const MCExpr *X86TargetLowering::
2025 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
2026 MCContext &Ctx) const {
2027 // X86-64 uses RIP relative addressing based on the jump table label.
2028 if (Subtarget.isPICStyleRIPRel())
2029 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
2031 // Otherwise, the reference is relative to the PIC base.
2032 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
2035 std::pair<const TargetRegisterClass *, uint8_t>
2036 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
2038 const TargetRegisterClass *RRC = nullptr;
2040 switch (VT.SimpleTy) {
2042 return TargetLowering::findRepresentativeClass(TRI, VT);
2043 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
2044 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
2047 RRC = &X86::VR64RegClass;
2049 case MVT::f32: case MVT::f64:
2050 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
2051 case MVT::v4f32: case MVT::v2f64:
2052 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
2053 case MVT::v8f32: case MVT::v4f64:
2054 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
2055 case MVT::v16f32: case MVT::v8f64:
2056 RRC = &X86::VR128XRegClass;
2059 return std::make_pair(RRC, Cost);
2062 unsigned X86TargetLowering::getAddressSpace() const {
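// Note: X86 address spaces 256 and 257 correspond to the %gs and %fs segments
// respectively, which is where the TLS stack-guard and SafeStack slots below are
// addressed.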
2063 if (Subtarget.is64Bit())
2064 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2068 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2069 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2070 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
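// Illustrative example of the helper below: SegmentOffset(IRB, 0x28, 257) builds
// roughly the constant
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. the %fs:0x28 slot once dereferenced.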
2073 static Constant* SegmentOffset(IRBuilder<> &IRB,
2074 unsigned Offset, unsigned AddressSpace) {
2075 return ConstantExpr::getIntToPtr(
2076 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2077 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2080 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2081 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2082 // tcbhead_t; use it instead of the usual global variable (see
2083 // sysdeps/{i386,x86_64}/nptl/tls.h)
2084 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2085 if (Subtarget.isTargetFuchsia()) {
2086 // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
2087 return SegmentOffset(IRB, 0x10, getAddressSpace());
2089 // %fs:0x28, unless we're using a Kernel code model, in which case
2090 // it's %gs:0x28. %gs:0x14 on i386.
2091 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2092 return SegmentOffset(IRB, Offset, getAddressSpace());
2096 return TargetLowering::getIRStackGuard(IRB);
2099 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2100 // MSVC CRT provides functionalities for stack protection.
2101 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2102 // MSVC CRT has a global variable holding security cookie.
2103 M.getOrInsertGlobal("__security_cookie",
2104 Type::getInt8PtrTy(M.getContext()));
2106 // MSVC CRT has a function to validate security cookie.
2107 auto *SecurityCheckCookie = cast<Function>(
2108 M.getOrInsertFunction("__security_check_cookie",
2109 Type::getVoidTy(M.getContext()),
2110 Type::getInt8PtrTy(M.getContext())));
2111 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2112 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
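// Roughly, the declarations inserted above correspond to (illustrative IR):
//   @__security_cookie = external global i8*
//   declare x86_fastcallcc void @__security_check_cookie(i8* inreg)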
2115 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2116 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2118 TargetLowering::insertSSPDeclarations(M);
2121 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2122 // MSVC CRT has a global variable holding security cookie.
2123 if (Subtarget.getTargetTriple().isOSMSVCRT())
2124 return M.getGlobalVariable("__security_cookie");
2125 return TargetLowering::getSDagStackGuard(M);
2128 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2129 // MSVC CRT has a function to validate security cookie.
2130 if (Subtarget.getTargetTriple().isOSMSVCRT())
2131 return M.getFunction("__security_check_cookie");
2132 return TargetLowering::getSSPStackGuardCheck(M);
2135 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2136 if (Subtarget.getTargetTriple().isOSContiki())
2137 return getDefaultSafeStackPointerLocation(IRB, false);
2139 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2140 // definition of TLS_SLOT_SAFESTACK in
2141 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2142 if (Subtarget.isTargetAndroid()) {
2143 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:0x48 (%gs:0x24 on i386).
2145 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2146 return SegmentOffset(IRB, Offset, getAddressSpace());
2149 // Fuchsia is similar.
2150 if (Subtarget.isTargetFuchsia()) {
2151 // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
2152 return SegmentOffset(IRB, 0x18, getAddressSpace());
2155 return TargetLowering::getSafeStackPointerLocation(IRB);
2158 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2159 unsigned DestAS) const {
2160 assert(SrcAS != DestAS && "Expected different address spaces!");
2162 return SrcAS < 256 && DestAS < 256;
2165 //===----------------------------------------------------------------------===//
2166 // Return Value Calling Convention Implementation
2167 //===----------------------------------------------------------------------===//
2169 #include "X86GenCallingConv.inc"
2171 bool X86TargetLowering::CanLowerReturn(
2172 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2173 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2174 SmallVector<CCValAssign, 16> RVLocs;
2175 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2176 return CCInfo.CheckReturn(Outs, RetCC_X86);
2179 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2180 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2184 /// Lowers mask values (v*i1) to the local register values
2185 /// \returns the DAG node after lowering to the register type
2186 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2187 const SDLoc &Dl, SelectionDAG &DAG) {
2188 EVT ValVT = ValArg.getValueType();
2190 if (ValVT == MVT::v1i1)
2191 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg,
2192 DAG.getIntPtrConstant(0, Dl));
2194 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2195 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2196 // Two stage lowering might be required
2197 // bitcast: v8i1 -> i8 / v16i1 -> i16
2198 // anyextend: i8 -> i32 / i16 -> i32
2199 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2200 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2201 if (ValLoc == MVT::i32)
2202 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2206 if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2207 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2208 // One stage lowering is required
2209 // bitcast: v32i1 -> i32 / v64i1 -> i64
2210 return DAG.getBitcast(ValLoc, ValArg);
2213 return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg);
2216 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2217 static void Passv64i1ArgInRegs(
2218 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2219 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2220 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2221 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
2222 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2223 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2224 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2225 "The value should reside in two registers");
2227 // Before splitting the value we cast it to i64
2228 Arg = DAG.getBitcast(MVT::i64, Arg);
2230 // Split the value into two i32 values
2232 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2233 DAG.getConstant(0, Dl, MVT::i32));
2234 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2235 DAG.getConstant(1, Dl, MVT::i32));
2237 // Attach the two i32 values to their corresponding registers
2238 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2239 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2243 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2245 const SmallVectorImpl<ISD::OutputArg> &Outs,
2246 const SmallVectorImpl<SDValue> &OutVals,
2247 const SDLoc &dl, SelectionDAG &DAG) const {
2248 MachineFunction &MF = DAG.getMachineFunction();
2249 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2251 // In some cases we need to disable registers from the default CSR list.
2252 // For example, when they are used for argument passing.
2253 bool ShouldDisableCalleeSavedRegister =
2254 CallConv == CallingConv::X86_RegCall ||
2255 MF.getFunction().hasFnAttribute("no_caller_saved_registers");
2257 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2258 report_fatal_error("X86 interrupts may not return any value");
2260 SmallVector<CCValAssign, 16> RVLocs;
2261 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2262 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2265 SmallVector<SDValue, 6> RetOps;
2266 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2267 // Operand #1 = Bytes To Pop
2268 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2271 // Copy the result values into the output registers.
2272 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2274 CCValAssign &VA = RVLocs[I];
2275 assert(VA.isRegLoc() && "Can only return in registers!");
2277 // Add the register to the CalleeSaveDisableRegs list.
2278 if (ShouldDisableCalleeSavedRegister)
2279 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2281 SDValue ValToCopy = OutVals[OutsIndex];
2282 EVT ValVT = ValToCopy.getValueType();
2284 // Promote values to the appropriate types.
2285 if (VA.getLocInfo() == CCValAssign::SExt)
2286 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2287 else if (VA.getLocInfo() == CCValAssign::ZExt)
2288 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2289 else if (VA.getLocInfo() == CCValAssign::AExt) {
2290 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2291 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2293 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2295 else if (VA.getLocInfo() == CCValAssign::BCvt)
2296 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2298 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2299 "Unexpected FP-extend for return value.");
2301 // If this is x86-64, and we disabled SSE, we can't return FP values,
2302 // or SSE or MMX vectors.
2303 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2304 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2305 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2306 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2307 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2308 } else if (ValVT == MVT::f64 &&
2309 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2310 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2311 // llvm-gcc has never done it right and no one has noticed, so this
2312 // should be OK for now.
2313 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2314 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2317 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2318 // the RET instruction and handled by the FP Stackifier.
2319 if (VA.getLocReg() == X86::FP0 ||
2320 VA.getLocReg() == X86::FP1) {
2321 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2322 // change the value to the FP stack register class.
2323 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2324 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2325 RetOps.push_back(ValToCopy);
2326 // Don't emit a copytoreg.
2330 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2331 // which is returned in RAX / RDX.
2332 if (Subtarget.is64Bit()) {
2333 if (ValVT == MVT::x86mmx) {
2334 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2335 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2336 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2338 // If we don't have SSE2 available, convert to v4f32 so the generated
2339 // register is legal.
2340 if (!Subtarget.hasSSE2())
2341 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2346 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2348 if (VA.needsCustom()) {
2349 assert(VA.getValVT() == MVT::v64i1 &&
2350 "Currently the only custom case is when we split v64i1 to 2 regs");
2352 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2355 assert(2 == RegsToPass.size() &&
2356 "Expecting two registers after Pass64BitArgInRegs");
2358 // Add the second register to the CalleeSaveDisableRegs list.
2359 if (ShouldDisableCalleeSavedRegister)
2360 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2362 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2365 // Add nodes to the DAG and add the values into the RetOps list
2366 for (auto &Reg : RegsToPass) {
2367 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2368 Flag = Chain.getValue(1);
2369 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2373 // The Swift calling convention does not require us to copy the sret argument
2374 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2376 // All x86 ABIs require that for returning structs by value we copy
2377 // the sret argument into %rax/%eax (depending on ABI) for the return.
2378 // We saved the argument into a virtual register in the entry block,
2379 // so now we copy the value out and into %rax/%eax.
2381 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2382 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2383 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2384 // either case FuncInfo->setSRetReturnReg() will have been called.
2385 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2386 // When we have both sret and another return value, we should use the
2387 // original Chain stored in RetOps[0], instead of the current Chain updated
2388 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2390 // For the case of sret and another return value, we have
2391 // Chain_0 at the function entry
2392 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2393 // If we use Chain_1 in getCopyFromReg, we will have
2394 // Val = getCopyFromReg(Chain_1)
2395 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2397 // getCopyToReg(Chain_0) will be glued together with
2398 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2399 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2400 // Data dependency from Unit B to Unit A due to usage of Val in
2401 // getCopyToReg(Chain_1, Val)
2402 // Chain dependency from Unit A to Unit B
2404 // So here, we use RetOps[0] (i.e. Chain_0) for getCopyFromReg.
2405 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2406 getPointerTy(MF.getDataLayout()));
2409 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2410 X86::RAX : X86::EAX;
2411 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2412 Flag = Chain.getValue(1);
2414 // RAX/EAX now acts like a return value.
2416 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2418 // Add the returned register to the CalleeSaveDisableRegs list.
2419 if (ShouldDisableCalleeSavedRegister)
2420 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2423 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2424 const MCPhysReg *I =
2425 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2428 if (X86::GR64RegClass.contains(*I))
2429 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2431 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2435 RetOps[0] = Chain; // Update chain.
2437 // Add the flag if we have it.
2439 RetOps.push_back(Flag);
2441 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2442 if (CallConv == CallingConv::X86_INTR)
2443 opcode = X86ISD::IRET;
2444 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2447 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2448 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2451 SDValue TCChain = Chain;
2452 SDNode *Copy = *N->use_begin();
2453 if (Copy->getOpcode() == ISD::CopyToReg) {
2454 // If the copy has a glue operand, we conservatively assume it isn't safe to
2455 // perform a tail call.
2456 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2458 TCChain = Copy->getOperand(0);
2459 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2462 bool HasRet = false;
2463 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2465 if (UI->getOpcode() != X86ISD::RET_FLAG)
2467 // If we are returning more than one value, we can definitely
2468 // not make a tail call; see PR19530.
2469 if (UI->getNumOperands() > 4)
2471 if (UI->getNumOperands() == 4 &&
2472 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2484 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2485 ISD::NodeType ExtendKind) const {
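// For example (illustrative): an i8 return value is extended to i32 on Darwin but
// left as i8 elsewhere, and an i1 is only extended to i8.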
2486 MVT ReturnMVT = MVT::i32;
2488 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2489 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2490 // The ABI does not require i1, i8 or i16 to be extended.
2492 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2493 // always extending i8/i16 return values, so keep doing that for now.
2495 ReturnMVT = MVT::i8;
2498 EVT MinVT = getRegisterType(Context, ReturnMVT);
2499 return VT.bitsLT(MinVT) ? MinVT : VT;
2502 /// Reads two 32 bit registers and creates a 64 bit mask value.
2503 /// \param VA The current 32 bit value that needs to be assigned.
2504 /// \param NextVA The next 32 bit value that needs to be assigned.
2505 /// \param Root The parent DAG node.
2506 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2507 /// glue purposes. In case the DAG is already using a
2508 /// physical register instead of a virtual one, we should glue
2509 /// our new SDValue to the InFlag SDValue.
2510 /// \return a new 64-bit SDValue.
2511 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2512 SDValue &Root, SelectionDAG &DAG,
2513 const SDLoc &Dl, const X86Subtarget &Subtarget,
2514 SDValue *InFlag = nullptr) {
2515 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2516 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2517 assert(VA.getValVT() == MVT::v64i1 &&
2518 "Expecting first location of 64 bit width type");
2519 assert(NextVA.getValVT() == VA.getValVT() &&
2520 "The locations should have the same type");
2521 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2522 "The values should reside in two registers");
2526 SDValue ArgValueLo, ArgValueHi;
2528 MachineFunction &MF = DAG.getMachineFunction();
2529 const TargetRegisterClass *RC = &X86::GR32RegClass;
2531 // Read a 32 bit value from the registers
2532 if (nullptr == InFlag) {
2533 // When no physical register is present,
2534 // create an intermediate virtual register
2535 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2536 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2537 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2538 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2540 // When a physical register is available read the value from it and glue
2541 // the reads together.
2543 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2544 *InFlag = ArgValueLo.getValue(2);
2546 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2547 *InFlag = ArgValueHi.getValue(2);
2550 // Convert the i32 type into v32i1 type
2551 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2553 // Convert the i32 type into v32i1 type
2554 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2556 // Concatenate the two values together
2557 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2560 /// The function will lower a register of various sizes (8/16/32/64)
2561 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2562 /// \returns a DAG node that contains the operand after lowering to the mask type.
2563 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2564 const EVT &ValLoc, const SDLoc &Dl,
2565 SelectionDAG &DAG) {
2566 SDValue ValReturned = ValArg;
2568 if (ValVT == MVT::v1i1)
2569 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2571 if (ValVT == MVT::v64i1) {
2572 // On a 32 bit machine, this case is handled by getv64i1Argument.
2573 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2574 // On a 64 bit machine, there is no need to truncate the value; only bitcast it.
2577 switch (ValVT.getSimpleVT().SimpleTy) {
2588 llvm_unreachable("Expecting a vector of i1 types");
2591 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2593 return DAG.getBitcast(ValVT, ValReturned);
2596 /// Lower the result values of a call into the
2597 /// appropriate copies out of appropriate physical registers.
2599 SDValue X86TargetLowering::LowerCallResult(
2600 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2601 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2602 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2603 uint32_t *RegMask) const {
2605 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2606 // Assign locations to each value returned by this call.
2607 SmallVector<CCValAssign, 16> RVLocs;
2608 bool Is64Bit = Subtarget.is64Bit();
2609 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2611 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2613 // Copy all of the result registers out of their specified physreg.
2614 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2616 CCValAssign &VA = RVLocs[I];
2617 EVT CopyVT = VA.getLocVT();
2619 // In some calling conventions we need to remove the used registers
2620 // from the register mask.
2622 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2623 SubRegs.isValid(); ++SubRegs)
2624 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2627 // If this is x86-64, and we disabled SSE, we can't return FP values
2628 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2629 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2630 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2631 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2634 // If we prefer to use the value in xmm registers, copy it out as f80 and
2635 // use a truncate to move it from fp stack reg to xmm reg.
2636 bool RoundAfterCopy = false;
2637 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2638 isScalarFPTypeInSSEReg(VA.getValVT())) {
2639 if (!Subtarget.hasX87())
2640 report_fatal_error("X87 register return with X87 disabled");
2642 RoundAfterCopy = (CopyVT != VA.getLocVT());
2646 if (VA.needsCustom()) {
2647 assert(VA.getValVT() == MVT::v64i1 &&
2648 "Currently the only custom case is when we split v64i1 to 2 regs");
2650 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2652 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2654 Val = Chain.getValue(0);
2655 InFlag = Chain.getValue(2);
2659 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2660 // This truncation won't change the value.
2661 DAG.getIntPtrConstant(1, dl));
2663 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2664 if (VA.getValVT().isVector() &&
2665 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2666 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2667 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2668 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2670 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2673 InVals.push_back(Val);
2679 //===----------------------------------------------------------------------===//
2680 // C & StdCall & Fast Calling Convention implementation
2681 //===----------------------------------------------------------------------===//
2682 // The StdCall calling convention seems to be standard for many Windows API
2683 // routines and the like. It differs from the C calling convention just a little:
2684 // the callee should clean up the stack, not the caller. Symbols should also be
2685 // decorated in some fancy way :) It doesn't support any vector arguments.
2686 // For info on fast calling convention see Fast Calling Convention (tail call)
2687 // implementation LowerX86_32FastCCCallTo.
2689 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2691 enum StructReturnType {
2696 static StructReturnType
2697 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2699 return NotStructReturn;
2701 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2702 if (!Flags.isSRet())
2703 return NotStructReturn;
2704 if (Flags.isInReg() || IsMCU)
2705 return RegStructReturn;
2706 return StackStructReturn;
2709 /// Determines whether a function uses struct return semantics.
2710 static StructReturnType
2711 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2713 return NotStructReturn;
2715 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2716 if (!Flags.isSRet())
2717 return NotStructReturn;
2718 if (Flags.isInReg() || IsMCU)
2719 return RegStructReturn;
2720 return StackStructReturn;
2723 /// Make a copy of an aggregate at address specified by "Src" to address
2724 /// "Dst" with size and alignment information specified by the specific
2725 /// parameter attribute. The copy will be passed as a byval function parameter.
2726 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2727 SDValue Chain, ISD::ArgFlagsTy Flags,
2728 SelectionDAG &DAG, const SDLoc &dl) {
2729 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2731 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2732 /*isVolatile*/false, /*AlwaysInline=*/true,
2733 /*isTailCall*/false,
2734 MachinePointerInfo(), MachinePointerInfo());
2737 /// Return true if the calling convention is one that we can guarantee TCO for.
2738 static bool canGuaranteeTCO(CallingConv::ID CC) {
2739 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2740 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2741 CC == CallingConv::HHVM);
2744 /// Return true if we might ever do TCO for calls with this calling convention.
2745 static bool mayTailCallThisCC(CallingConv::ID CC) {
2747 // C calling conventions:
2748 case CallingConv::C:
2749 case CallingConv::Win64:
2750 case CallingConv::X86_64_SysV:
2751 // Callee pop conventions:
2752 case CallingConv::X86_ThisCall:
2753 case CallingConv::X86_StdCall:
2754 case CallingConv::X86_VectorCall:
2755 case CallingConv::X86_FastCall:
2758 return canGuaranteeTCO(CC);
2762 /// Return true if the function is being made into a tailcall target by
2763 /// changing its ABI.
2764 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2765 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2768 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2770 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2771 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2774 ImmutableCallSite CS(CI);
2775 CallingConv::ID CalleeCC = CS.getCallingConv();
2776 if (!mayTailCallThisCC(CalleeCC))
2783 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2784 const SmallVectorImpl<ISD::InputArg> &Ins,
2785 const SDLoc &dl, SelectionDAG &DAG,
2786 const CCValAssign &VA,
2787 MachineFrameInfo &MFI, unsigned i) const {
2788 // Create the nodes corresponding to a load from this parameter slot.
2789 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2790 bool AlwaysUseMutable = shouldGuaranteeTCO(
2791 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2792 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2794 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2796 // If a value is passed by pointer, we have the address passed instead of the
2797 // value itself. No need to extend if the mask value and location share the same bit width.
2799 bool ExtendedInMem =
2800 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2801 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2803 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2804 ValVT = VA.getLocVT();
2806 ValVT = VA.getValVT();
2808 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2809 // taken by a return address.
2811 if (CallConv == CallingConv::X86_INTR) {
2812 // X86 interrupts may take one or two arguments.
2813 // Unlike in a regular call, there will be no return address on the stack.
2814 // The offset of the last argument needs to be set to -4/-8 bytes,
2815 // while the offset of the first argument (when there are two) should be set to 0 bytes.
2816 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2817 if (Subtarget.is64Bit() && Ins.size() == 2) {
2818 // The stack pointer needs to be realigned for 64 bit handlers with error
2819 // code, so the argument offset changes by 8 bytes.
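// Illustrative values (not from the original source): for a 64-bit handler
// with a single argument, Ins.size() == 1 gives
// Offset = 8 * ((0 + 1) % 1 - 1) = -8, i.e. the slot normally holding the
// return address. With an error code (Ins.size() == 2) the first argument
// gets Offset 0 and the error code gets Offset -8, before the 8-byte
// realignment adjustment described above.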
2824 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2825 // changed with more analysis.
2826 // In the case of tail call optimization, mark all arguments mutable, since
2827 // they could be overwritten by the lowering of arguments for a tail call.
2828 if (Flags.isByVal()) {
2829 unsigned Bytes = Flags.getByValSize();
2830 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2831 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2832 // Adjust SP offset of interrupt parameter.
2833 if (CallConv == CallingConv::X86_INTR) {
2834 MFI.setObjectOffset(FI, Offset);
2836 return DAG.getFrameIndex(FI, PtrVT);
2839 // This is an argument in memory. We might be able to perform copy elision.
2840 if (Flags.isCopyElisionCandidate()) {
2841 EVT ArgVT = Ins[i].ArgVT;
2843 if (Ins[i].PartOffset == 0) {
2844 // If this is a one-part value or the first part of a multi-part value,
2845 // create a stack object for the entire argument value type and return a
2846 // load from our portion of it. This assumes that if the first part of an
2847 // argument is in memory, the rest will also be in memory.
2848 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2849 /*Immutable=*/false);
2850 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2852 ValVT, dl, Chain, PartAddr,
2853 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2855 // This is not the first piece of an argument in memory. See if there is
2856 // already a fixed stack object including this offset. If so, assume it
2857 // was created by the PartOffset == 0 branch above and create a load from
2858 // the appropriate offset into it.
2859 int64_t PartBegin = VA.getLocMemOffset();
2860 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2861 int FI = MFI.getObjectIndexBegin();
2862 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2863 int64_t ObjBegin = MFI.getObjectOffset(FI);
2864 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2865 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2868 if (MFI.isFixedObjectIndex(FI)) {
2870 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2871 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2873 ValVT, dl, Chain, Addr,
2874 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2875 Ins[i].PartOffset));
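// Illustrative case (hypothetical argument, not from the original source):
// an i64 argument lowered as two i32 parts on a 32-bit target. The
// PartOffset == 0 part creates a single 8-byte fixed object covering the
// whole value; the PartOffset == 4 part finds that object in the loop above
// and loads from FrameIndex + 4 instead of creating a second stack object.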
2880 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2881 VA.getLocMemOffset(), isImmutable);
2883 // Set SExt or ZExt flag.
2884 if (VA.getLocInfo() == CCValAssign::ZExt) {
2885 MFI.setObjectZExt(FI, true);
2886 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2887 MFI.setObjectSExt(FI, true);
2890 // Adjust SP offset of interrupt parameter.
2891 if (CallConv == CallingConv::X86_INTR) {
2892 MFI.setObjectOffset(FI, Offset);
2895 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2896 SDValue Val = DAG.getLoad(
2897 ValVT, dl, Chain, FIN,
2898 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2899 return ExtendedInMem
2900 ? (VA.getValVT().isVector()
2901 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2902 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2906 // FIXME: Get this from tablegen.
2907 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2908 const X86Subtarget &Subtarget) {
2909 assert(Subtarget.is64Bit());
2911 if (Subtarget.isCallingConvWin64(CallConv)) {
2912 static const MCPhysReg GPR64ArgRegsWin64[] = {
2913 X86::RCX, X86::RDX, X86::R8, X86::R9
2915 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2918 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2919 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2921 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2924 // FIXME: Get this from tablegen.
2925 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2926 CallingConv::ID CallConv,
2927 const X86Subtarget &Subtarget) {
2928 assert(Subtarget.is64Bit());
2929 if (Subtarget.isCallingConvWin64(CallConv)) {
2930 // The XMM registers which might contain var arg parameters are shadowed
2931 // in their paired GPR, so we only need to save the GPRs to their home slots.
2933 // TODO: __vectorcall will change this.
2937 const Function &F = MF.getFunction();
2938 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
2939 bool isSoftFloat = Subtarget.useSoftFloat();
2940 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2941 "SSE register cannot be used when SSE is disabled!");
2942 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2943 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2947 static const MCPhysReg XMMArgRegs64Bit[] = {
2948 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2949 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2951 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2955 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2956 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2957 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2958 return A.getValNo() < B.getValNo();
2963 SDValue X86TargetLowering::LowerFormalArguments(
2964 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2965 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2966 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2967 MachineFunction &MF = DAG.getMachineFunction();
2968 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2969 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2971 const Function &F = MF.getFunction();
2972 if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
2973 F.getName() == "main")
2974 FuncInfo->setForceFramePointer(true);
2976 MachineFrameInfo &MFI = MF.getFrameInfo();
2977 bool Is64Bit = Subtarget.is64Bit();
2978 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2981 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2982 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2984 if (CallConv == CallingConv::X86_INTR) {
2985 bool isLegal = Ins.size() == 1 ||
2986 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2987 (!Is64Bit && Ins[1].VT == MVT::i32)));
2989 report_fatal_error("X86 interrupts may take one or two arguments");
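  // For illustration, the accepted forms look like (hypothetical IR):
  //   define x86_intrcc void @isr(%frame* byval %f)           ; one argument
  //   define x86_intrcc void @pf(%frame* byval %f, i64 %err)  ; 64-bit, error code
  // where a 32-bit handler with an error code would take i32 %err instead.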
2992 // Assign locations to all of the incoming arguments.
2993 SmallVector<CCValAssign, 16> ArgLocs;
2994 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2996 // Allocate shadow area for Win64.
2998 CCInfo.AllocateStack(32, 8);
3000 CCInfo.AnalyzeArguments(Ins, CC_X86);
3002 // In the vectorcall calling convention a second pass is required for the HVA registers.
3004 if (CallingConv::X86_VectorCall == CallConv) {
3005 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
3008 // The next loop assumes that the locations are in the same order as the incoming arguments.
3010 assert(isSortedByValueNo(ArgLocs) &&
3011 "Argument Location list must be sorted before lowering");
3014 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
3016 assert(InsIndex < Ins.size() && "Invalid Ins index");
3017 CCValAssign &VA = ArgLocs[I];
3019 if (VA.isRegLoc()) {
3020 EVT RegVT = VA.getLocVT();
3021 if (VA.needsCustom()) {
3023 VA.getValVT() == MVT::v64i1 &&
3024 "Currently the only custom case is when we split v64i1 to 2 regs");
3026 // In the regcall calling convention, v64i1 values that are compiled for a
3027 // 32-bit arch are split up into two registers.
3029 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
3031 const TargetRegisterClass *RC;
3032 if (RegVT == MVT::i32)
3033 RC = &X86::GR32RegClass;
3034 else if (Is64Bit && RegVT == MVT::i64)
3035 RC = &X86::GR64RegClass;
3036 else if (RegVT == MVT::f32)
3037 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
3038 else if (RegVT == MVT::f64)
3039 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
3040 else if (RegVT == MVT::f80)
3041 RC = &X86::RFP80RegClass;
3042 else if (RegVT == MVT::f128)
3043 RC = &X86::FR128RegClass;
3044 else if (RegVT.is512BitVector())
3045 RC = &X86::VR512RegClass;
3046 else if (RegVT.is256BitVector())
3047 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
3048 else if (RegVT.is128BitVector())
3049 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
3050 else if (RegVT == MVT::x86mmx)
3051 RC = &X86::VR64RegClass;
3052 else if (RegVT == MVT::v1i1)
3053 RC = &X86::VK1RegClass;
3054 else if (RegVT == MVT::v8i1)
3055 RC = &X86::VK8RegClass;
3056 else if (RegVT == MVT::v16i1)
3057 RC = &X86::VK16RegClass;
3058 else if (RegVT == MVT::v32i1)
3059 RC = &X86::VK32RegClass;
3060 else if (RegVT == MVT::v64i1)
3061 RC = &X86::VK64RegClass;
3063 llvm_unreachable("Unknown argument type!");
3065 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
3066 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
3069 // If this is an 8- or 16-bit value, it is really passed promoted to 32
3070 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
3072 if (VA.getLocInfo() == CCValAssign::SExt)
3073 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3074 DAG.getValueType(VA.getValVT()));
3075 else if (VA.getLocInfo() == CCValAssign::ZExt)
3076 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3077 DAG.getValueType(VA.getValVT()));
3078 else if (VA.getLocInfo() == CCValAssign::BCvt)
3079 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3081 if (VA.isExtInLoc()) {
3082 // Handle MMX values passed in XMM regs.
3083 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3084 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3085 else if (VA.getValVT().isVector() &&
3086 VA.getValVT().getScalarType() == MVT::i1 &&
3087 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3088 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3089 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3090 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3092 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3095 assert(VA.isMemLoc());
3097 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3100 // If value is passed via pointer - do a load.
3101 if (VA.getLocInfo() == CCValAssign::Indirect)
3103 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3105 InVals.push_back(ArgValue);
3108 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3109 // The Swift calling convention does not require that we copy the sret argument
3110 // into %rax/%eax for the return, so we don't set SRetReturnReg for Swift.
3111 if (CallConv == CallingConv::Swift)
3114 // All x86 ABIs require that for returning structs by value we copy the
3115 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3116 // the argument into a virtual register so that we can access it from the return points.
3118 if (Ins[I].Flags.isSRet()) {
3119 unsigned Reg = FuncInfo->getSRetReturnReg();
3121 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3122 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3123 FuncInfo->setSRetReturnReg(Reg);
3125 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3126 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3131 unsigned StackSize = CCInfo.getNextStackOffset();
3132 // Align stack specially for tail calls.
3133 if (shouldGuaranteeTCO(CallConv,
3134 MF.getTarget().Options.GuaranteedTailCallOpt))
3135 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3137 // If the function takes variable number of arguments, make a frame index for
3138 // the start of the first vararg value... for expansion of llvm.va_start. We
3139 // can skip this if there are no va_start calls.
3140 if (MFI.hasVAStart() &&
3141 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3142 CallConv != CallingConv::X86_ThisCall))) {
3143 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3146 // Figure out if XMM registers are in use.
3147 assert(!(Subtarget.useSoftFloat() &&
3148 F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
3149 "SSE register cannot be used when SSE is disabled!");
3151 // 64-bit calling conventions support varargs and register parameters, so we
3152 // have to do extra work to spill them in the prologue.
3153 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3154 // Find the first unallocated argument registers.
3155 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3156 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3157 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3158 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3159 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3160 "SSE register cannot be used when SSE is disabled!");
3162 // Gather all the live in physical registers.
3163 SmallVector<SDValue, 6> LiveGPRs;
3164 SmallVector<SDValue, 8> LiveXMMRegs;
3166 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3167 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3169 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3171 if (!ArgXMMs.empty()) {
3172 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3173 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3174 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3175 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3176 LiveXMMRegs.push_back(
3177 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3182 // Get to the caller-allocated home save location. Add 8 to account
3183 // for the return address.
3184 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3185 FuncInfo->setRegSaveFrameIndex(
3186 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3187 // Fixup to set vararg frame on shadow area (4 x i64).
3189 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3191 // For X86-64, if there are vararg parameters that are passed via
3192 // registers, then we must store them to their spots on the stack so
3193 // they may be loaded by dereferencing the result of va_next.
3194 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3195 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3196 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3197 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
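// Illustrative numbers for the SysV x86-64 case: if the fixed arguments
// already consumed 2 of the 6 GPRs and 1 of the 8 XMMs, then
// VarArgsGPOffset == 16, VarArgsFPOffset == 6 * 8 + 1 * 16 == 64, and the
// register save area created above is 6 * 8 + 8 * 16 == 176 bytes - the
// reg_save_area layout that va_arg expects.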
3200 // Store the integer parameter registers.
3201 SmallVector<SDValue, 8> MemOps;
3202 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3203 getPointerTy(DAG.getDataLayout()));
3204 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3205 for (SDValue Val : LiveGPRs) {
3206 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3207 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3209 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3210 MachinePointerInfo::getFixedStack(
3211 DAG.getMachineFunction(),
3212 FuncInfo->getRegSaveFrameIndex(), Offset));
3213 MemOps.push_back(Store);
3217 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3218 // Now store the XMM (fp + vector) parameter registers.
3219 SmallVector<SDValue, 12> SaveXMMOps;
3220 SaveXMMOps.push_back(Chain);
3221 SaveXMMOps.push_back(ALVal);
3222 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3223 FuncInfo->getRegSaveFrameIndex(), dl));
3224 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3225 FuncInfo->getVarArgsFPOffset(), dl));
3226 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3228 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3229 MVT::Other, SaveXMMOps));
3232 if (!MemOps.empty())
3233 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3236 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3237 // Find the largest legal vector type.
3238 MVT VecVT = MVT::Other;
3239 // FIXME: Only some x86_32 calling conventions support AVX512.
3240 if (Subtarget.hasAVX512() &&
3241 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3242 CallConv == CallingConv::Intel_OCL_BI)))
3243 VecVT = MVT::v16f32;
3244 else if (Subtarget.hasAVX())
3246 else if (Subtarget.hasSSE2())
3249 // We forward some GPRs and some vector types.
3250 SmallVector<MVT, 2> RegParmTypes;
3251 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3252 RegParmTypes.push_back(IntVT);
3253 if (VecVT != MVT::Other)
3254 RegParmTypes.push_back(VecVT);
3256 // Compute the set of forwarded registers. The rest are scratch.
3257 SmallVectorImpl<ForwardedRegister> &Forwards =
3258 FuncInfo->getForwardedMustTailRegParms();
3259 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3261 // Conservatively forward AL on x86_64, since it might be used for varargs.
3262 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3263 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3264 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3267 // Copy all forwards from physical to virtual registers.
3268 for (ForwardedRegister &F : Forwards) {
3269 // FIXME: Can we use a less constrained schedule?
3270 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3271 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3272 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3276 // Some CCs need callee pop.
3277 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3278 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3279 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3280 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3281 // X86 interrupts must pop the error code (and the alignment padding) if present.
3283 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3285 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3286 // If this is an sret function, the return should pop the hidden pointer.
3287 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3288 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3289 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3290 FuncInfo->setBytesToPopOnReturn(4);
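// For illustration: a 32-bit cdecl function returning a struct through an
// sret pointer on a non-MSVC target ends with "ret $4", popping the hidden
// pointer pushed by its caller, while a plain cdecl function falls into the
// setBytesToPopOnReturn(0) case above.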
3294 // RegSaveFrameIndex is X86-64 only.
3295 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3296 if (CallConv == CallingConv::X86_FastCall ||
3297 CallConv == CallingConv::X86_ThisCall)
3298 // fastcc functions can't have varargs.
3299 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3302 FuncInfo->setArgumentStackSize(StackSize);
3304 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3305 EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
3306 if (Personality == EHPersonality::CoreCLR) {
3308 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3309 // that we'd prefer this slot be allocated towards the bottom of the frame
3310 // (i.e. near the stack pointer after allocating the frame). Every
3311 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3312 // offset from the bottom of this and each funclet's frame must be the
3313 // same, so the size of funclets' (mostly empty) frames is dictated by
3314 // how far this slot is from the bottom (since they allocate just enough
3315 // space to accommodate holding this slot at the correct offset).
3316 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3317 EHInfo->PSPSymFrameIdx = PSPSymFI;
3321 if (CallConv == CallingConv::X86_RegCall ||
3322 F.hasFnAttribute("no_caller_saved_registers")) {
3323 MachineRegisterInfo &MRI = MF.getRegInfo();
3324 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3325 MRI.disableCalleeSavedRegister(Pair.first);
3331 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3332 SDValue Arg, const SDLoc &dl,
3334 const CCValAssign &VA,
3335 ISD::ArgFlagsTy Flags) const {
3336 unsigned LocMemOffset = VA.getLocMemOffset();
3337 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3338 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3340 if (Flags.isByVal())
3341 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3343 return DAG.getStore(
3344 Chain, dl, Arg, PtrOff,
3345 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3348 /// Emit a load of return address if tail call
3349 /// optimization is performed and it is required.
3350 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3351 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3352 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3353 // Adjust the Return address stack slot.
3354 EVT VT = getPointerTy(DAG.getDataLayout());
3355 OutRetAddr = getReturnAddressFrameIndex(DAG);
3357 // Load the "old" Return address.
3358 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3359 return SDValue(OutRetAddr.getNode(), 1);
3362 /// Emit a store of the return address if tail call
3363 /// optimization is performed and it is required (FPDiff!=0).
3364 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3365 SDValue Chain, SDValue RetAddrFrIdx,
3366 EVT PtrVT, unsigned SlotSize,
3367 int FPDiff, const SDLoc &dl) {
3368 // Store the return address to the appropriate stack slot.
3369 if (!FPDiff) return Chain;
3370 // Calculate the new stack slot for the return address.
3371 int NewReturnAddrFI =
3372 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3374 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3375 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3376 MachinePointerInfo::getFixedStack(
3377 DAG.getMachineFunction(), NewReturnAddrFI));
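// Worked example (illustrative values): with SlotSize == 8 and FPDiff == -16,
// the new return address slot is the fixed object at offset -16 - 8 == -24,
// i.e. just below the extra stack the callee's larger argument area will use.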
3381 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3382 /// operation of the specified width.
3383 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3385 unsigned NumElems = VT.getVectorNumElements();
3386 SmallVector<int, 8> Mask;
3387 Mask.push_back(NumElems);
3388 for (unsigned i = 1; i != NumElems; ++i)
3390 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
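// Illustrative expansion (not from the original source): for MVT::v4f32 the
// mask built above is <4, 1, 2, 3>, so element 0 comes from V2 and elements
// 1-3 from V1, which is exactly the MOVSS behaviour; for MVT::v2f64 the mask
// is <2, 1>, matching MOVSD.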
3394 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3395 SmallVectorImpl<SDValue> &InVals) const {
3396 SelectionDAG &DAG = CLI.DAG;
3398 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3399 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3400 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3401 SDValue Chain = CLI.Chain;
3402 SDValue Callee = CLI.Callee;
3403 CallingConv::ID CallConv = CLI.CallConv;
3404 bool &isTailCall = CLI.IsTailCall;
3405 bool isVarArg = CLI.IsVarArg;
3407 MachineFunction &MF = DAG.getMachineFunction();
3408 bool Is64Bit = Subtarget.is64Bit();
3409 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3410 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3411 bool IsSibcall = false;
3412 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3413 auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
3414 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3415 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3416 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3417 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3419 if (CallConv == CallingConv::X86_INTR)
3420 report_fatal_error("X86 interrupts may not be called directly");
3422 if (Attr.getValueAsString() == "true")
3425 if (Subtarget.isPICStyleGOT() &&
3426 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3427 // If we are using a GOT, disable tail calls to external symbols with
3428 // default visibility. Tail calling such a symbol requires using a GOT
3429 // relocation, which forces early binding of the symbol. This breaks code
3430 // that requires lazy function symbol resolution. Using musttail or
3431 // GuaranteedTailCallOpt will override this.
3432 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3433 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3434 G->getGlobal()->hasDefaultVisibility()))
3438 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
3440 // Force this to be a tail call. The verifier rules are enough to ensure
3441 // that we can lower this successfully without moving the return address around.
3444 } else if (isTailCall) {
3445 // Check if it's really possible to do a tail call.
3446 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3447 isVarArg, SR != NotStructReturn,
3448 MF.getFunction().hasStructRetAttr(), CLI.RetTy,
3449 Outs, OutVals, Ins, DAG);
3451 // Sibcalls are automatically detected tailcalls which do not require ABI changes.
3453 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3460 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3461 "Var args not supported with calling convention fastcc, ghc or hipe");
3463 // Analyze operands of the call, assigning locations to each operand.
3464 SmallVector<CCValAssign, 16> ArgLocs;
3465 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3467 // Allocate shadow area for Win64.
3469 CCInfo.AllocateStack(32, 8);
3471 CCInfo.AnalyzeArguments(Outs, CC_X86);
3473 // In the vectorcall calling convention a second pass is required for the HVA registers.
3475 if (CallingConv::X86_VectorCall == CallConv) {
3476 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3479 // Get a count of how many bytes are to be pushed on the stack.
3480 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3482 // This is a sibcall. The memory operands are available in the caller's
3483 // own caller's stack.
3485 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3486 canGuaranteeTCO(CallConv))
3487 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3490 if (isTailCall && !IsSibcall && !IsMustTail) {
3491 // Lower arguments at fp - stackoffset + fpdiff.
3492 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3494 FPDiff = NumBytesCallerPushed - NumBytes;
3496 // Set the delta of movement of the returnaddr stackslot.
3497 // But only update it if the movement is larger than the previous delta (FPDiff more negative).
3498 if (FPDiff < X86Info->getTCReturnAddrDelta())
3499 X86Info->setTCReturnAddrDelta(FPDiff);
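// Worked example (illustrative sizes): if the caller was entered with 8
// bytes of stack arguments (NumBytesCallerPushed == 8) and the callee needs
// 24 (NumBytes == 24), FPDiff == -16: the outgoing arguments extend 16 bytes
// below the incoming ones, so the return address slot must be moved down by
// that amount before the tail jump.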
3502 unsigned NumBytesToPush = NumBytes;
3503 unsigned NumBytesToPop = NumBytes;
3505 // If we have an inalloca argument, all stack space has already been allocated
3506 // for us and is right at the top of the stack. We don't support multiple
3507 // arguments passed in memory when using inalloca.
3508 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3510 if (!ArgLocs.back().isMemLoc())
3511 report_fatal_error("cannot use inalloca attribute on a register "
3513 if (ArgLocs.back().getLocMemOffset() != 0)
3514 report_fatal_error("any parameter with the inalloca attribute must be "
3515 "the only memory argument");
3519 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3520 NumBytes - NumBytesToPush, dl);
3522 SDValue RetAddrFrIdx;
3523 // Load return address for tail calls.
3524 if (isTailCall && FPDiff)
3525 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3526 Is64Bit, FPDiff, dl);
3528 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3529 SmallVector<SDValue, 8> MemOpChains;
3532 // The next loop assumes that the locations are in the same order as the outgoing arguments.
3534 assert(isSortedByValueNo(ArgLocs) &&
3535 "Argument Location list must be sorted before lowering");
3537 // Walk the register/memloc assignments, inserting copies/loads. In the case
3538 // of tail call optimization, arguments are handled later.
3539 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3540 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3542 assert(OutIndex < Outs.size() && "Invalid Out index");
3543 // Skip inalloca arguments, they have already been written.
3544 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3545 if (Flags.isInAlloca())
3548 CCValAssign &VA = ArgLocs[I];
3549 EVT RegVT = VA.getLocVT();
3550 SDValue Arg = OutVals[OutIndex];
3551 bool isByVal = Flags.isByVal();
3553 // Promote the value if needed.
3554 switch (VA.getLocInfo()) {
3555 default: llvm_unreachable("Unknown loc info!");
3556 case CCValAssign::Full: break;
3557 case CCValAssign::SExt:
3558 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3560 case CCValAssign::ZExt:
3561 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3563 case CCValAssign::AExt:
3564 if (Arg.getValueType().isVector() &&
3565 Arg.getValueType().getVectorElementType() == MVT::i1)
3566 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3567 else if (RegVT.is128BitVector()) {
3568 // Special case: passing MMX values in XMM registers.
3569 Arg = DAG.getBitcast(MVT::i64, Arg);
3570 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3571 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3573 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3575 case CCValAssign::BCvt:
3576 Arg = DAG.getBitcast(RegVT, Arg);
3578 case CCValAssign::Indirect: {
3579 // Store the argument.
3580 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3581 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3582 Chain = DAG.getStore(
3583 Chain, dl, Arg, SpillSlot,
3584 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3590 if (VA.needsCustom()) {
3591 assert(VA.getValVT() == MVT::v64i1 &&
3592 "Currently the only custom case is when we split v64i1 to 2 regs");
3593 // Split v64i1 value into two registers
3594 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3596 } else if (VA.isRegLoc()) {
3597 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3598 if (isVarArg && IsWin64) {
3599 // The Win64 ABI requires an argument XMM register to be copied to the
3600 // corresponding shadow register if the callee is a varargs function.
3601 unsigned ShadowReg = 0;
3602 switch (VA.getLocReg()) {
3603 case X86::XMM0: ShadowReg = X86::RCX; break;
3604 case X86::XMM1: ShadowReg = X86::RDX; break;
3605 case X86::XMM2: ShadowReg = X86::R8; break;
3606 case X86::XMM3: ShadowReg = X86::R9; break;
3609 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3611 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3612 assert(VA.isMemLoc());
3613 if (!StackPtr.getNode())
3614 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3615 getPointerTy(DAG.getDataLayout()));
3616 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3617 dl, DAG, VA, Flags));
3621 if (!MemOpChains.empty())
3622 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3624 if (Subtarget.isPICStyleGOT()) {
3625 // ELF / PIC requires the GOT pointer to be in the EBX register before function calls via the PLT.
3628 RegsToPass.push_back(std::make_pair(
3629 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3630 getPointerTy(DAG.getDataLayout()))));
3632 // If we are tail calling and generating PIC/GOT style code, load the
3633 // address of the callee into ECX. The value in ECX is used as the target of
3634 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3635 // for tail calls on PIC/GOT architectures. Normally we would just put the
3636 // address of the GOT into ebx and then call target@PLT. But for tail calls
3637 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3640 // Note: The actual moving to ECX is done further down.
3641 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3642 if (G && !G->getGlobal()->hasLocalLinkage() &&
3643 G->getGlobal()->hasDefaultVisibility())
3644 Callee = LowerGlobalAddress(Callee, DAG);
3645 else if (isa<ExternalSymbolSDNode>(Callee))
3646 Callee = LowerExternalSymbol(Callee, DAG);
3650 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3651 // From AMD64 ABI document:
3652 // For calls that may call functions that use varargs or stdargs
3653 // (prototype-less calls or calls to functions containing ellipsis (...) in
3654 // the declaration) %al is used as a hidden argument to specify the number
3655 // of SSE registers used. The contents of %al do not need to match exactly
3656 // the number of registers, but must be an upper bound on the number of SSE
3657 // registers used and is in the range 0 - 8 inclusive.
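    // For illustration (hypothetical call, not from the original source):
    //   printf("%f %f\n", a, b)   ; two doubles in XMM0, XMM1
    // allocates two XMM registers, so NumXMMRegs below is 2 and AL is loaded
    // with 2 before the call; any value in [2, 8] would also satisfy the ABI.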
3659 // Count the number of XMM registers allocated.
3660 static const MCPhysReg XMMArgRegs[] = {
3661 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3662 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3664 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3665 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3666 && "SSE registers cannot be used when SSE is disabled");
3668 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3669 DAG.getConstant(NumXMMRegs, dl,
3673 if (isVarArg && IsMustTail) {
3674 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3675 for (const auto &F : Forwards) {
3676 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3677 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3681 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3682 // don't need this because the eligibility check rejects calls that require
3683 // shuffling arguments passed in memory.
3684 if (!IsSibcall && isTailCall) {
3685 // Force all the incoming stack arguments to be loaded from the stack
3686 // before any new outgoing arguments are stored to the stack, because the
3687 // outgoing stack slots may alias the incoming argument stack slots, and
3688 // the alias isn't otherwise explicit. This is slightly more conservative
3689 // than necessary, because it means that each store effectively depends
3690 // on every argument instead of just those arguments it would clobber.
3691 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3693 SmallVector<SDValue, 8> MemOpChains2;
3696 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3698 CCValAssign &VA = ArgLocs[I];
3700 if (VA.isRegLoc()) {
3701 if (VA.needsCustom()) {
3702 assert((CallConv == CallingConv::X86_RegCall) &&
3703 "Expecting custom case only in regcall calling convention");
3704 // This means that we are in the special case where one argument was
3705 // passed through two register locations - skip the next location.
3712 assert(VA.isMemLoc());
3713 SDValue Arg = OutVals[OutsIndex];
3714 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3715 // Skip inalloca arguments. They don't require any work.
3716 if (Flags.isInAlloca())
3718 // Create frame index.
3719 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3720 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3721 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3722 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3724 if (Flags.isByVal()) {
3725 // Copy relative to framepointer.
3726 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3727 if (!StackPtr.getNode())
3728 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3729 getPointerTy(DAG.getDataLayout()));
3730 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3733 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3737 // Store relative to framepointer.
3738 MemOpChains2.push_back(DAG.getStore(
3739 ArgChain, dl, Arg, FIN,
3740 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3744 if (!MemOpChains2.empty())
3745 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3747 // Store the return address to the appropriate stack slot.
3748 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3749 getPointerTy(DAG.getDataLayout()),
3750 RegInfo->getSlotSize(), FPDiff, dl);
3753 // Build a sequence of copy-to-reg nodes chained together with token chain
3754 // and flag operands which copy the outgoing args into registers.
3756 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3757 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3758 RegsToPass[i].second, InFlag);
3759 InFlag = Chain.getValue(1);
3762 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3763 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3764 // In the 64-bit large code model, we have to make all calls
3765 // through a register, since the call instruction's 32-bit
3766 // pc-relative offset may not be large enough to hold the whole address.
3768 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3769 // If the callee is a GlobalAddress node (quite common, every direct call
3770 // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3772 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3774 // We should use an extra load for direct calls to dllimported functions in non-JIT mode.
3776 const GlobalValue *GV = G->getGlobal();
3777 if (!GV->hasDLLImportStorageClass()) {
3778 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3780 Callee = DAG.getTargetGlobalAddress(
3781 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3783 if (OpFlags == X86II::MO_GOTPCREL) {
3785 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3786 getPointerTy(DAG.getDataLayout()), Callee);
3787 // Add extra indirection
3788 Callee = DAG.getLoad(
3789 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3790 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3793 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3794 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
3795 unsigned char OpFlags =
3796 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3798 Callee = DAG.getTargetExternalSymbol(
3799 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3801 if (OpFlags == X86II::MO_GOTPCREL) {
3802 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3803 getPointerTy(DAG.getDataLayout()), Callee);
3804 Callee = DAG.getLoad(
3805 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3806 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3808 } else if (Subtarget.isTarget64BitILP32() &&
3809 Callee->getValueType(0) == MVT::i32) {
3810 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
3811 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3814 // Returns a chain & a flag for retval copy to use.
3815 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3816 SmallVector<SDValue, 8> Ops;
3818 if (!IsSibcall && isTailCall) {
3819 Chain = DAG.getCALLSEQ_END(Chain,
3820 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3821 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3822 InFlag = Chain.getValue(1);
3825 Ops.push_back(Chain);
3826 Ops.push_back(Callee);
3829 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3831 // Add argument registers to the end of the list so that they are known live into the call.
3833 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3834 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3835 RegsToPass[i].second.getValueType()));
3837 // Add a register mask operand representing the call-preserved registers.
3838 // If HasNCSR is asserted (the NoCallerSavedRegisters attribute exists), then we
3839 // use the X86_INTR calling convention because it has the same CSR mask
3840 // (same preserved registers).
3841 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3842 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3843 assert(Mask && "Missing call preserved mask for calling convention");
3845 // If this is an invoke in a 32-bit function using a funclet-based
3846 // personality, assume the function clobbers all registers. If an exception
3847 // is thrown, the runtime will not restore CSRs.
3848 // FIXME: Model this more precisely so that we can register allocate across
3849 // the normal edge and spill and fill across the exceptional edge.
3850 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3851 const Function &CallerFn = MF.getFunction();
3852 EHPersonality Pers =
3853 CallerFn.hasPersonalityFn()
3854 ? classifyEHPersonality(CallerFn.getPersonalityFn())
3855 : EHPersonality::Unknown;
3856 if (isFuncletEHPersonality(Pers))
3857 Mask = RegInfo->getNoPreservedMask();
3860 // Define a new register mask from the existing mask.
3861 uint32_t *RegMask = nullptr;
3863 // In some calling conventions we need to remove the used physical registers
3864 // from the reg mask.
3865 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3866 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3868 // Allocate a new Reg Mask and copy Mask.
3869 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3870 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3871 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3873 // Make sure all sub registers of the argument registers are reset in the RegMask.
3875 for (auto const &RegPair : RegsToPass)
3876 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3877 SubRegs.isValid(); ++SubRegs)
3878 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3880 // Create the RegMask Operand according to our updated mask.
3881 Ops.push_back(DAG.getRegisterMask(RegMask));
3883 // Create the RegMask Operand according to the static mask.
3884 Ops.push_back(DAG.getRegisterMask(Mask));
3887 if (InFlag.getNode())
3888 Ops.push_back(InFlag);
3892 //// If this is the first return lowered for this function, add the regs
3893 //// to the liveout set for the function.
3894 // This isn't right, although it's probably harmless on x86; liveouts
3895 // should be computed from returns not tail calls. Consider a void
3896 // function making a tail call to a function returning int.
3897 MF.getFrameInfo().setHasTailCall();
3898 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3901 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3902 InFlag = Chain.getValue(1);
3904 // Create the CALLSEQ_END node.
3905 unsigned NumBytesForCalleeToPop;
3906 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3907 DAG.getTarget().Options.GuaranteedTailCallOpt))
3908 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3909 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3910 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3911 SR == StackStructReturn)
3912 // If this is a call to a struct-return function, the callee
3913 // pops the hidden struct pointer, so we have to push it back.
3914 // This is common for Darwin/X86, Linux & Mingw32 targets.
3915 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3916 NumBytesForCalleeToPop = 4;
3918 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3920 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3921 // No need to reset the stack after the call if the call doesn't return. To
3922 // keep the MI verifier happy, we'll pretend the callee does it for us.
3923 NumBytesForCalleeToPop = NumBytes;
3926 // Returns a flag for retval copy to use.
3928 Chain = DAG.getCALLSEQ_END(Chain,
3929 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3930 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3933 InFlag = Chain.getValue(1);
3936 // Handle result values, copying them out of physregs into vregs that we return.
3938 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3942 //===----------------------------------------------------------------------===//
3943 // Fast Calling Convention (tail call) implementation
3944 //===----------------------------------------------------------------------===//
3946 // Like the stdcall convention (callee cleans arguments), except that ECX is
3947 // reserved for storing the tail called function address. Only 2 registers are
3948 // free for argument passing (inreg). Tail call optimization is performed if:
3950 // * tailcallopt is enabled
3951 // * caller/callee are fastcc
3952 // On X86_64 architecture with GOT-style position independent code only local
3953 // (within module) calls are supported at the moment.
3954 // To keep the stack aligned according to the platform ABI, the function
3955 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3956 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3957 // If a tail called function has more arguments than the caller, the
3958 // caller needs to make sure that there is room to move the RETADDR to. This is
3959 // achieved by reserving an area the size of the argument delta right after the
3960 // original RETADDR, but before the saved framepointer or the spilled registers
3961 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
3973 /// Align the stack size, e.g. to 16n + 12, to satisfy a 16-byte alignment requirement.
3976 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3977 SelectionDAG& DAG) const {
3978 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3979 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3980 unsigned StackAlignment = TFI.getStackAlignment();
3981 uint64_t AlignMask = StackAlignment - 1;
3982 int64_t Offset = StackSize;
3983 unsigned SlotSize = RegInfo->getSlotSize();
3984 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3985 // The misalignment is no larger than StackAlignment - SlotSize, so just add the difference.
3986 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3988 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3989 Offset = ((~AlignMask) & Offset) + StackAlignment +
3990 (StackAlignment-SlotSize);
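// Worked example (illustrative values): with StackAlignment == 16 and
// SlotSize == 4, StackSize == 20 takes the first branch since 20 & 15 == 4 is
// <= 12 and becomes 20 + (12 - 4) == 28 == 16 + 12; StackSize == 30 takes the
// second branch since 30 & 15 == 14 > 12 and becomes (30 & ~15) + 16 + 12 ==
// 44 == 32 + 12, leaving room for the return address slot while keeping the
// stack 16-byte aligned.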
3995 /// Return true if the given stack call argument is already available in the
3996 /// same position (relatively) in the caller's incoming argument stack.
3998 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3999 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
4000 const X86InstrInfo *TII, const CCValAssign &VA) {
4001 unsigned Bytes = Arg.getValueSizeInBits() / 8;
4004 // Look through nodes that don't alter the bits of the incoming value.
4005 unsigned Op = Arg.getOpcode();
4006 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
4007 Arg = Arg.getOperand(0);
4010 if (Op == ISD::TRUNCATE) {
4011 const SDValue &TruncInput = Arg.getOperand(0);
4012 if (TruncInput.getOpcode() == ISD::AssertZext &&
4013 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
4014 Arg.getValueType()) {
4015 Arg = TruncInput.getOperand(0);
4023 if (Arg.getOpcode() == ISD::CopyFromReg) {
4024 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
4025 if (!TargetRegisterInfo::isVirtualRegister(VR))
4027 MachineInstr *Def = MRI->getVRegDef(VR);
4030 if (!Flags.isByVal()) {
4031 if (!TII->isLoadFromStackSlot(*Def, FI))
4034 unsigned Opcode = Def->getOpcode();
4035 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
4036 Opcode == X86::LEA64_32r) &&
4037 Def->getOperand(1).isFI()) {
4038 FI = Def->getOperand(1).getIndex();
4039 Bytes = Flags.getByValSize();
4043 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
4044 if (Flags.isByVal())
4045 // ByVal argument is passed in as a pointer but it's now being
4046 // dereferenced. e.g.
4047 // define @foo(%struct.X* %A) {
4048 // tail call @bar(%struct.X* byval %A)
4051 SDValue Ptr = Ld->getBasePtr();
4052 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
4055 FI = FINode->getIndex();
4056 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
4057 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
4058 FI = FINode->getIndex();
4059 Bytes = Flags.getByValSize();
4063 assert(FI != INT_MAX);
4064 if (!MFI.isFixedObjectIndex(FI))
4067 if (Offset != MFI.getObjectOffset(FI))
4070 // If this is not byval, check that the argument stack object is immutable.
4071 // inalloca and argument copy elision can create mutable argument stack
4072 // objects. Byval objects can be mutated, but a byval call intends to pass the pointee.
4074 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
4077 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4078 // If the argument location is wider than the argument type, check that any
4079 // extension flags match.
4080 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4081 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4086 return Bytes == MFI.getObjectSize(FI);
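// For illustration (hypothetical 32-bit cdecl functions): if f(i32 %a, i32 %b)
// tail-calls g(i32 %a, i32 %b), each outgoing stack argument is a load from
// the immutable fixed object at the same offset and size, so the checks above
// succeed; if it called g(i32 %b, i32 %a) instead, the offsets would differ
// and the sibcall would be rejected.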
4089 /// Check whether the call is eligible for tail call optimization. Targets
4090 /// that want to do tail call optimization should implement this function.
4091 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4092 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4093 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4094 const SmallVectorImpl<ISD::OutputArg> &Outs,
4095 const SmallVectorImpl<SDValue> &OutVals,
4096 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4097 if (!mayTailCallThisCC(CalleeCC))
4100 // If -tailcallopt is specified, make fastcc functions tail-callable.
4101 MachineFunction &MF = DAG.getMachineFunction();
4102 const Function &CallerF = MF.getFunction();
4104 // If the function return type is x86_fp80 and the callee return type is not,
4105 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4106 // perform a tailcall optimization here.
4107 if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4110 CallingConv::ID CallerCC = CallerF.getCallingConv();
4111 bool CCMatch = CallerCC == CalleeCC;
4112 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4113 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4115 // Win64 functions have extra shadow space for argument homing. Don't do the
4116 // sibcall if the caller and callee have mismatched expectations for this space.
4118 if (IsCalleeWin64 != IsCallerWin64)
4121 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4122 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4127 // Look for obvious safe cases to perform tail call optimization that do not
4128 // require ABI changes. This is what gcc calls sibcall.
4130 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4131 // emit a special epilogue.
4132 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4133 if (RegInfo->needsStackRealignment(MF))
4136 // Also avoid sibcall optimization if either caller or callee uses struct
4137 // return semantics.
4138 if (isCalleeStructRet || isCallerStructRet)
4141 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4143 LLVMContext &C = *DAG.getContext();
4144 if (isVarArg && !Outs.empty()) {
4145 // Optimizing for varargs on Win64 is unlikely to be safe without
4146 // additional testing.
4147 if (IsCalleeWin64 || IsCallerWin64)
4150 SmallVector<CCValAssign, 16> ArgLocs;
4151 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4153 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4154 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4155 if (!ArgLocs[i].isRegLoc())
4159 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4160 // stack. Therefore, if it's not used by the call it is not safe to optimize
4161 // this into a sibcall.
4162 bool Unused = false;
4163 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4170 SmallVector<CCValAssign, 16> RVLocs;
4171 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4172 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4173 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4174 CCValAssign &VA = RVLocs[i];
4175 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4180 // Check that the call results are passed in the same way.
4181 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4182 RetCC_X86, RetCC_X86))
4184 // The callee has to preserve all registers the caller needs to preserve.
4185 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4186 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4188 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4189 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4193 unsigned StackArgsSize = 0;
4195 // If the callee takes no arguments then go on to check the results of the call.
4197 if (!Outs.empty()) {
4198 // Check if stack adjustment is needed. For now, do not do this if any
4199 // argument is passed on the stack.
4200 SmallVector<CCValAssign, 16> ArgLocs;
4201 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4203 // Allocate shadow area for Win64
4205 CCInfo.AllocateStack(32, 8);
4207 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4208 StackArgsSize = CCInfo.getNextStackOffset();
4210 if (CCInfo.getNextStackOffset()) {
4211 // Check if the arguments are already laid out in the right way as
4212 // the caller's fixed stack objects.
4213 MachineFrameInfo &MFI = MF.getFrameInfo();
4214 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4215 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4216 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4217 CCValAssign &VA = ArgLocs[i];
4218 SDValue Arg = OutVals[i];
4219 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4220 if (VA.getLocInfo() == CCValAssign::Indirect)
4222 if (!VA.isRegLoc()) {
4223 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4230 bool PositionIndependent = isPositionIndependent();
4231 // If the tailcall address may be in a register, then make sure it's
4232 // possible to register allocate for it. In 32-bit, the call address can
4233 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4234 // callee-saved registers are restored. These happen to be the same
4235 // registers used to pass 'inreg' arguments so watch out for those.
4236 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4237 !isa<ExternalSymbolSDNode>(Callee)) ||
4238 PositionIndependent)) {
4239 unsigned NumInRegs = 0;
4240 // In PIC we need an extra register to formulate the address computation for the callee.
4242 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4244 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4245 CCValAssign &VA = ArgLocs[i];
4248 unsigned Reg = VA.getLocReg();
4251 case X86::EAX: case X86::EDX: case X86::ECX:
4252 if (++NumInRegs == MaxInRegs)
4259 const MachineRegisterInfo &MRI = MF.getRegInfo();
4260 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4264 bool CalleeWillPop =
4265 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4266 MF.getTarget().Options.GuaranteedTailCallOpt);
4268 if (unsigned BytesToPop =
4269 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4270 // If we have bytes to pop, the callee must pop them.
4271 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4272 if (!CalleePopMatches)
4274 } else if (CalleeWillPop && StackArgsSize > 0) {
4275 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4283 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4284 const TargetLibraryInfo *libInfo) const {
4285 return X86::createFastISel(funcInfo, libInfo);
4288 //===----------------------------------------------------------------------===//
4289 // Other Lowering Hooks
4290 //===----------------------------------------------------------------------===//
4292 static bool MayFoldLoad(SDValue Op) {
4293 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4296 static bool MayFoldIntoStore(SDValue Op) {
4297 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4300 static bool MayFoldIntoZeroExtend(SDValue Op) {
4301 if (Op.hasOneUse()) {
4302 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4303 return (ISD::ZERO_EXTEND == Opcode);
4308 static bool isTargetShuffle(unsigned Opcode) {
4310 default: return false;
4311 case X86ISD::BLENDI:
4312 case X86ISD::PSHUFB:
4313 case X86ISD::PSHUFD:
4314 case X86ISD::PSHUFHW:
4315 case X86ISD::PSHUFLW:
4317 case X86ISD::INSERTPS:
4318 case X86ISD::EXTRQI:
4319 case X86ISD::INSERTQI:
4320 case X86ISD::PALIGNR:
4321 case X86ISD::VSHLDQ:
4322 case X86ISD::VSRLDQ:
4323 case X86ISD::MOVLHPS:
4324 case X86ISD::MOVHLPS:
4325 case X86ISD::MOVLPS:
4326 case X86ISD::MOVLPD:
4327 case X86ISD::MOVSHDUP:
4328 case X86ISD::MOVSLDUP:
4329 case X86ISD::MOVDDUP:
4332 case X86ISD::UNPCKL:
4333 case X86ISD::UNPCKH:
4334 case X86ISD::VBROADCAST:
4335 case X86ISD::VPERMILPI:
4336 case X86ISD::VPERMILPV:
4337 case X86ISD::VPERM2X128:
4338 case X86ISD::VPERMIL2:
4339 case X86ISD::VPERMI:
4340 case X86ISD::VPPERM:
4341 case X86ISD::VPERMV:
4342 case X86ISD::VPERMV3:
4343 case X86ISD::VPERMIV3:
4344 case X86ISD::VZEXT_MOVL:
4349 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4351 default: return false;
4353 case X86ISD::PSHUFB:
4354 case X86ISD::VPERMILPV:
4355 case X86ISD::VPERMIL2:
4356 case X86ISD::VPPERM:
4357 case X86ISD::VPERMV:
4358 case X86ISD::VPERMV3:
4359 case X86ISD::VPERMIV3:
4361 // 'Faux' Target Shuffles.
4368 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4369 MachineFunction &MF = DAG.getMachineFunction();
4370 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4371 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4372 int ReturnAddrIndex = FuncInfo->getRAIndex();
4374 if (ReturnAddrIndex == 0) {
4375 // Set up a frame object for the return address.
4376 unsigned SlotSize = RegInfo->getSlotSize();
4377 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4380 FuncInfo->setRAIndex(ReturnAddrIndex);
4383 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4386 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4387 bool hasSymbolicDisplacement) {
4388 // Offset should fit into a 32-bit immediate field.
4389 if (!isInt<32>(Offset))
4392 // If we don't have a symbolic displacement, there are no extra restrictions.
4394 if (!hasSymbolicDisplacement)
4397 // FIXME: Some tweaks might be needed for medium code model.
4398 if (M != CodeModel::Small && M != CodeModel::Kernel)
4401 // For the small code model, we assume that the last object is 16MB before the
4402 // end of the 31-bit boundary. We may also accept pretty large negative constants
4403 // knowing that all objects are in the positive half of the address space.
4404 if (M == CodeModel::Small && Offset < 16*1024*1024)
4407 // For the kernel code model we know that all objects reside in the negative
4408 // half of the 32-bit address space. We may not accept negative offsets, since
4409 // they might fall just out of range, but we may accept pretty large positive ones.
4410 if (M == CodeModel::Kernel && Offset >= 0)
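// Worked example for isOffsetSuitableForCodeModel (illustrative): with a
// symbolic displacement and the small code model, an offset of 8*1024*1024
// is accepted but 32*1024*1024 is not; with the kernel code model, any
// non-negative offset that fits in 32 bits is accepted.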
4416 /// Determines whether the callee is required to pop its own arguments.
4417 /// Callee pop is necessary to support tail calls.
4418 bool X86::isCalleePop(CallingConv::ID CallingConv,
4419 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4420 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4421 // can guarantee TCO.
4422 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4425 switch (CallingConv) {
4428 case CallingConv::X86_StdCall:
4429 case CallingConv::X86_FastCall:
4430 case CallingConv::X86_ThisCall:
4431 case CallingConv::X86_VectorCall:
4436 /// \brief Return true if the condition is an unsigned comparison operation.
4437 static bool isX86CCUnsigned(unsigned X86CC) {
4440 llvm_unreachable("Invalid integer condition!");
4456 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4457 switch (SetCCOpcode) {
4458 default: llvm_unreachable("Invalid integer condition!");
4459 case ISD::SETEQ: return X86::COND_E;
4460 case ISD::SETGT: return X86::COND_G;
4461 case ISD::SETGE: return X86::COND_GE;
4462 case ISD::SETLT: return X86::COND_L;
4463 case ISD::SETLE: return X86::COND_LE;
4464 case ISD::SETNE: return X86::COND_NE;
4465 case ISD::SETULT: return X86::COND_B;
4466 case ISD::SETUGT: return X86::COND_A;
4467 case ISD::SETULE: return X86::COND_BE;
4468 case ISD::SETUGE: return X86::COND_AE;
4472 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4473 /// condition code, returning the condition code and the LHS/RHS of the
4474 /// comparison to make.
4475 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4476 bool isFP, SDValue &LHS, SDValue &RHS,
4477 SelectionDAG &DAG) {
4479 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4480 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4481 // X > -1 -> X == 0, jump !sign.
4482 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4483 return X86::COND_NS;
4485 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4486 // X < 0 -> X == 0, jump on sign.
4489 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4491 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4492 return X86::COND_LE;
4496 return TranslateIntegerX86CC(SetCCOpcode);
4499 // First determine if it is required or is profitable to flip the operands.
4501 // If LHS is a foldable load, but RHS is not, flip the condition.
4502 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4503 !ISD::isNON_EXTLoad(RHS.getNode())) {
4504 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4505 std::swap(LHS, RHS);
4508 switch (SetCCOpcode) {
4514 std::swap(LHS, RHS);
4518 // On a floating point condition, the flags are set as follows:
4519 //  ZF  PF  CF   op
4520 //   0 | 0 | 0 | X > Y
4521 //   0 | 0 | 1 | X < Y
4522 //   1 | 0 | 0 | X == Y
4523 //   1 | 1 | 1 | unordered
4524 switch (SetCCOpcode) {
4525 default: llvm_unreachable("Condcode should be pre-legalized away");
4527 case ISD::SETEQ: return X86::COND_E;
4528 case ISD::SETOLT: // flipped
4530 case ISD::SETGT: return X86::COND_A;
4531 case ISD::SETOLE: // flipped
4533 case ISD::SETGE: return X86::COND_AE;
4534 case ISD::SETUGT: // flipped
4536 case ISD::SETLT: return X86::COND_B;
4537 case ISD::SETUGE: // flipped
4539 case ISD::SETLE: return X86::COND_BE;
4541 case ISD::SETNE: return X86::COND_NE;
4542 case ISD::SETUO: return X86::COND_P;
4543 case ISD::SETO: return X86::COND_NP;
4545 case ISD::SETUNE: return X86::COND_INVALID;
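// Worked example (illustrative): (setolt %a, %b) takes the 'flipped' path:
// the operands were swapped in the first switch, so it is handled like SETGT
// and returns X86::COND_A. JA requires CF == 0 && ZF == 0, and an unordered
// UCOMISS/UCOMISD result sets ZF = PF = CF = 1 (see the table above), so a
// NaN operand never takes the branch.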
4549 /// Is there a floating point cmov for the specific X86 condition code?
4550 /// Current x86 ISA includes the following FP cmov instructions:
4551 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4552 static bool hasFPCMov(unsigned X86CC) {
4569 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4571 MachineFunction &MF,
4572 unsigned Intrinsic) const {
4574 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4578 Info.opc = ISD::INTRINSIC_W_CHAIN;
4579 Info.flags = MachineMemOperand::MONone;
4582 switch (IntrData->Type) {
4583 case EXPAND_FROM_MEM: {
4584 Info.ptrVal = I.getArgOperand(0);
4585 Info.memVT = MVT::getVT(I.getType());
4587 Info.flags |= MachineMemOperand::MOLoad;
4590 case COMPRESS_TO_MEM: {
4591 Info.ptrVal = I.getArgOperand(0);
4592 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4594 Info.flags |= MachineMemOperand::MOStore;
4597 case TRUNCATE_TO_MEM_VI8:
4598 case TRUNCATE_TO_MEM_VI16:
4599 case TRUNCATE_TO_MEM_VI32: {
4600 Info.ptrVal = I.getArgOperand(0);
4601 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4602 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4603 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4605 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4606 ScalarVT = MVT::i16;
4607 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4608 ScalarVT = MVT::i32;
4610 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4612 Info.flags |= MachineMemOperand::MOStore;
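// Worked example (illustrative): for a TRUNCATE_TO_MEM_VI8 intrinsic whose
// stored value operand is v8i64, memVT is reported as v8i8, i.e. only the 8
// truncated bytes are considered written.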
4622 /// Returns true if the target can instruction select the
4623 /// specified FP immediate natively. If false, the legalizer will
4624 /// materialize the FP immediate as a load from a constant pool.
4625 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4626 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4627 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4633 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4634 ISD::LoadExtType ExtTy,
4636 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4637 // relocations must target a movq or addq instruction: don't let the load shrink.
4638 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4639 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4640 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4641 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4645 /// \brief Returns true if it is beneficial to convert a load of a constant
4646 /// to just the constant itself.
4647 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4649 assert(Ty->isIntegerTy());
4651 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4652 if (BitSize == 0 || BitSize > 64)
4657 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4658 // TODO: It might be a win to ease or lift this restriction, but the generic
4659 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4660 if (VT.isVector() && Subtarget.hasAVX512())
4666 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4667 unsigned Index) const {
4668 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4671 // Mask vectors support all subregister combinations and operations that
4672 // extract half of a vector.
4673 if (ResVT.getVectorElementType() == MVT::i1)
4674 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4675 (Index == ResVT.getVectorNumElements()));
4677 return (Index % ResVT.getVectorNumElements()) == 0;
4680 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4681 // Speculate cttz only if we can directly use TZCNT.
4682 return Subtarget.hasBMI();
4685 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4686 // Speculate ctlz only if we can directly use LZCNT.
4687 return Subtarget.hasLZCNT();
4690 bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT,
4691 EVT BitcastVT) const {
4692 if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1)
4695 return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT);
4698 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4699 const SelectionDAG &DAG) const {
4700 // Do not merge to float value size (128 bits) if the no-implicit-float
4701 // attribute is set.
4702 bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
4703 Attribute::NoImplicitFloat);
4706 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4707 return (MemVT.getSizeInBits() <= MaxIntSize);
4712 bool X86TargetLowering::isCtlzFast() const {
4713 return Subtarget.hasFastLZCNT();
4716 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4717 const Instruction &AndI) const {
4721 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4722 if (!Subtarget.hasBMI())
4725 // There are only 32-bit and 64-bit forms for 'andn'.
4726 EVT VT = Y.getValueType();
4727 if (VT != MVT::i32 && VT != MVT::i64)
4733 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4734 MVT VT = MVT::getIntegerVT(NumBits);
4735 if (isTypeLegal(VT))
4738 // PMOVMSKB can handle this.
4739 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4742 // VPMOVMSKB can handle this.
4743 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4746 // TODO: Allow 64-bit type for 32-bit target.
4747 // TODO: 512-bit types should be allowed, but make sure that those
4748 // cases are handled in combineVectorSizedSetCCEquality().
4750 return MVT::INVALID_SIMPLE_VALUE_TYPE;
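// Worked example (illustrative): a 256-bit equality test (e.g. a 32-byte
// memcmp() == 0) on an AVX2 target reports MVT::v32i8 here and is then
// lowered by combineVectorSizedSetCCEquality() as VPCMPEQB + VPMOVMSKB
// followed by a compare of the mask against all-ones.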
4753 /// Val is the undef sentinel value or equal to the specified value.
4754 static bool isUndefOrEqual(int Val, int CmpVal) {
4755 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4758 /// Val is either the undef or zero sentinel value.
4759 static bool isUndefOrZero(int Val) {
4760 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4763 /// Return true if every element in Mask, beginning
4764 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4765 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4766 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4767 if (Mask[i] != SM_SentinelUndef)
4772 /// Return true if Val is undef or if its value falls within the
4773 /// specified range [Low, Hi).
4774 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4775 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4778 /// Return true if every element in Mask is undef or if its value
4779 /// falls within the specified range [Low, Hi).
4780 static bool isUndefOrInRange(ArrayRef<int> Mask,
4783 if (!isUndefOrInRange(M, Low, Hi))
4788 /// Return true if Val is undef, zero or if its value falls within the
4789 /// specified range [Low, Hi).
4790 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4791 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4794 /// Return true if every element in Mask is undef, zero or if its value
4795 /// falls within the specified range [Low, Hi).
4796 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4798 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4803 /// Return true if every element in Mask, beginning
4804 /// from position Pos and ending in Pos+Size, falls within the specified
4805 /// sequential range [Low, Low+Size), or is undef.
4806 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4807 unsigned Pos, unsigned Size, int Low) {
4808 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4809 if (!isUndefOrEqual(Mask[i], Low))
4814 /// Return true if every element in Mask, beginning
4815 /// from position Pos and ending in Pos+Size, falls within the specified
4816 /// sequential range [Low, Low+Size), or is undef or zero.
4817 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4818 unsigned Size, int Low) {
4819 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4820 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4825 /// Return true if every element in Mask, beginning
4826 /// from position Pos and ending in Pos+Size is undef or is zero.
4827 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4829 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4830 if (!isUndefOrZero(Mask[i]))
4835 /// \brief Helper function to test whether a shuffle mask could be
4836 /// simplified by widening the elements being shuffled.
4838 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4839 /// leaves it in an unspecified state.
4841 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4842 /// shuffle masks. The latter have the special property of a '-2' representing
4843 /// a zero-ed lane of a vector.
4844 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4845 SmallVectorImpl<int> &WidenedMask) {
4846 WidenedMask.assign(Mask.size() / 2, 0);
4847 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4849 int M1 = Mask[i + 1];
4851 // If both elements are undef, it's trivial.
4852 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4853 WidenedMask[i / 2] = SM_SentinelUndef;
4857 // Check for an undef mask and a mask value properly aligned to fit with
4858 // a pair of values. If we find such a case, use the non-undef mask's value.
4859 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4860 WidenedMask[i / 2] = M1 / 2;
4863 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4864 WidenedMask[i / 2] = M0 / 2;
4868 // When zeroing, we need to spread the zeroing across both lanes to widen.
4869 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4870 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4871 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4872 WidenedMask[i / 2] = SM_SentinelZero;
4878 // Finally check if the two mask values are adjacent and aligned with
4880 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4881 WidenedMask[i / 2] = M0 / 2;
4885 // Otherwise we can't safely widen the elements used in this shuffle.
4888 assert(WidenedMask.size() == Mask.size() / 2 &&
4889 "Incorrect size of mask after widening the elements!");
4894 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4895 bool X86::isZeroNode(SDValue Elt) {
4896 return isNullConstant(Elt) || isNullFPConstant(Elt);
4899 // Build a vector of constants.
4900 // Use an UNDEF node if MaskElt == -1.
4901 // Split 64-bit constants in 32-bit mode.
4902 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4903 const SDLoc &dl, bool IsMask = false) {
4905 SmallVector<SDValue, 32> Ops;
4908 MVT ConstVecVT = VT;
4909 unsigned NumElts = VT.getVectorNumElements();
4910 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4911 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4912 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4916 MVT EltVT = ConstVecVT.getVectorElementType();
4917 for (unsigned i = 0; i < NumElts; ++i) {
4918 bool IsUndef = Values[i] < 0 && IsMask;
4919 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4920 DAG.getConstant(Values[i], dl, EltVT);
4921 Ops.push_back(OpNode);
4923 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4924 DAG.getConstant(0, dl, EltVT));
4926 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4928 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4932 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4933 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4934 assert(Bits.size() == Undefs.getBitWidth() &&
4935 "Unequal constant and undef arrays");
4936 SmallVector<SDValue, 32> Ops;
4939 MVT ConstVecVT = VT;
4940 unsigned NumElts = VT.getVectorNumElements();
4941 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4942 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4943 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4947 MVT EltVT = ConstVecVT.getVectorElementType();
4948 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4950 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4953 const APInt &V = Bits[i];
4954 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4956 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4957 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4958 } else if (EltVT == MVT::f32) {
4959 APFloat FV(APFloat::IEEEsingle(), V);
4960 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4961 } else if (EltVT == MVT::f64) {
4962 APFloat FV(APFloat::IEEEdouble(), V);
4963 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4965 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4969 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4970 return DAG.getBitcast(VT, ConstsNode);
4973 /// Returns a vector of specified type with all zero elements.
4974 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4975 SelectionDAG &DAG, const SDLoc &dl) {
4976 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4977 VT.getVectorElementType() == MVT::i1) &&
4978 "Unexpected vector type");
4980 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4981 // type. This ensures they get CSE'd. But if the integer type is not
4982 // available, use a floating-point +0.0 instead.
4984 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4985 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4986 } else if (VT.getVectorElementType() == MVT::i1) {
4987 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4988 "Unexpected vector type");
4989 Vec = DAG.getConstant(0, dl, VT);
4991 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4992 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4994 return DAG.getBitcast(VT, Vec);
4997 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4998 const SDLoc &dl, unsigned vectorWidth) {
4999 EVT VT = Vec.getValueType();
5000 EVT ElVT = VT.getVectorElementType();
5001 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5002 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5003 VT.getVectorNumElements()/Factor);
5005 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR for them.
5006 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5007 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5009 // This is the index of the first element of the vectorWidth-bit chunk
5010 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5011 IdxVal &= ~(ElemsPerChunk - 1);
5013 // If the input is a buildvector just emit a smaller one.
5014 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5015 return DAG.getBuildVector(ResultVT, dl,
5016 Vec->ops().slice(IdxVal, ElemsPerChunk));
5018 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5019 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
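// Worked example (illustrative): extracting 128 bits from a v8f32 with
// IdxVal == 5 gives ElemsPerChunk == 4, so the index is rounded down to 4 and
// the result is a v4f32 EXTRACT_SUBVECTOR starting at element 4.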
5022 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5023 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5024 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5025 /// instructions or a simple subregister reference. Idx is an index in the
5026 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5027 /// lowering EXTRACT_VECTOR_ELT operations easier.
5028 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5029 SelectionDAG &DAG, const SDLoc &dl) {
5030 assert((Vec.getValueType().is256BitVector() ||
5031 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5032 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5035 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5036 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5037 SelectionDAG &DAG, const SDLoc &dl) {
5038 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5039 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5042 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5043 SelectionDAG &DAG, const SDLoc &dl,
5044 unsigned vectorWidth) {
5045 assert((vectorWidth == 128 || vectorWidth == 256) &&
5046 "Unsupported vector width");
5047 // Inserting an UNDEF subvector simply yields Result.
5050 EVT VT = Vec.getValueType();
5051 EVT ElVT = VT.getVectorElementType();
5052 EVT ResultVT = Result.getValueType();
5054 // Insert the relevant vectorWidth bits.
5055 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5056 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5058 // This is the index of the first element of the vectorWidth-bit chunk
5059 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
5060 IdxVal &= ~(ElemsPerChunk - 1);
5062 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5063 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5066 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5067 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5068 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5069 /// simple superregister reference. Idx is an index in the 128 bits
5070 /// we want. It need not be aligned to a 128-bit boundary. That makes
5071 /// lowering INSERT_VECTOR_ELT operations easier.
5072 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5073 SelectionDAG &DAG, const SDLoc &dl) {
5074 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5075 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5078 /// Widen a vector to a larger size with the same scalar type, with the new
5079 /// elements either zero or undef.
5080 static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements,
5081 const X86Subtarget &Subtarget, SelectionDAG &DAG,
5083 assert(Vec.getValueSizeInBits() < VT.getSizeInBits() &&
5084 Vec.getValueType().getScalarType() == VT.getScalarType() &&
5085 "Unsupported vector widening type");
5086 SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl)
5088 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec,
5089 DAG.getIntPtrConstant(0, dl));
5092 // Helper for splitting the operands of an operation into legal target-sized
5093 // pieces and applying a function to each piece.
5094 // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
5095 // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for
5096 // deciding if/how to split Ops. Ops elements do *not* have to be of type VT.
5097 // The argument Builder is a function that will be applied on each split part:
5098 // SDValue Builder(SelectionDAG &DAG, const SDLoc &DL, ArrayRef<SDValue> Ops)
5099 template <typename F>
5100 SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget,
5101 const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops,
5103 assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2");
5104 unsigned NumSubs = 1;
5105 if (Subtarget.useBWIRegs()) {
5106 if (VT.getSizeInBits() > 512) {
5107 NumSubs = VT.getSizeInBits() / 512;
5108 assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size");
5110 } else if (Subtarget.hasAVX2()) {
5111 if (VT.getSizeInBits() > 256) {
5112 NumSubs = VT.getSizeInBits() / 256;
5113 assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size");
5116 if (VT.getSizeInBits() > 128) {
5117 NumSubs = VT.getSizeInBits() / 128;
5118 assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size");
5123 return Builder(DAG, DL, Ops);
5125 SmallVector<SDValue, 4> Subs;
5126 for (unsigned i = 0; i != NumSubs; ++i) {
5127 SmallVector<SDValue, 2> SubOps;
5128 for (SDValue Op : Ops) {
5129 EVT OpVT = Op.getValueType();
5130 unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs;
5131 unsigned SizeSub = OpVT.getSizeInBits() / NumSubs;
5132 SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub));
5134 Subs.push_back(Builder(DAG, DL, SubOps));
5136 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs);
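// Usage sketch (illustrative only; N0, N1 and ResultVT are hypothetical
// values from a caller):
//   auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
//                            ArrayRef<SDValue> Ops) {
//     MVT OpVT = Ops[0].getSimpleValueType();
//     MVT ResVT = MVT::getVectorVT(MVT::i32, OpVT.getVectorNumElements() / 2);
//     return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
//   };
//   SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, ResultVT, {N0, N1},
//                                   PMADDWDBuilder);
// Each legal-width piece of N0/N1 is extracted, passed to the builder, and the
// per-piece results are concatenated back into ResultVT.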
5139 // Return true if the instruction zeroes the unused upper part of the
5140 // destination and accepts a mask.
5141 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5147 case X86ISD::CMPM_RND:
5152 /// Insert i1-subvector to i1-vector.
5153 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5154 const X86Subtarget &Subtarget) {
5157 SDValue Vec = Op.getOperand(0);
5158 SDValue SubVec = Op.getOperand(1);
5159 SDValue Idx = Op.getOperand(2);
5161 if (!isa<ConstantSDNode>(Idx))
5164 // Inserting undef is a nop. We can just return the original vector.
5165 if (SubVec.isUndef())
5168 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5169 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5172 MVT OpVT = Op.getSimpleValueType();
5173 unsigned NumElems = OpVT.getVectorNumElements();
5175 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5177 // Extend to natively supported kshift.
5178 MVT WideOpVT = OpVT;
5179 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
5180 WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5182 // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
5184 if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
5185 // May need to promote to a legal type.
5186 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5187 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5189 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5192 MVT SubVecVT = SubVec.getSimpleValueType();
5193 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5195 assert(IdxVal + SubVecNumElems <= NumElems &&
5196 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5197 "Unexpected index value in INSERT_SUBVECTOR");
5199 SDValue Undef = DAG.getUNDEF(WideOpVT);
5202 // Zero the lower bits of Vec.
5203 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5204 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
5206 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5207 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5208 // Merge them together; SubVec should be zero extended.
5209 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5210 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5212 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5213 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5216 SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5217 Undef, SubVec, ZeroIdx);
5219 if (Vec.isUndef()) {
5220 assert(IdxVal != 0 && "Unexpected index");
5221 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5222 DAG.getConstant(IdxVal, dl, MVT::i8));
5223 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5226 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5227 assert(IdxVal != 0 && "Unexpected index");
5228 NumElems = WideOpVT.getVectorNumElements();
5229 unsigned ShiftLeft = NumElems - SubVecNumElems;
5230 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5231 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5232 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5233 if (ShiftRight != 0)
5234 SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
5235 DAG.getConstant(ShiftRight, dl, MVT::i8));
5236 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
5239 // Simple case when we put the subvector in the upper part.
5240 if (IdxVal + SubVecNumElems == NumElems) {
5241 SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
5242 DAG.getConstant(IdxVal, dl, MVT::i8));
5243 if (SubVecNumElems * 2 == NumElems) {
5244 // Special case: use a legal zero-extending insert_subvector. This allows
5245 // isel to optimize when bits are known zero.
5246 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
5247 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5248 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5251 // Otherwise use explicit shifts to zero the bits.
5252 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5253 Undef, Vec, ZeroIdx);
5254 NumElems = WideOpVT.getVectorNumElements();
5255 SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
5256 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5257 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5259 Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
5260 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
5263 // Inserting into the middle is more complicated.
5265 NumElems = WideOpVT.getVectorNumElements();
5267 // Widen the vector if needed.
5268 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5269 // Move the current value of the bits being replaced down to the lsbs.
5270 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5271 DAG.getConstant(IdxVal, dl, MVT::i8));
5272 // Xor with the new bit.
5273 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
5274 // Shift to MSB, filling bottom bits with 0.
5275 unsigned ShiftLeft = NumElems - SubVecNumElems;
5276 Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
5277 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5278 // Shift to the final position, filling upper bits with 0.
5279 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5280 Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
5281 DAG.getConstant(ShiftRight, dl, MVT::i8));
5282 // Xor with original vector leaving the new value.
5283 Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
5284 // Reduce to original width if needed.
5285 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
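// Worked example (illustrative, assuming the wide type stays v8i1): inserting
// a v2i1 SubVec at index 2 of a v8i1 Vec uses KSHIFTR(Vec, 2) to bring the
// bits being replaced to the lsbs, XORs them with SubVec to form the change,
// moves that change back to bits [2,3] with KSHIFTL by 6 and KSHIFTR by 4
// (zeroing everything else), and finally XORs with the original Vec so only
// bits 2 and 3 are updated.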
5288 static SDValue concatSubVectors(SDValue V1, SDValue V2, EVT VT,
5289 unsigned NumElems, SelectionDAG &DAG,
5290 const SDLoc &dl, unsigned VectorWidth) {
5291 SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, VectorWidth);
5292 return insertSubVector(V, V2, NumElems / 2, DAG, dl, VectorWidth);
5295 /// Returns a vector of specified type with all bits set.
5296 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5297 /// Then bitcast to their original type, ensuring they get CSE'd.
5298 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5299 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5300 "Expected a 128/256/512-bit vector type");
5302 APInt Ones = APInt::getAllOnesValue(32);
5303 unsigned NumElts = VT.getSizeInBits() / 32;
5304 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5305 return DAG.getBitcast(VT, Vec);
5308 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5309 SelectionDAG &DAG) {
5310 EVT InVT = In.getValueType();
5311 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5313 if (VT.is128BitVector() && InVT.is128BitVector())
5314 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5315 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5317 // For 256-bit vectors, we only need the lower (128-bit) input half.
5318 // For 512-bit vectors, we only need the lower input half or quarter.
5319 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5320 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5321 In = extractSubVector(In, 0, DAG, DL,
5322 std::max(128, (int)VT.getSizeInBits() / Scale));
5325 return DAG.getNode(Opc, DL, VT, In);
5328 /// Returns a vector_shuffle node for an unpackl operation.
5329 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5330 SDValue V1, SDValue V2) {
5331 SmallVector<int, 8> Mask;
5332 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5333 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5336 /// Returns a vector_shuffle node for an unpackh operation.
5337 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5338 SDValue V1, SDValue V2) {
5339 SmallVector<int, 8> Mask;
5340 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5341 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
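// Worked example (illustrative): for v4i32, getUnpackl builds the mask
// <0,4,1,5> (interleave the low halves of V1 and V2, matching PUNPCKLDQ) and
// getUnpackh builds <2,6,3,7> (interleave the high halves, matching
// PUNPCKHDQ).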
5344 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5345 /// This produces a shuffle where the low element of V2 is swizzled into the
5346 /// zero/undef vector, landing at element Idx.
5347 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5348 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5350 const X86Subtarget &Subtarget,
5351 SelectionDAG &DAG) {
5352 MVT VT = V2.getSimpleValueType();
5354 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5355 int NumElems = VT.getVectorNumElements();
5356 SmallVector<int, 16> MaskVec(NumElems);
5357 for (int i = 0; i != NumElems; ++i)
5358 // If this is the insertion idx, put the low elt of V2 here.
5359 MaskVec[i] = (i == Idx) ? NumElems : i;
5360 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5363 static SDValue peekThroughBitcasts(SDValue V) {
5364 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5365 V = V.getOperand(0);
5369 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5370 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5371 V.getOperand(0).hasOneUse())
5372 V = V.getOperand(0);
5376 static const Constant *getTargetConstantFromNode(SDValue Op) {
5377 Op = peekThroughBitcasts(Op);
5379 auto *Load = dyn_cast<LoadSDNode>(Op);
5383 SDValue Ptr = Load->getBasePtr();
5384 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5385 Ptr->getOpcode() == X86ISD::WrapperRIP)
5386 Ptr = Ptr->getOperand(0);
5388 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5389 if (!CNode || CNode->isMachineConstantPoolEntry())
5392 return dyn_cast<Constant>(CNode->getConstVal());
5395 // Extract raw constant bits from constant pools.
5396 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5398 SmallVectorImpl<APInt> &EltBits,
5399 bool AllowWholeUndefs = true,
5400 bool AllowPartialUndefs = true) {
5401 assert(EltBits.empty() && "Expected an empty EltBits vector");
5403 Op = peekThroughBitcasts(Op);
5405 EVT VT = Op.getValueType();
5406 unsigned SizeInBits = VT.getSizeInBits();
5407 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5408 unsigned NumElts = SizeInBits / EltSizeInBits;
5410 // Bitcast a source array of element bits to the target size.
5411 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5412 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5413 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5414 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5415 "Constant bit sizes don't match");
5417 // Don't split if we don't allow undef bits.
5418 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5419 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5422 // If we're already the right size, don't bother bitcasting.
5423 if (NumSrcElts == NumElts) {
5424 UndefElts = UndefSrcElts;
5425 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5429 // Extract all the undef/constant element data and pack into single bitsets.
5430 APInt UndefBits(SizeInBits, 0);
5431 APInt MaskBits(SizeInBits, 0);
5433 for (unsigned i = 0; i != NumSrcElts; ++i) {
5434 unsigned BitOffset = i * SrcEltSizeInBits;
5435 if (UndefSrcElts[i])
5436 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5437 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5440 // Split the undef/constant single bitset data into the target elements.
5441 UndefElts = APInt(NumElts, 0);
5442 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5444 for (unsigned i = 0; i != NumElts; ++i) {
5445 unsigned BitOffset = i * EltSizeInBits;
5446 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5448 // Only treat an element as UNDEF if all bits are UNDEF.
5449 if (UndefEltBits.isAllOnesValue()) {
5450 if (!AllowWholeUndefs)
5452 UndefElts.setBit(i);
5456 // If only some bits are UNDEF then treat them as zero (or bail if not
5458 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5461 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5462 EltBits[i] = Bits.getZExtValue();
5467 // Collect constant bits and insert into mask/undef bit masks.
5468 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5469 unsigned UndefBitIndex) {
5472 if (isa<UndefValue>(Cst)) {
5473 Undefs.setBit(UndefBitIndex);
5476 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5477 Mask = CInt->getValue();
5480 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5481 Mask = CFP->getValueAPF().bitcastToAPInt();
5489 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5490 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5491 return CastBitData(UndefSrcElts, SrcEltBits);
5494 // Extract scalar constant bits.
5495 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5496 APInt UndefSrcElts = APInt::getNullValue(1);
5497 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5498 return CastBitData(UndefSrcElts, SrcEltBits);
5500 if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
5501 APInt UndefSrcElts = APInt::getNullValue(1);
5502 APInt RawBits = Cst->getValueAPF().bitcastToAPInt();
5503 SmallVector<APInt, 64> SrcEltBits(1, RawBits);
5504 return CastBitData(UndefSrcElts, SrcEltBits);
5507 // Extract constant bits from build vector.
5508 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5509 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5510 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5512 APInt UndefSrcElts(NumSrcElts, 0);
5513 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5514 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5515 const SDValue &Src = Op.getOperand(i);
5516 if (Src.isUndef()) {
5517 UndefSrcElts.setBit(i);
5520 auto *Cst = cast<ConstantSDNode>(Src);
5521 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5523 return CastBitData(UndefSrcElts, SrcEltBits);
5526 // Extract constant bits from constant pool vector.
5527 if (auto *Cst = getTargetConstantFromNode(Op)) {
5528 Type *CstTy = Cst->getType();
5529 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5532 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5533 unsigned NumSrcElts = CstTy->getVectorNumElements();
5535 APInt UndefSrcElts(NumSrcElts, 0);
5536 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5537 for (unsigned i = 0; i != NumSrcElts; ++i)
5538 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5542 return CastBitData(UndefSrcElts, SrcEltBits);
5545 // Extract constant bits from a broadcasted constant pool scalar.
5546 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5547 EltSizeInBits <= VT.getScalarSizeInBits()) {
5548 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5549 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5550 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5552 APInt UndefSrcElts(NumSrcElts, 0);
5553 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5554 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5555 if (UndefSrcElts[0])
5556 UndefSrcElts.setBits(0, NumSrcElts);
5557 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5558 return CastBitData(UndefSrcElts, SrcEltBits);
5563 // Extract a rematerialized scalar constant insertion.
5564 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5565 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5566 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5567 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5568 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5570 APInt UndefSrcElts(NumSrcElts, 0);
5571 SmallVector<APInt, 64> SrcEltBits;
5572 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5573 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5574 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5575 return CastBitData(UndefSrcElts, SrcEltBits);
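// Worked example (illustrative): querying a v2i64 constant build_vector
// <0x00FF00FF00FF00FF, undef> with EltSizeInBits == 32 yields four elements;
// EltBits[0] and EltBits[1] are both 0x00FF00FF, while elements 2 and 3 are
// flagged in UndefElts (assuming AllowWholeUndefs is true).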
5581 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5582 unsigned MaskEltSizeInBits,
5583 SmallVectorImpl<uint64_t> &RawMask) {
5585 SmallVector<APInt, 64> EltBits;
5587 // Extract the raw target constant bits.
5588 // FIXME: We currently don't support UNDEF bits or mask entries.
5589 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5590 EltBits, /* AllowWholeUndefs */ false,
5591 /* AllowPartialUndefs */ false))
5594 // Insert the extracted elements into the mask.
5595 for (APInt Elt : EltBits)
5596 RawMask.push_back(Elt.getZExtValue());
5601 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5602 /// Note: This ignores saturation, so inputs must be checked first.
5603 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5605 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5606 unsigned NumElts = VT.getVectorNumElements();
5607 unsigned NumLanes = VT.getSizeInBits() / 128;
5608 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5609 unsigned Offset = Unary ? 0 : NumElts;
5611 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5612 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5613 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5614 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5615 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
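// Worked example (illustrative): for v16i8 with Unary == false this produces
// <0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30>, i.e. the low byte of each i16
// element of the first input followed by those of the second, which is what
// PACKUSWB/PACKSSWB produce when no element saturates.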
5619 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5620 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5621 /// operands in \p Ops, and returns true.
5622 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5623 /// IsUnary for shuffles which use a single input multiple times, and in those
5624 /// cases it will adjust the mask to only have indices within that single input.
5625 /// It is an error to call this with non-empty Mask/Ops vectors.
5626 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5627 SmallVectorImpl<SDValue> &Ops,
5628 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5629 unsigned NumElems = VT.getVectorNumElements();
5632 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5633 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5636 bool IsFakeUnary = false;
5637 switch(N->getOpcode()) {
5638 case X86ISD::BLENDI:
5639 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5640 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5641 ImmN = N->getOperand(N->getNumOperands()-1);
5642 DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5643 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5646 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5647 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5648 ImmN = N->getOperand(N->getNumOperands()-1);
5649 DecodeSHUFPMask(NumElems, VT.getScalarSizeInBits(),
5650 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5651 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5653 case X86ISD::INSERTPS:
5654 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5655 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5656 ImmN = N->getOperand(N->getNumOperands()-1);
5657 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5658 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5660 case X86ISD::EXTRQI:
5661 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5662 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5663 isa<ConstantSDNode>(N->getOperand(2))) {
5664 int BitLen = N->getConstantOperandVal(1);
5665 int BitIdx = N->getConstantOperandVal(2);
5666 DecodeEXTRQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5671 case X86ISD::INSERTQI:
5672 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5673 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5674 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5675 isa<ConstantSDNode>(N->getOperand(3))) {
5676 int BitLen = N->getConstantOperandVal(2);
5677 int BitIdx = N->getConstantOperandVal(3);
5678 DecodeINSERTQIMask(NumElems, VT.getScalarSizeInBits(), BitLen, BitIdx,
5680 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5683 case X86ISD::UNPCKH:
5684 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5685 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5686 DecodeUNPCKHMask(NumElems, VT.getScalarSizeInBits(), Mask);
5687 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5689 case X86ISD::UNPCKL:
5690 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5691 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5692 DecodeUNPCKLMask(NumElems, VT.getScalarSizeInBits(), Mask);
5693 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5695 case X86ISD::MOVHLPS:
5696 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5697 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5698 DecodeMOVHLPSMask(NumElems, Mask);
5699 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5701 case X86ISD::MOVLHPS:
5702 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5703 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5704 DecodeMOVLHPSMask(NumElems, Mask);
5705 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5707 case X86ISD::PALIGNR:
5708 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5709 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5710 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5711 ImmN = N->getOperand(N->getNumOperands()-1);
5712 DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5714 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5715 Ops.push_back(N->getOperand(1));
5716 Ops.push_back(N->getOperand(0));
5718 case X86ISD::VSHLDQ:
5719 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5720 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5721 ImmN = N->getOperand(N->getNumOperands() - 1);
5722 DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5726 case X86ISD::VSRLDQ:
5727 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5728 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5729 ImmN = N->getOperand(N->getNumOperands() - 1);
5730 DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5734 case X86ISD::PSHUFD:
5735 case X86ISD::VPERMILPI:
5736 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5737 ImmN = N->getOperand(N->getNumOperands()-1);
5738 DecodePSHUFMask(NumElems, VT.getScalarSizeInBits(),
5739 cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5742 case X86ISD::PSHUFHW:
5743 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5744 ImmN = N->getOperand(N->getNumOperands()-1);
5745 DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5749 case X86ISD::PSHUFLW:
5750 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5751 ImmN = N->getOperand(N->getNumOperands()-1);
5752 DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5756 case X86ISD::VZEXT_MOVL:
5757 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5758 DecodeZeroMoveLowMask(NumElems, Mask);
5761 case X86ISD::VBROADCAST: {
5762 SDValue N0 = N->getOperand(0);
5763 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5764 // add the pre-extracted value to the Ops vector.
5765 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5766 N0.getOperand(0).getValueType() == VT &&
5767 N0.getConstantOperandVal(1) == 0)
5768 Ops.push_back(N0.getOperand(0));
5770 // We only decode broadcasts of same-sized vectors, unless the broadcast
5771 // came from an extract from the original width. If we found one, we
5772 // pushed it onto the Ops vector above.
5773 if (N0.getValueType() == VT || !Ops.empty()) {
5774 DecodeVectorBroadcast(NumElems, Mask);
5780 case X86ISD::VPERMILPV: {
5781 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5783 SDValue MaskNode = N->getOperand(1);
5784 unsigned MaskEltSize = VT.getScalarSizeInBits();
5785 SmallVector<uint64_t, 32> RawMask;
5786 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5787 DecodeVPERMILPMask(NumElems, VT.getScalarSizeInBits(), RawMask, Mask);
5790 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5791 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5796 case X86ISD::PSHUFB: {
5797 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5798 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5799 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5801 SDValue MaskNode = N->getOperand(1);
5802 SmallVector<uint64_t, 32> RawMask;
5803 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5804 DecodePSHUFBMask(RawMask, Mask);
5807 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5808 DecodePSHUFBMask(C, Mask);
5813 case X86ISD::VPERMI:
5814 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5815 ImmN = N->getOperand(N->getNumOperands()-1);
5816 DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5821 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5822 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5823 DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask);
5825 case X86ISD::VPERM2X128:
5826 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5827 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5828 ImmN = N->getOperand(N->getNumOperands()-1);
5829 DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
5831 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5833 case X86ISD::MOVSLDUP:
5834 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5835 DecodeMOVSLDUPMask(NumElems, Mask);
5838 case X86ISD::MOVSHDUP:
5839 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5840 DecodeMOVSHDUPMask(NumElems, Mask);
5843 case X86ISD::MOVDDUP:
5844 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5845 DecodeMOVDDUPMask(NumElems, Mask);
5848 case X86ISD::MOVLPD:
5849 case X86ISD::MOVLPS:
5850 // Not yet implemented
5852 case X86ISD::VPERMIL2: {
5853 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5854 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5855 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5856 unsigned MaskEltSize = VT.getScalarSizeInBits();
5857 SDValue MaskNode = N->getOperand(2);
5858 SDValue CtrlNode = N->getOperand(3);
5859 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5860 unsigned CtrlImm = CtrlOp->getZExtValue();
5861 SmallVector<uint64_t, 32> RawMask;
5862 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5863 DecodeVPERMIL2PMask(NumElems, VT.getScalarSizeInBits(), CtrlImm,
5867 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5868 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5874 case X86ISD::VPPERM: {
5875 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5876 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5877 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5878 SDValue MaskNode = N->getOperand(2);
5879 SmallVector<uint64_t, 32> RawMask;
5880 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5881 DecodeVPPERMMask(RawMask, Mask);
5884 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5885 DecodeVPPERMMask(C, Mask);
5890 case X86ISD::VPERMV: {
5891 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5893 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5894 Ops.push_back(N->getOperand(1));
5895 SDValue MaskNode = N->getOperand(0);
5896 SmallVector<uint64_t, 32> RawMask;
5897 unsigned MaskEltSize = VT.getScalarSizeInBits();
5898 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5899 DecodeVPERMVMask(RawMask, Mask);
5902 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5903 DecodeVPERMVMask(C, MaskEltSize, Mask);
5908 case X86ISD::VPERMV3: {
5909 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5910 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5911 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5912 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5913 Ops.push_back(N->getOperand(0));
5914 Ops.push_back(N->getOperand(2));
5915 SDValue MaskNode = N->getOperand(1);
5916 unsigned MaskEltSize = VT.getScalarSizeInBits();
5917 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5918 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5923 case X86ISD::VPERMIV3: {
5924 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5925 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5926 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5927 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5928 Ops.push_back(N->getOperand(1));
5929 Ops.push_back(N->getOperand(2));
5930 SDValue MaskNode = N->getOperand(0);
5931 unsigned MaskEltSize = VT.getScalarSizeInBits();
5932 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5933 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5938 default: llvm_unreachable("unknown target shuffle node");
5941 // Empty mask indicates the decode failed.
5945 // Check if we're getting a shuffle mask with zero'd elements.
5946 if (!AllowSentinelZero)
5947 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5950 // If we have a fake unary shuffle, the shuffle mask is spread across two
5951 // inputs that are actually the same node. Re-map the mask to always point
5952 // into the first input.
5955 if (M >= (int)Mask.size())
5958 // If we didn't already add operands in the opcode-specific code, default to
5959 // adding 1 or 2 operands starting at 0.
5961 Ops.push_back(N->getOperand(0));
5962 if (!IsUnary || IsFakeUnary)
5963 Ops.push_back(N->getOperand(1));
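// Worked example (illustrative): a PSHUFD node on v4i32 with immediate 0x1B
// decodes to Mask = <3,2,1,0> with IsUnary == true, and Ops receives just the
// single input operand.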
5969 /// Check a target shuffle mask's inputs to see if we can set any values to
5970 /// SM_SentinelZero - this is for elements that are known to be zero
5971 /// (not just zeroable) from their inputs.
5972 /// Returns true if the target shuffle mask was decoded.
5973 static bool setTargetShuffleZeroElements(SDValue N,
5974 SmallVectorImpl<int> &Mask,
5975 SmallVectorImpl<SDValue> &Ops) {
5977 if (!isTargetShuffle(N.getOpcode()))
5980 MVT VT = N.getSimpleValueType();
5981 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5984 SDValue V1 = Ops[0];
5985 SDValue V2 = IsUnary ? V1 : Ops[1];
5987 V1 = peekThroughBitcasts(V1);
5988 V2 = peekThroughBitcasts(V2);
5990 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5991 "Illegal split of shuffle value type");
5992 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5994 // Extract known constant input data.
5995 APInt UndefSrcElts[2];
5996 SmallVector<APInt, 32> SrcEltBits[2];
5997 bool IsSrcConstant[2] = {
5998 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5999 SrcEltBits[0], true, false),
6000 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
6001 SrcEltBits[1], true, false)};
6003 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
6006 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
6010 // Determine shuffle input and normalize the mask.
6011 unsigned SrcIdx = M / Size;
6012 SDValue V = M < Size ? V1 : V2;
6015 // We are referencing an UNDEF input.
6017 Mask[i] = SM_SentinelUndef;
6021 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
6022 // TODO: We currently only set UNDEF for integer types - floats use the same
6023 // registers as vectors and many of the scalar folded loads rely on the
6024 // SCALAR_TO_VECTOR pattern.
6025 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
6026 (Size % V.getValueType().getVectorNumElements()) == 0) {
6027 int Scale = Size / V.getValueType().getVectorNumElements();
6028 int Idx = M / Scale;
6029 if (Idx != 0 && !VT.isFloatingPoint())
6030 Mask[i] = SM_SentinelUndef;
6031 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
6032 Mask[i] = SM_SentinelZero;
6036 // Attempt to extract from the source's constant bits.
6037 if (IsSrcConstant[SrcIdx]) {
6038 if (UndefSrcElts[SrcIdx][M])
6039 Mask[i] = SM_SentinelUndef;
6040 else if (SrcEltBits[SrcIdx][M] == 0)
6041 Mask[i] = SM_SentinelZero;
6045 assert(VT.getVectorNumElements() == Mask.size() &&
6046 "Different mask size from vector size!");
6050 // Attempt to decode ops that could be represented as a shuffle mask.
6051 // The decoded shuffle mask may contain a different number of elements than
6052 // the destination value type.
6053 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
6054 SmallVectorImpl<SDValue> &Ops,
6055 SelectionDAG &DAG) {
6059 MVT VT = N.getSimpleValueType();
6060 unsigned NumElts = VT.getVectorNumElements();
6061 unsigned NumSizeInBits = VT.getSizeInBits();
6062 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
6063 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
6064 "Expected byte aligned value types");
6066 unsigned Opcode = N.getOpcode();
6068 case ISD::VECTOR_SHUFFLE: {
6069 // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
6070 ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
6071 if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
6072 Mask.append(ShuffleMask.begin(), ShuffleMask.end());
6073 Ops.push_back(N.getOperand(0));
6074 Ops.push_back(N.getOperand(1));
6080 case X86ISD::ANDNP: {
6081 // Attempt to decode as a per-byte mask.
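// e.g. for (and X, (build_vector 0xFF, 0x00, 0xFF, ...)) each byte of X is
// either kept (constant byte 0xFF -> mask element i) or zeroed (constant
// byte 0x00 -> SM_SentinelZero); any other constant byte bails out.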
6082 APInt UndefElts;
6083 SmallVector<APInt, 32> EltBits;
6084 SDValue N0 = N.getOperand(0);
6085 SDValue N1 = N.getOperand(1);
6086 bool IsAndN = (X86ISD::ANDNP == Opcode);
6087 uint64_t ZeroMask = IsAndN ? 255 : 0;
6088 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
6090 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
6092 Mask.push_back(SM_SentinelUndef);
6095 uint64_t ByteBits = EltBits[i].getZExtValue();
6096 if (ByteBits != 0 && ByteBits != 255)
6098 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
6100 Ops.push_back(IsAndN ? N1 : N0);
6103 case ISD::SCALAR_TO_VECTOR: {
6104 // Match against a scalar_to_vector of an extract from a vector;
6105 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
6106 SDValue N0 = N.getOperand(0);
6109 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6110 N0.getOperand(0).getValueType() == VT) ||
6111 (N0.getOpcode() == X86ISD::PEXTRW &&
6112 N0.getOperand(0).getValueType() == MVT::v8i16) ||
6113 (N0.getOpcode() == X86ISD::PEXTRB &&
6114 N0.getOperand(0).getValueType() == MVT::v16i8)) {
6118 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
6121 SDValue SrcVec = SrcExtract.getOperand(0);
6122 EVT SrcVT = SrcVec.getValueType();
6123 unsigned NumSrcElts = SrcVT.getVectorNumElements();
6124 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
6126 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
6127 if (NumSrcElts <= SrcIdx)
6130 Ops.push_back(SrcVec);
6131 Mask.push_back(SrcIdx);
6132 Mask.append(NumZeros, SM_SentinelZero);
6133 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
6136 case X86ISD::PINSRB:
6137 case X86ISD::PINSRW: {
6138 SDValue InVec = N.getOperand(0);
6139 SDValue InScl = N.getOperand(1);
6140 SDValue InIndex = N.getOperand(2);
6141 if (!isa<ConstantSDNode>(InIndex) ||
6142 cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
6144 uint64_t InIdx = N.getConstantOperandVal(2);
6146 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
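// e.g. (v8i16 PINSRW V, 0, 2) decodes to the shuffle mask
// <0, 1, SM_SentinelZero, 3, 4, 5, 6, 7> of V.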
6147 if (X86::isZeroNode(InScl)) {
6148 Ops.push_back(InVec);
6149 for (unsigned i = 0; i != NumElts; ++i)
6150 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6154 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
6155 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
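// e.g. (v8i16 PINSRW V, (PEXTRW E, 5), 2) decodes to the two-input mask
// <0, 1, 8+5, 3, 4, 5, 6, 7>, taking lane 2 from element 5 of E and all
// other lanes from V.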
6156 unsigned ExOp =
6157 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6158 if (InScl.getOpcode() != ExOp)
6161 SDValue ExVec = InScl.getOperand(0);
6162 SDValue ExIndex = InScl.getOperand(1);
6163 if (!isa<ConstantSDNode>(ExIndex) ||
6164 cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
6166 uint64_t ExIdx = InScl.getConstantOperandVal(1);
6168 Ops.push_back(InVec);
6169 Ops.push_back(ExVec);
6170 for (unsigned i = 0; i != NumElts; ++i)
6171 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6174 case X86ISD::PACKSS:
6175 case X86ISD::PACKUS: {
6176 SDValue N0 = N.getOperand(0);
6177 SDValue N1 = N.getOperand(1);
6178 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6179 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6180 "Unexpected input value type");
6182 // If we know input saturation won't happen we can treat this
6183 // as a truncation shuffle.
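// e.g. if the upper 16 bits of every i32 lane are known zero, a v8i16
// PACKUS of two v4i32 inputs A and B behaves like the truncating shuffle
// <0, 2, 4, 6, 8, 10, 12, 14> over the i16 halves of (A, B).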
6184 if (Opcode == X86ISD::PACKSS) {
6185 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6186 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6189 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6190 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6191 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6195 bool IsUnary = (N0 == N1);
6201 createPackShuffleMask(VT, Mask, IsUnary);
6205 case X86ISD::VSRLI: {
6206 uint64_t ShiftVal = N.getConstantOperandVal(1);
6207 // Out of range bit shifts are guaranteed to be zero.
6208 if (NumBitsPerElt <= ShiftVal) {
6209 Mask.append(NumElts, SM_SentinelZero);
6213 // We can only decode 'whole byte' bit shifts as shuffles.
6214 if ((ShiftVal % 8) != 0)
6217 uint64_t ByteShift = ShiftVal / 8;
6218 unsigned NumBytes = NumSizeInBits / 8;
6219 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6220 Ops.push_back(N.getOperand(0));
6222 // Clear mask to all zeros and insert the shifted byte indices.
6223 Mask.append(NumBytes, SM_SentinelZero);
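// e.g. (v2i64 VSRLI X, 32) becomes the v16i8 shuffle
// <4, 5, 6, 7, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z> of X, and
// (v2i64 VSHLI X, 32) becomes
// <Z, Z, Z, Z, 0, 1, 2, 3, Z, Z, Z, Z, 8, 9, 10, 11>, where Z is a zero byte.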
6225 if (X86ISD::VSHLI == Opcode) {
6226 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6227 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6228 Mask[i + j] = i + j - ByteShift;
6230 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6231 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6232 Mask[i + j - ByteShift] = i + j;
6236 case ISD::ZERO_EXTEND_VECTOR_INREG:
6237 case X86ISD::VZEXT: {
6238 // TODO - add support for VPMOVZX with smaller input vector types.
6239 SDValue Src = N.getOperand(0);
6240 MVT SrcVT = Src.getSimpleValueType();
6241 if (NumSizeInBits != SrcVT.getSizeInBits())
6243 DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), VT.getScalarSizeInBits(),
6244 VT.getVectorNumElements(), Mask);
6253 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
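/// e.g. for inputs (A, B) with mask <4, 5, 6, 7>, A is never referenced, so
/// the inputs shrink to (B) and the mask is rebased to <0, 1, 2, 3>.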
6254 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6255 SmallVectorImpl<int> &Mask) {
6256 int MaskWidth = Mask.size();
6257 SmallVector<SDValue, 16> UsedInputs;
6258 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6259 int lo = UsedInputs.size() * MaskWidth;
6260 int hi = lo + MaskWidth;
6262 // Strip UNDEF input usage.
6263 if (Inputs[i].isUndef())
6265 if ((lo <= M) && (M < hi))
6266 M = SM_SentinelUndef;
6268 // Check for unused inputs.
6269 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6270 UsedInputs.push_back(Inputs[i]);
6277 Inputs = UsedInputs;
6280 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6281 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
6282 /// remaining input indices in case we now have a unary shuffle and adjust the
6283 /// inputs accordingly.
6284 /// Returns true if the target shuffle mask was decoded.
6285 static bool resolveTargetShuffleInputs(SDValue Op,
6286 SmallVectorImpl<SDValue> &Inputs,
6287 SmallVectorImpl<int> &Mask,
6288 SelectionDAG &DAG) {
6289 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6290 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6293 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6297 /// Returns the scalar element that will make up the ith
6298 /// element of the result of the vector shuffle.
6299 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6302 return SDValue(); // Limit search depth.
6304 SDValue V = SDValue(N, 0);
6305 EVT VT = V.getValueType();
6306 unsigned Opcode = V.getOpcode();
6308 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6309 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6310 int Elt = SV->getMaskElt(Index);
6313 return DAG.getUNDEF(VT.getVectorElementType());
6315 unsigned NumElems = VT.getVectorNumElements();
6316 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6317 : SV->getOperand(1);
6318 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6321 // Recurse into target specific vector shuffles to find scalars.
6322 if (isTargetShuffle(Opcode)) {
6323 MVT ShufVT = V.getSimpleValueType();
6324 MVT ShufSVT = ShufVT.getVectorElementType();
6325 int NumElems = (int)ShufVT.getVectorNumElements();
6326 SmallVector<int, 16> ShuffleMask;
6327 SmallVector<SDValue, 16> ShuffleOps;
6330 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6333 int Elt = ShuffleMask[Index];
6334 if (Elt == SM_SentinelZero)
6335 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6336 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6337 if (Elt == SM_SentinelUndef)
6338 return DAG.getUNDEF(ShufSVT);
6340 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6341 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6342 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6346 // Actual nodes that may contain scalar elements
6347 if (Opcode == ISD::BITCAST) {
6348 V = V.getOperand(0);
6349 EVT SrcVT = V.getValueType();
6350 unsigned NumElems = VT.getVectorNumElements();
6352 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6356 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6357 return (Index == 0) ? V.getOperand(0)
6358 : DAG.getUNDEF(VT.getVectorElementType());
6360 if (V.getOpcode() == ISD::BUILD_VECTOR)
6361 return V.getOperand(Index);
6366 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6367 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6368 unsigned NumNonZero, unsigned NumZero,
6370 const X86Subtarget &Subtarget) {
6371 MVT VT = Op.getSimpleValueType();
6372 unsigned NumElts = VT.getVectorNumElements();
6373 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6374 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6375 "Illegal vector insertion");
6381 for (unsigned i = 0; i < NumElts; ++i) {
6382 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6386 // If the build vector contains zeros or our first insertion is not the
6387 // first index, then insert into a zero vector to break any register
6388 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6391 if (NumZero || 0 != i)
6392 V = getZeroVector(VT, Subtarget, DAG, dl);
6394 assert(0 == i && "Expected insertion into zero-index");
6395 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6396 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6397 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6398 V = DAG.getBitcast(VT, V);
6402 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6403 DAG.getIntPtrConstant(i, dl));
6409 /// Custom lower build_vector of v16i8.
6410 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6411 unsigned NumNonZero, unsigned NumZero,
6413 const X86Subtarget &Subtarget) {
6414 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6417 // SSE4.1 - use PINSRB to insert each byte directly.
6418 if (Subtarget.hasSSE41())
6419 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6426 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
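// Bytes 2*i and 2*i+1 are merged as ((byte[2*i+1] << 8) | byte[2*i]) and
// inserted into 16-bit lane i of a v8i16 vector, which is bitcast back to
// v16i8 at the end.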
6427 for (unsigned i = 0; i < 16; ++i) {
6428 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6429 if (ThisIsNonZero && First) {
6431 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6433 V = DAG.getUNDEF(MVT::v8i16);
6438 // FIXME: Investigate extending to i32 instead of just i16.
6439 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6440 SDValue ThisElt, LastElt;
6441 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6442 if (LastIsNonZero) {
6444 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6446 if (ThisIsNonZero) {
6447 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6448 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6449 DAG.getConstant(8, dl, MVT::i8));
6451 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6457 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6458 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6459 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6460 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6461 V = DAG.getBitcast(MVT::v8i16, V);
6463 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6464 DAG.getIntPtrConstant(i / 2, dl));
6470 return DAG.getBitcast(MVT::v16i8, V);
6473 /// Custom lower build_vector of v8i16.
6474 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6475 unsigned NumNonZero, unsigned NumZero,
6477 const X86Subtarget &Subtarget) {
6478 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6481 // Use PINSRW to insert each 16-bit element directly.
6482 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6486 /// Custom lower build_vector of v4i32 or v4f32.
6487 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6488 const X86Subtarget &Subtarget) {
6489 // Find all zeroable elements.
6490 std::bitset<4> Zeroable;
6491 for (int i=0; i < 4; ++i) {
6492 SDValue Elt = Op->getOperand(i);
6493 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6495 assert(Zeroable.size() - Zeroable.count() > 1 &&
6496 "We expect at least two non-zero elements!");
6498 // We only know how to deal with build_vector nodes where elements are either
6499 // zeroable or extract_vector_elt with constant index.
6500 SDValue FirstNonZero;
6501 unsigned FirstNonZeroIdx;
6502 for (unsigned i=0; i < 4; ++i) {
6505 SDValue Elt = Op->getOperand(i);
6506 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6507 !isa<ConstantSDNode>(Elt.getOperand(1)))
6509 // Make sure that this node is extracting from a 128-bit vector.
6510 MVT VT = Elt.getOperand(0).getSimpleValueType();
6511 if (!VT.is128BitVector())
6513 if (!FirstNonZero.getNode()) {
6515 FirstNonZeroIdx = i;
6519 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6520 SDValue V1 = FirstNonZero.getOperand(0);
6521 MVT VT = V1.getSimpleValueType();
6523 // See if this build_vector can be lowered as a blend with zero.
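// e.g. (build_vector x0, 0, x2, 0), where x0/x2 are extracted from V1,
// becomes the shuffle <0, 5, 2, 7> of (V1, VZero).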
6525 unsigned EltMaskIdx, EltIdx;
6527 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6528 if (Zeroable[EltIdx]) {
6529 // The zero vector will be on the right hand side.
6530 Mask[EltIdx] = EltIdx+4;
6534 Elt = Op->getOperand(EltIdx);
6535 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6536 EltMaskIdx = Elt.getConstantOperandVal(1);
6537 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6539 Mask[EltIdx] = EltIdx;
6543 // Let the shuffle legalizer deal with blend operations.
6544 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6545 if (V1.getSimpleValueType() != VT)
6546 V1 = DAG.getBitcast(VT, V1);
6547 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6550 // See if we can lower this build_vector to an INSERTPS.
6551 if (!Subtarget.hasSSE41())
6554 SDValue V2 = Elt.getOperand(0);
6555 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6558 bool CanFold = true;
6559 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6563 SDValue Current = Op->getOperand(i);
6564 SDValue SrcVector = Current->getOperand(0);
6567 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6573 assert(V1.getNode() && "Expected at least two non-zero elements!");
6574 if (V1.getSimpleValueType() != MVT::v4f32)
6575 V1 = DAG.getBitcast(MVT::v4f32, V1);
6576 if (V2.getSimpleValueType() != MVT::v4f32)
6577 V2 = DAG.getBitcast(MVT::v4f32, V2);
6579 // Ok, we can emit an INSERTPS instruction.
6580 unsigned ZMask = Zeroable.to_ulong();
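// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4] and the zero mask in bits [3:0];
// e.g. inserting element 1 of V2 into element 2 of V1 while zeroing
// element 3 gives an immediate of 0x68.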
6582 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6583 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6585 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6586 DAG.getIntPtrConstant(InsertPSMask, DL));
6587 return DAG.getBitcast(VT, Result);
6590 /// Return a vector logical shift node.
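/// The shift is emitted as a whole-vector byte shift (VSHLDQ/VSRLDQ, i.e.
/// PSLLDQ/PSRLDQ) on a v16i8 bitcast of the source, so NumBits must be a
/// multiple of 8.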
6591 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6592 SelectionDAG &DAG, const TargetLowering &TLI,
6594 assert(VT.is128BitVector() && "Unknown type for VShift");
6595 MVT ShVT = MVT::v16i8;
6596 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6597 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6598 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6599 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6600 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6601 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6604 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6605 SelectionDAG &DAG) {
6607 // Check if the scalar load can be widened into a vector load, and if
6608 // the address is "base + cst", see if the cst can be "absorbed" into
6609 // the shuffle mask.
6610 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6611 SDValue Ptr = LD->getBasePtr();
6612 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6614 EVT PVT = LD->getValueType(0);
6615 if (PVT != MVT::i32 && PVT != MVT::f32)
6620 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6621 FI = FINode->getIndex();
6623 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6624 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6625 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6626 Offset = Ptr.getConstantOperandVal(1);
6627 Ptr = Ptr.getOperand(0);
6632 // FIXME: 256-bit vector instructions don't require a strict alignment;
6633 // improve this code to support it better.
6634 unsigned RequiredAlign = VT.getSizeInBits()/8;
6635 SDValue Chain = LD->getChain();
6636 // Make sure the stack object alignment is at least 16 or 32.
6637 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6638 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6639 if (MFI.isFixedObjectIndex(FI)) {
6640 // Can't change the alignment. FIXME: It's possible to compute
6641 // the exact stack offset and reference FI + adjust offset instead.
6642 // If someone *really* cares about this, that's the way to implement it.
6645 MFI.setObjectAlignment(FI, RequiredAlign);
6649 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6650 // Ptr + (Offset & ~15).
6653 if ((Offset % RequiredAlign) & 3)
6655 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6658 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6659 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6662 int EltNo = (Offset - StartOffset) >> 2;
6663 unsigned NumElems = VT.getVectorNumElements();
6665 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6666 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6667 LD->getPointerInfo().getWithOffset(StartOffset));
6669 SmallVector<int, 8> Mask(NumElems, EltNo);
6671 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6677 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6678 /// elements can be replaced by a single large load which has the same value as
6679 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6681 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6682 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6683 const SDLoc &DL, SelectionDAG &DAG,
6684 const X86Subtarget &Subtarget,
6685 bool isAfterLegalize) {
6686 unsigned NumElems = Elts.size();
6688 int LastLoadedElt = -1;
6689 SmallBitVector LoadMask(NumElems, false);
6690 SmallBitVector ZeroMask(NumElems, false);
6691 SmallBitVector UndefMask(NumElems, false);
6693 // For each element in the initializer, see if we've found a load, zero or an
6694 // undef.
6695 for (unsigned i = 0; i < NumElems; ++i) {
6696 SDValue Elt = peekThroughBitcasts(Elts[i]);
6701 UndefMask[i] = true;
6702 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6704 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6707 // Each loaded element must be the correct fractional portion of the
6708 // requested vector load.
6709 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6714 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6715 "Incomplete element masks");
6717 // Handle Special Cases - all undef or undef/zero.
6718 if (UndefMask.count() == NumElems)
6719 return DAG.getUNDEF(VT);
6721 // FIXME: Should we return this as a BUILD_VECTOR instead?
6722 if ((ZeroMask | UndefMask).count() == NumElems)
6723 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6724 : DAG.getConstantFP(0.0, DL, VT);
6726 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6727 int FirstLoadedElt = LoadMask.find_first();
6728 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6729 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6730 EVT LDBaseVT = EltBase.getValueType();
6732 // Consecutive loads can contain UNDEFs but not ZERO elements.
6733 // Consecutive loads with UNDEF and ZERO elements require
6734 // an additional shuffle stage to clear the ZERO elements.
6735 bool IsConsecutiveLoad = true;
6736 bool IsConsecutiveLoadWithZeros = true;
6737 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6739 SDValue Elt = peekThroughBitcasts(Elts[i]);
6740 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6741 if (!DAG.areNonVolatileConsecutiveLoads(
6742 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6743 i - FirstLoadedElt)) {
6744 IsConsecutiveLoad = false;
6745 IsConsecutiveLoadWithZeros = false;
6748 } else if (ZeroMask[i]) {
6749 IsConsecutiveLoad = false;
6753 SmallVector<LoadSDNode *, 8> Loads;
6754 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6756 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6758 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6759 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6760 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6761 "Cannot merge volatile loads.");
6763 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6764 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6765 for (auto *LD : Loads)
6766 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6770 // LOAD - all consecutive load/undefs (must start/end with a load).
6771 // If we have found an entire vector of loads and undefs, then return a large
6772 // load of the entire vector width starting at the base pointer.
6773 // If the vector contains zeros, then attempt to shuffle those elements.
6774 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6775 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6776 assert(LDBase && "Did not find base load for merging consecutive loads");
6777 EVT EltVT = LDBase->getValueType(0);
6778 // Ensure that the input vector size for the merged loads matches the
6779 // cumulative size of the input elements.
6780 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6783 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6786 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6787 // will lower to regular temporal loads and use the cache.
6788 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6789 VT.is256BitVector() && !Subtarget.hasInt256())
6792 if (IsConsecutiveLoad)
6793 return CreateLoad(VT, LDBase);
6795 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6796 // vector and a zero vector to clear out the zero elements.
6797 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6798 SmallVector<int, 4> ClearMask(NumElems, -1);
6799 for (unsigned i = 0; i < NumElems; ++i) {
6801 ClearMask[i] = i + NumElems;
6802 else if (LoadMask[i])
6805 SDValue V = CreateLoad(VT, LDBase);
6806 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6807 : DAG.getConstantFP(0.0, DL, VT);
6808 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6813 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6815 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
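// e.g. for v4i32 <load i32 *a, zero, zero, zero> the whole build_vector can
// be lowered as a 32-bit X86ISD::VZEXT_LOAD of *a that zero-fills the
// remaining lanes.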
6816 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6817 (LoadSize == 32 || LoadSize == 64) &&
6818 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6819 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6820 : MVT::getIntegerVT(LoadSize);
6821 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6822 if (TLI.isTypeLegal(VecVT)) {
6823 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6824 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6826 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6827 LDBase->getPointerInfo(),
6828 LDBase->getAlignment(),
6829 MachineMemOperand::MOLoad);
6830 for (auto *LD : Loads)
6831 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6832 return DAG.getBitcast(VT, ResNode);
6839 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6840 unsigned SplatBitSize, LLVMContext &C) {
6841 unsigned ScalarSize = VT.getScalarSizeInBits();
6842 unsigned NumElm = SplatBitSize / ScalarSize;
6844 SmallVector<Constant *, 32> ConstantVec;
6845 for (unsigned i = 0; i < NumElm; i++) {
6846 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6848 if (VT.isFloatingPoint()) {
6849 if (ScalarSize == 32) {
6850 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6852 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6853 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6856 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6857 ConstantVec.push_back(Const);
6859 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6862 static bool isUseOfShuffle(SDNode *N) {
6863 for (auto *U : N->uses()) {
6864 if (isTargetShuffle(U->getOpcode()))
6866 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6867 return isUseOfShuffle(U);
6872 // Check if the current node of a build vector is a zero-extended vector.
6873 // If so, return the value extended.
6874 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6875 // NumElt - return the number of zero-extended identical values.
6876 // EltType - return the type of the value including the zero extension.
6877 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6878 unsigned &NumElt, MVT &EltType) {
6879 SDValue ExtValue = Op->getOperand(0);
6880 unsigned NumElts = Op->getNumOperands();
6881 unsigned Delta = NumElts;
6883 for (unsigned i = 1; i < NumElts; i++) {
6884 if (Op->getOperand(i) == ExtValue) {
6888 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6891 if (!isPowerOf2_32(Delta) || Delta == 1)
6894 for (unsigned i = Delta; i < NumElts; i++) {
6895 if (i % Delta == 0) {
6896 if (Op->getOperand(i) != ExtValue)
6898 } else if (!(isNullConstant(Op->getOperand(i)) ||
6899 Op->getOperand(i).isUndef()))
6902 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6903 unsigned ExtVTSize = EltSize * Delta;
6904 EltType = MVT::getIntegerVT(ExtVTSize);
6905 NumElt = NumElts / Delta;
6909 /// Attempt to use the vbroadcast instruction to generate a splat value
6910 /// from a splat BUILD_VECTOR which uses:
6911 /// a. A single scalar load, or a constant.
6912 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6914 /// The VBROADCAST node is returned when a pattern is found,
6915 /// or SDValue() otherwise.
6916 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6917 const X86Subtarget &Subtarget,
6918 SelectionDAG &DAG) {
6919 // VBROADCAST requires AVX.
6920 // TODO: Splats could be generated for non-AVX CPUs using SSE
6921 // instructions, but there's less potential gain for only 128-bit vectors.
6922 if (!Subtarget.hasAVX())
6925 MVT VT = BVOp->getSimpleValueType(0);
6928 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6929 "Unsupported vector type for broadcast.");
6931 BitVector UndefElements;
6932 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6934 // Attempt to use VBROADCASTM
6935 // From this pattern:
6936 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6937 // b. t1 = (build_vector t0 t0)
6939 // Create (VBROADCASTM v2i1 X)
6940 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6941 MVT EltType = VT.getScalarType();
6942 unsigned NumElts = VT.getVectorNumElements();
6944 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6945 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6946 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6947 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6949 BOperand = ZeroExtended.getOperand(0);
6951 BOperand = Ld.getOperand(0).getOperand(0);
6952 if (BOperand.getValueType().isVector() &&
6953 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6954 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6955 NumElts == 8)) || // for broadcastmb2q
6956 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6957 NumElts == 16))) { // for broadcastmw2d
6959 DAG.getNode(X86ISD::VBROADCASTM, dl,
6960 MVT::getVectorVT(EltType, NumElts), BOperand);
6961 return DAG.getBitcast(VT, Brdcst);
6967 // We need a splat of a single value to use broadcast, and it doesn't
6968 // make any sense if the value is only in one element of the vector.
6969 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6970 APInt SplatValue, Undef;
6971 unsigned SplatBitSize;
6973 // Check if this is a repeated constant pattern suitable for broadcasting.
6974 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6975 SplatBitSize > VT.getScalarSizeInBits() &&
6976 SplatBitSize < VT.getSizeInBits()) {
6977 // Avoid replacing with broadcast when it's a use of a shuffle
6978 // instruction to preserve the present custom lowering of shuffles.
6979 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6981 // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6982 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6983 LLVMContext *Ctx = DAG.getContext();
6984 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6985 if (Subtarget.hasAVX()) {
6986 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6987 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6988 // Splatted value can fit in one INTEGER constant in constant pool.
6989 // Load the constant and broadcast it.
6990 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6991 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6992 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6993 SDValue CP = DAG.getConstantPool(C, PVT);
6994 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6996 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6998 CVT, dl, DAG.getEntryNode(), CP,
6999 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7001 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7002 MVT::getVectorVT(CVT, Repeat), Ld);
7003 return DAG.getBitcast(VT, Brdcst);
7004 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
7005 // Splatted value can fit in one FLOAT constant in constant pool.
7006 // Load the constant and broadcast it.
7007 // AVX has support for 32- and 64-bit broadcasts for floats only.
7008 // There is no 64-bit integer broadcast on a 32-bit subtarget.
7009 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
7010 // Lower the splat via APFloat directly, to avoid any conversion.
7013 ? ConstantFP::get(*Ctx,
7014 APFloat(APFloat::IEEEsingle(), SplatValue))
7015 : ConstantFP::get(*Ctx,
7016 APFloat(APFloat::IEEEdouble(), SplatValue));
7017 SDValue CP = DAG.getConstantPool(C, PVT);
7018 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
7020 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7022 CVT, dl, DAG.getEntryNode(), CP,
7023 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7025 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
7026 MVT::getVectorVT(CVT, Repeat), Ld);
7027 return DAG.getBitcast(VT, Brdcst);
7028 } else if (SplatBitSize > 64) {
7029 // Load the vector of constants and broadcast it.
7030 MVT CVT = VT.getScalarType();
7031 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
7033 SDValue VCP = DAG.getConstantPool(VecC, PVT);
7034 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
7035 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
7037 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
7038 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7040 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
7041 return DAG.getBitcast(VT, Brdcst);
7048 bool ConstSplatVal =
7049 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
7051 // Make sure that all of the users of a non-constant load are from the
7052 // BUILD_VECTOR node.
7053 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
7056 unsigned ScalarSize = Ld.getValueSizeInBits();
7057 bool IsGE256 = (VT.getSizeInBits() >= 256);
7059 // When optimizing for size, generate up to 5 extra bytes for a broadcast
7060 // instruction to save 8 or more bytes of constant pool data.
7061 // TODO: If multiple splats are generated to load the same constant,
7062 // it may be detrimental to overall size. There needs to be a way to detect
7063 // that condition to know if this is truly a size win.
7064 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
7066 // Handle broadcasting a single constant scalar from the constant pool
7067 // into a vector.
7068 // On Sandybridge (no AVX2), it is still better to load a constant vector
7069 // from the constant pool and not to broadcast it from a scalar.
7070 // But override that restriction when optimizing for size.
7071 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
7072 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
7073 EVT CVT = Ld.getValueType();
7074 assert(!CVT.isVector() && "Must not broadcast a vector type");
7076 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
7077 // For size optimization, also splat v2f64 and v2i64, and for size opt
7078 // with AVX2, also splat i8 and i16.
7079 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
7080 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7081 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
7082 const Constant *C = nullptr;
7083 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
7084 C = CI->getConstantIntValue();
7085 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
7086 C = CF->getConstantFPValue();
7088 assert(C && "Invalid constant type");
7090 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7092 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
7093 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
7095 CVT, dl, DAG.getEntryNode(), CP,
7096 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
7099 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7103 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
7105 // Handle AVX2 in-register broadcasts.
7106 if (!IsLoad && Subtarget.hasInt256() &&
7107 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
7108 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7110 // The scalar source must be a normal load.
7114 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
7115 (Subtarget.hasVLX() && ScalarSize == 64))
7116 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7118 // The integer check is needed for the 64-bit into 128-bit broadcast, so it
7119 // doesn't match double, since there is no vbroadcastsd xmm.
7120 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
7121 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
7122 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
7125 // Unsupported broadcast.
7129 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
7130 /// underlying vector and index.
7132 /// Modifies \p ExtractedFromVec to the real vector and returns the real
7133 /// index.
7134 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
7136 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
7137 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
7140 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
7141 // lowered this:
7142 // (extract_vector_elt (v8f32 %1), Constant<6>)
7143 // to something like:
7144 // (extract_vector_elt (vector_shuffle<2,u,u,u>
7145 // (extract_subvector (v8f32 %0), Constant<4>),
7146 // undef), ...)
7148 // In this case the vector is the extract_subvector expression and the index
7149 // is 2, as specified by the shuffle.
7150 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
7151 SDValue ShuffleVec = SVOp->getOperand(0);
7152 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
7153 assert(ShuffleVecVT.getVectorElementType() ==
7154 ExtractedFromVec.getSimpleValueType().getVectorElementType());
7156 int ShuffleIdx = SVOp->getMaskElt(Idx);
7157 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
7158 ExtractedFromVec = ShuffleVec;
7164 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
7165 MVT VT = Op.getSimpleValueType();
7167 // Skip if insert_vec_elt is not supported.
7168 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7169 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
7173 unsigned NumElems = Op.getNumOperands();
7177 SmallVector<unsigned, 4> InsertIndices;
7178 SmallVector<int, 8> Mask(NumElems, -1);
7180 for (unsigned i = 0; i != NumElems; ++i) {
7181 unsigned Opc = Op.getOperand(i).getOpcode();
7183 if (Opc == ISD::UNDEF)
7186 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7187 // Quit if more than 1 element needs inserting.
7188 if (InsertIndices.size() > 1)
7191 InsertIndices.push_back(i);
7195 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7196 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7198 // Quit if non-constant index.
7199 if (!isa<ConstantSDNode>(ExtIdx))
7201 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7203 // Quit if extracted from vector of different type.
7204 if (ExtractedFromVec.getValueType() != VT)
7207 if (!VecIn1.getNode())
7208 VecIn1 = ExtractedFromVec;
7209 else if (VecIn1 != ExtractedFromVec) {
7210 if (!VecIn2.getNode())
7211 VecIn2 = ExtractedFromVec;
7212 else if (VecIn2 != ExtractedFromVec)
7213 // Quit if more than 2 vectors to shuffle
7217 if (ExtractedFromVec == VecIn1)
7219 else if (ExtractedFromVec == VecIn2)
7220 Mask[i] = Idx + NumElems;
7223 if (!VecIn1.getNode())
7226 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7227 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7229 for (unsigned Idx : InsertIndices)
7230 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7231 DAG.getIntPtrConstant(Idx, DL));
7236 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7237 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7238 Op.getScalarValueSizeInBits() == 1 &&
7239 "Can not convert non-constant vector");
7240 uint64_t Immediate = 0;
7241 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7242 SDValue In = Op.getOperand(idx);
7244 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7247 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7248 return DAG.getConstant(Immediate, dl, VT);
7250 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7251 static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
7252 const X86Subtarget &Subtarget) {
7254 MVT VT = Op.getSimpleValueType();
7255 assert((VT.getVectorElementType() == MVT::i1) &&
7256 "Unexpected type in LowerBUILD_VECTORvXi1!");
7259 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7262 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7265 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7266 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7267 // Split the pieces.
7269 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7271 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7272 // We have to manually lower both halves so getNode doesn't try to
7273 // reassemble the build_vector.
7274 Lower = LowerBUILD_VECTORvXi1(Lower, DAG, Subtarget);
7275 Upper = LowerBUILD_VECTORvXi1(Upper, DAG, Subtarget);
7276 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7278 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7279 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7280 return DAG.getBitcast(VT, Imm);
7281 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7282 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7283 DAG.getIntPtrConstant(0, dl));
7286 // Vector has one or more non-const elements
7287 uint64_t Immediate = 0;
7288 SmallVector<unsigned, 16> NonConstIdx;
7289 bool IsSplat = true;
7290 bool HasConstElts = false;
7292 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7293 SDValue In = Op.getOperand(idx);
7296 if (!isa<ConstantSDNode>(In))
7297 NonConstIdx.push_back(idx);
7299 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7300 HasConstElts = true;
7304 else if (In != Op.getOperand(SplatIdx))
7308 // For a splat, use (select i1 splat_elt, all-ones, all-zeroes).
7310 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7311 DAG.getConstant(1, dl, VT),
7312 DAG.getConstant(0, dl, VT));
7314 // Insert elements one by one.
7318 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7319 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7321 else if (HasConstElts)
7322 Imm = DAG.getConstant(0, dl, VT);
7324 Imm = DAG.getUNDEF(VT);
7325 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7326 DstVec = DAG.getBitcast(VT, Imm);
7328 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7329 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7330 DAG.getIntPtrConstant(0, dl));
7333 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7334 unsigned InsertIdx = NonConstIdx[i];
7335 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7336 Op.getOperand(InsertIdx),
7337 DAG.getIntPtrConstant(InsertIdx, dl));
7342 /// \brief Return true if \p N implements a horizontal binop and return the
7343 /// operands for the horizontal binop into V0 and V1.
7345 /// This is a helper function of LowerToHorizontalOp().
7346 /// This function checks that the build_vector \p N in input implements a
7347 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7348 /// operation to match.
7349 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7350 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7351 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7352 /// arithmetic sub.
7354 /// This function only analyzes elements of \p N whose indices are
7355 /// in range [BaseIdx, LastIdx).
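/// For example, the v4f32 build_vector
///   ((fadd A0, A1), (fadd A2, A3), (fadd B0, B1), (fadd B2, B3)),
/// where Ai/Bi are extract_vector_elts of A and B, matches with V0 = A and
/// V1 = B.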
7356 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7358 unsigned BaseIdx, unsigned LastIdx,
7359 SDValue &V0, SDValue &V1) {
7360 EVT VT = N->getValueType(0);
7362 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7363 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7364 "Invalid Vector in input!");
7366 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7367 bool CanFold = true;
7368 unsigned ExpectedVExtractIdx = BaseIdx;
7369 unsigned NumElts = LastIdx - BaseIdx;
7370 V0 = DAG.getUNDEF(VT);
7371 V1 = DAG.getUNDEF(VT);
7373 // Check if N implements a horizontal binop.
7374 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7375 SDValue Op = N->getOperand(i + BaseIdx);
7378 if (Op->isUndef()) {
7379 // Update the expected vector extract index.
7380 if (i * 2 == NumElts)
7381 ExpectedVExtractIdx = BaseIdx;
7382 ExpectedVExtractIdx += 2;
7386 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7391 SDValue Op0 = Op.getOperand(0);
7392 SDValue Op1 = Op.getOperand(1);
7394 // Try to match the following pattern:
7395 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7396 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7397 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7398 Op0.getOperand(0) == Op1.getOperand(0) &&
7399 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7400 isa<ConstantSDNode>(Op1.getOperand(1)));
7404 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7405 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7407 if (i * 2 < NumElts) {
7409 V0 = Op0.getOperand(0);
7410 if (V0.getValueType() != VT)
7415 V1 = Op0.getOperand(0);
7416 if (V1.getValueType() != VT)
7419 if (i * 2 == NumElts)
7420 ExpectedVExtractIdx = BaseIdx;
7423 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7424 if (I0 == ExpectedVExtractIdx)
7425 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7426 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7427 // Try to match the following dag sequence:
7428 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7429 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7433 ExpectedVExtractIdx += 2;
7439 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7440 /// a concat_vector.
7442 /// This is a helper function of LowerToHorizontalOp().
7443 /// This function expects two 256-bit vectors called V0 and V1.
7444 /// At first, each vector is split into two separate 128-bit vectors.
7445 /// Then, the resulting 128-bit vectors are used to implement two
7446 /// horizontal binary operations.
7448 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7450 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7451 /// the two new horizontal binop.
7452 /// When Mode is set, the first horizontal binop dag node would take as input
7453 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7454 /// horizontal binop dag node would take as input the lower 128-bit of V1
7455 /// and the upper 128-bit of V1.
7457 /// HADD V0_LO, V0_HI
7458 /// HADD V1_LO, V1_HI
7460 /// Otherwise, the first horizontal binop dag node takes as input the lower
7461 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7462 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7464 /// HADD V0_LO, V1_LO
7465 /// HADD V0_HI, V1_HI
7467 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7468 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7469 /// the upper 128-bits of the result.
7470 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7471 const SDLoc &DL, SelectionDAG &DAG,
7472 unsigned X86Opcode, bool Mode,
7473 bool isUndefLO, bool isUndefHI) {
7474 MVT VT = V0.getSimpleValueType();
7475 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7476 "Invalid nodes in input!");
7478 unsigned NumElts = VT.getVectorNumElements();
7479 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7480 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7481 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7482 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7483 MVT NewVT = V0_LO.getSimpleValueType();
7485 SDValue LO = DAG.getUNDEF(NewVT);
7486 SDValue HI = DAG.getUNDEF(NewVT);
7488 if (Mode) {
7489 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7490 if (!isUndefLO && !V0->isUndef())
7491 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7492 if (!isUndefHI && !V1->isUndef())
7493 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7494 } else {
7495 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7496 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7497 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7499 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7500 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7501 }
7503 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7506 /// Returns true iff \p BV builds a vector with the result equivalent to
7507 /// the result of ADDSUB/SUBADD operation.
7508 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1
7509 /// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
7510 /// \p Opnd0 and \p Opnd1.
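/// For example, the v4f32 build_vector
///   ((fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3))
/// matches ADDSUB with Opnd0 = A and Opnd1 = B.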
7511 static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
7512 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7513 SDValue &Opnd0, SDValue &Opnd1,
7514 unsigned &NumExtracts,
7517 MVT VT = BV->getSimpleValueType(0);
7518 if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
7521 unsigned NumElts = VT.getVectorNumElements();
7522 SDValue InVec0 = DAG.getUNDEF(VT);
7523 SDValue InVec1 = DAG.getUNDEF(VT);
7527 // Odd-numbered elements in the input build vector are obtained from
7528 // adding/subtracting two integer/float elements.
7529 // Even-numbered elements in the input build vector are obtained from
7530 // subtracting/adding two integer/float elements.
7531 unsigned Opc[2] {0, 0};
7532 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7533 SDValue Op = BV->getOperand(i);
7535 // Skip 'undef' values.
7536 unsigned Opcode = Op.getOpcode();
7537 if (Opcode == ISD::UNDEF)
7540 // Early exit if we found an unexpected opcode.
7541 if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
7544 SDValue Op0 = Op.getOperand(0);
7545 SDValue Op1 = Op.getOperand(1);
7547 // Try to match the following pattern:
7548 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7549 // Early exit if we cannot match that sequence.
7550 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7551 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7552 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7553 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7554 Op0.getOperand(1) != Op1.getOperand(1))
7557 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7561 // We found a valid add/sub node; make sure it's the same opcode as previous
7562 // elements for this parity.
7563 if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
7565 Opc[i % 2] = Opcode;
7567 // Update InVec0 and InVec1.
7568 if (InVec0.isUndef()) {
7569 InVec0 = Op0.getOperand(0);
7570 if (InVec0.getSimpleValueType() != VT)
7573 if (InVec1.isUndef()) {
7574 InVec1 = Op1.getOperand(0);
7575 if (InVec1.getSimpleValueType() != VT)
7579 // Make sure that operands in input to each add/sub node always
7580 // come from the same pair of vectors.
7581 if (InVec0 != Op0.getOperand(0)) {
7582 if (Opcode == ISD::FSUB)
7585 // FADD is commutable. Try to commute the operands
7586 // and then test again.
7587 std::swap(Op0, Op1);
7588 if (InVec0 != Op0.getOperand(0))
7592 if (InVec1 != Op1.getOperand(0))
7595 // Increment the number of extractions done.
7599 // Ensure we have found an opcode for both parities and that they are
7600 // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if the
7601 // inputs are undef.
7602 if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
7603 InVec0.isUndef() || InVec1.isUndef())
7606 IsSubAdd = Opc[0] == ISD::FADD;
7613 /// Returns true if it is possible to fold MUL and an idiom that has already been
7614 /// recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
7615 /// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
7616 /// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1, \p Opnd2.
7618 /// Prior to calling this function it should be known that there is some
7619 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7620 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7621 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7622 /// of \p Opnd0 uses is expected to be equal to 2.
7623 /// For example, this function may be called for the following IR:
7624 /// %AB = fmul fast <2 x double> %A, %B
7625 /// %Sub = fsub fast <2 x double> %AB, %C
7626 /// %Add = fadd fast <2 x double> %AB, %C
7627 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7628 /// <2 x i32> <i32 0, i32 3>
7629 /// There is a def for %Addsub here, which potentially can be replaced by
7630 /// X86ISD::ADDSUB operation:
7631 /// %Addsub = X86ISD::ADDSUB %AB, %C
7632 /// and such ADDSUB can further be replaced with FMADDSUB:
7633 /// %Addsub = FMADDSUB %A, %B, %C.
7635 /// The main reason why this method is called before the replacement of the
7636 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7637 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7638 /// FMADDSUB is.
7639 static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
7641 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
7642 unsigned ExpectedUses) {
7643 if (Opnd0.getOpcode() != ISD::FMUL ||
7644 !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
7647 // FIXME: These checks must match the similar ones in
7648 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7649 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7650 // or MUL + ADDSUB to FMADDSUB.
7651 const TargetOptions &Options = DAG.getTarget().Options;
7653 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7658 Opnd1 = Opnd0.getOperand(1);
7659 Opnd0 = Opnd0.getOperand(0);
7664 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' or
7665 /// 'fsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
7666 /// X86ISD::FMSUBADD node accordingly.
7667 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7668 const X86Subtarget &Subtarget,
7669 SelectionDAG &DAG) {
7670 SDValue Opnd0, Opnd1;
7671 unsigned NumExtracts;
7673 if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
7677 MVT VT = BV->getSimpleValueType(0);
7680 // Try to generate X86ISD::FMADDSUB node here.
7682 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
7683 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
7684 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
7687 // We only support ADDSUB.
7691 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7692 // the ADDSUB idiom has been successfully recognized. There are no known
7693 // X86 targets with 512-bit ADDSUB instructions!
7694 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7695 // recognition.
7696 if (VT.is512BitVector())
7699 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7702 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7703 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7704 const X86Subtarget &Subtarget,
7705 SelectionDAG &DAG) {
7706 MVT VT = BV->getSimpleValueType(0);
7707 unsigned NumElts = VT.getVectorNumElements();
7708 unsigned NumUndefsLO = 0;
7709 unsigned NumUndefsHI = 0;
7710 unsigned Half = NumElts/2;
7712 // Count the number of UNDEF operands in the build_vector in input.
7713 for (unsigned i = 0, e = Half; i != e; ++i)
7714 if (BV->getOperand(i)->isUndef())
7717 for (unsigned i = Half, e = NumElts; i != e; ++i)
7718 if (BV->getOperand(i)->isUndef())
7721 // Early exit if this is either a build_vector of all UNDEFs or all the
7722 // operands but one are UNDEF.
7723 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7727 SDValue InVec0, InVec1;
7728 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7729 // Try to match an SSE3 float HADD/HSUB.
7730 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7731 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7733 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7734 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7735 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7736 // Try to match an SSSE3 integer HADD/HSUB.
7737 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7738 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7740 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7741 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7744 if (!Subtarget.hasAVX())
7747 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7748 // Try to match an AVX horizontal add/sub of packed single/double
7749 // precision floating point values from 256-bit vectors.
7750 SDValue InVec2, InVec3;
7751 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7752 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7753 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7754 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7755 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7757 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7758 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7759 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7760 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7761 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7762 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7763 // Try to match an AVX2 horizontal add/sub of signed integers.
7764 SDValue InVec2, InVec3;
7766 bool CanFold = true;
7768 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7769 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7770 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7771 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7772 X86Opcode = X86ISD::HADD;
7773 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7774 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7775 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7776 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7777 X86Opcode = X86ISD::HSUB;
7782 // Fold this build_vector into a single horizontal add/sub.
7783 // Do this only if the target has AVX2.
7784 if (Subtarget.hasAVX2())
7785 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7787 // Do not try to expand this build_vector into a pair of horizontal
7788 // add/sub if we can emit a pair of scalar add/sub.
7789 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
// Convert this build_vector into a pair of horizontal binops followed by
// a concat vector.
7794 bool isUndefLO = NumUndefsLO == Half;
7795 bool isUndefHI = NumUndefsHI == Half;
7796 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7797 isUndefLO, isUndefHI);
7801 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7802 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7804 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7805 X86Opcode = X86ISD::HADD;
7806 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7807 X86Opcode = X86ISD::HSUB;
7808 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7809 X86Opcode = X86ISD::FHADD;
7810 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7811 X86Opcode = X86ISD::FHSUB;
7815 // Don't try to expand this build_vector into a pair of horizontal add/sub
7816 // if we can simply emit a pair of scalar add/sub.
7817 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
// Convert this build_vector into two horizontal add/sub ops followed by
// a concat vector.
7822 bool isUndefLO = NumUndefsLO == Half;
7823 bool isUndefHI = NumUndefsHI == Half;
7824 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7825 isUndefLO, isUndefHI);
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTORs and
/// just apply the bit operation to the vectors.
/// NOTE: It's not in our interest to start making a general-purpose vectorizer
/// from this, but enough scalar bit operations are created by the later
/// legalization + scalarization stages to need basic support.
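/// For example,
///   (build_vector (and X0, C0), (and X1, C1), (and X2, C2), (and X3, C3))
/// becomes (and (build_vector X0, X1, X2, X3), (build_vector C0, C1, C2, C3)).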
7837 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7838 SelectionDAG &DAG) {
7840 MVT VT = Op->getSimpleValueType(0);
7841 unsigned NumElems = VT.getVectorNumElements();
7842 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7844 // Check that all elements have the same opcode.
7845 // TODO: Should we allow UNDEFS and if so how many?
7846 unsigned Opcode = Op->getOperand(0).getOpcode();
7847 for (unsigned i = 1; i < NumElems; ++i)
7848 if (Opcode != Op->getOperand(i).getOpcode())
7851 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7858 // Don't do this if the buildvector is a splat - we'd replace one
7859 // constant with an entire vector.
7860 if (Op->getSplatValue())
7862 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7867 SmallVector<SDValue, 4> LHSElts, RHSElts;
7868 for (SDValue Elt : Op->ops()) {
7869 SDValue LHS = Elt.getOperand(0);
7870 SDValue RHS = Elt.getOperand(1);
7872 // We expect the canonicalized RHS operand to be the constant.
7873 if (!isa<ConstantSDNode>(RHS))
7875 LHSElts.push_back(LHS);
7876 RHSElts.push_back(RHS);
7879 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7880 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7881 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7884 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7885 /// functionality to do this, so it's all zeros, all ones, or some derivation
7886 /// that is cheap to calculate.
7887 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7888 const X86Subtarget &Subtarget) {
7890 MVT VT = Op.getSimpleValueType();
7892 // Vectors containing all zeros can be matched by pxor and xorps.
7893 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7894 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7895 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7896 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7899 return getZeroVector(VT, Subtarget, DAG, DL);
7902 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7903 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7904 // vpcmpeqd on 256-bit vectors.
7905 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7906 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7907 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7910 return getOnesVector(VT, DAG, DL);
7916 /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute
7917 /// from a vector of source values and a vector of extraction indices.
7918 /// The vectors might be manipulated to match the type of the permute op.
7919 SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
7920 SDLoc &DL, SelectionDAG &DAG,
7921 const X86Subtarget &Subtarget) {
7923 EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
7924 unsigned NumElts = VT.getVectorNumElements();
7925 unsigned SizeInBits = VT.getSizeInBits();
7927 // Adjust IndicesVec to match VT size.
7928 assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts &&
7929 "Illegal variable permute mask size");
7930 if (IndicesVec.getValueType().getVectorNumElements() > NumElts)
7931 IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec),
7932 NumElts * VT.getScalarSizeInBits());
7933 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
// Handle a SrcVec whose size doesn't match the VT size.
7936 if (SrcVec.getValueSizeInBits() != SizeInBits) {
7937 if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) {
7938 // Handle larger SrcVec by treating it as a larger permute.
7939 unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits;
7940 VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts);
7941 IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
7942 IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
7943 Subtarget, DAG, SDLoc(IndicesVec));
7944 return extractSubVector(
7945 createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
7946 DAG, DL, SizeInBits);
7947 } else if (SrcVec.getValueSizeInBits() < SizeInBits) {
7948 // Widen smaller SrcVec to match VT.
7949 SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
7954 auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) {
7955 assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale");
7956 EVT SrcVT = Idx.getValueType();
7957 unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale;
7958 uint64_t IndexScale = 0;
7959 uint64_t IndexOffset = 0;
7961 // If we're scaling a smaller permute op, then we need to repeat the
7962 // indices, scaling and offsetting them as well.
7963 // e.g. v4i32 -> v16i8 (Scale = 4)
7964 // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4)
7965 // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0)
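// Each original index I then expands to the byte indices
// I*Scale+0 .. I*Scale+(Scale-1), produced by the per-element multiply and
// add below.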
7966 for (uint64_t i = 0; i != Scale; ++i) {
7967 IndexScale |= Scale << (i * NumDstBits);
7968 IndexOffset |= i << (i * NumDstBits);
7971 Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx,
7972 DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT));
7973 Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx,
7974 DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT));
7978 unsigned Opcode = 0;
7979 switch (VT.SimpleTy) {
7983 if (Subtarget.hasSSSE3())
7984 Opcode = X86ISD::PSHUFB;
7987 if (Subtarget.hasVLX() && Subtarget.hasBWI())
7988 Opcode = X86ISD::VPERMV;
7989 else if (Subtarget.hasSSSE3()) {
7990 Opcode = X86ISD::PSHUFB;
7991 ShuffleVT = MVT::v16i8;
7996 if (Subtarget.hasAVX()) {
7997 Opcode = X86ISD::VPERMILPV;
7998 ShuffleVT = MVT::v4f32;
7999 } else if (Subtarget.hasSSSE3()) {
8000 Opcode = X86ISD::PSHUFB;
8001 ShuffleVT = MVT::v16i8;
8006 if (Subtarget.hasAVX()) {
8007 // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec.
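// (Adding IndicesVec to itself is a cheap way to double each index, moving it
// into bit #1.)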
8008 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8009 Opcode = X86ISD::VPERMILPV;
8010 ShuffleVT = MVT::v2f64;
8011 } else if (Subtarget.hasSSE41()) {
8012 // SSE41 can compare v2i64 - select between indices 0 and 1.
8013 return DAG.getSelectCC(
8015 getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL),
8016 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}),
8017 DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}),
8018 ISD::CondCode::SETEQ);
8022 if (Subtarget.hasVLX() && Subtarget.hasVBMI())
8023 Opcode = X86ISD::VPERMV;
8024 else if (Subtarget.hasXOP()) {
8025 SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL);
8026 SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL);
8027 SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL);
8028 SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL);
8030 ISD::CONCAT_VECTORS, DL, VT,
8031 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx),
8032 DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx));
8033 } else if (Subtarget.hasAVX()) {
8034 SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL);
8035 SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL);
8036 SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo);
8037 SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi);
8038 auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
8039 ArrayRef<SDValue> Ops) {
8040 // Permute Lo and Hi and then select based on index range.
// This works because PSHUFB only uses bits[3:0] to permute elements, and we
// don't care about bit[7] since this is just an index vector.
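// Byte indices greater than 15 therefore take the PSHUFB of HiHi below, and
// indices 0..15 take the PSHUFB of LoLo.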
8043 SDValue Idx = Ops[2];
8044 EVT VT = Idx.getValueType();
8045 return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT),
8046 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx),
8047 DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx),
8048 ISD::CondCode::SETGT);
8050 SDValue Ops[] = {LoLo, HiHi, IndicesVec};
8051 return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops,
8056 if (Subtarget.hasVLX() && Subtarget.hasBWI())
8057 Opcode = X86ISD::VPERMV;
8058 else if (Subtarget.hasAVX()) {
8059 // Scale to v32i8 and perform as v32i8.
8060 IndicesVec = ScaleIndices(IndicesVec, 2);
8061 return DAG.getBitcast(
8062 VT, createVariablePermute(
8063 MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec),
8064 DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget));
8069 if (Subtarget.hasAVX2())
8070 Opcode = X86ISD::VPERMV;
8071 else if (Subtarget.hasAVX()) {
8072 SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec);
8073 SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8074 {0, 1, 2, 3, 0, 1, 2, 3});
8075 SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec,
8076 {4, 5, 6, 7, 4, 5, 6, 7});
8077 if (Subtarget.hasXOP())
8078 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32,
8079 LoLo, HiHi, IndicesVec,
8080 DAG.getConstant(0, DL, MVT::i8)));
8081 // Permute Lo and Hi and then select based on index range.
8082 // This works as VPERMILPS only uses index bits[0:1] to permute elements.
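// Indices greater than 3 therefore select the VPERMILPV of HiHi below, and
// indices 0..3 select the VPERMILPV of LoLo.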
8083 SDValue Res = DAG.getSelectCC(
8084 DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32),
8085 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec),
8086 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec),
8087 ISD::CondCode::SETGT);
8088 return DAG.getBitcast(VT, Res);
8093 if (Subtarget.hasAVX512()) {
8094 if (!Subtarget.hasVLX()) {
8095 MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8);
8096 SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG,
8098 IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget,
8099 DAG, SDLoc(IndicesVec));
8100 SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL,
8102 return extract256BitVector(Res, 0, DAG, DL);
8104 Opcode = X86ISD::VPERMV;
8105 } else if (Subtarget.hasAVX()) {
8106 SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec);
8108 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1});
8110 DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3});
8111 // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec.
8112 IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec);
8113 if (Subtarget.hasXOP())
8114 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64,
8115 LoLo, HiHi, IndicesVec,
8116 DAG.getConstant(0, DL, MVT::i8)));
8117 // Permute Lo and Hi and then select based on index range.
8118 // This works as VPERMILPD only uses index bit[1] to permute elements.
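// After the doubling above, original indices 2..3 (now 4 or 6) compare
// greater than 2 and select the VPERMILPV of HiHi; original indices 0..1
// select the VPERMILPV of LoLo.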
8119 SDValue Res = DAG.getSelectCC(
8120 DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64),
8121 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec),
8122 DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec),
8123 ISD::CondCode::SETGT);
8124 return DAG.getBitcast(VT, Res);
8128 if (Subtarget.hasVBMI())
8129 Opcode = X86ISD::VPERMV;
8132 if (Subtarget.hasBWI())
8133 Opcode = X86ISD::VPERMV;
8139 if (Subtarget.hasAVX512())
8140 Opcode = X86ISD::VPERMV;
8146 assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) &&
8147 (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 &&
8148 "Illegal variable permute shuffle type");
8150 uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits();
8152 IndicesVec = ScaleIndices(IndicesVec, Scale);
8154 EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger();
8155 IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec);
8157 SrcVec = DAG.getBitcast(ShuffleVT, SrcVec);
8158 SDValue Res = Opcode == X86ISD::VPERMV
8159 ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec)
8160 : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec);
8161 return DAG.getBitcast(VT, Res);
// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can
// be treated as a permutation of a source vector by indices held in a
// non-constant vector:
// (build_vector (extract_elt V, (extract_elt I, 0)),
//               (extract_elt V, (extract_elt I, 1)),
//               ...)
// -> (variable permute of V by the index vector I)
//
// TODO: Handle undefs
8173 // TODO: Utilize pshufb and zero mask blending to support more efficient
8174 // construction of vectors with constant-0 elements.
8176 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
8177 const X86Subtarget &Subtarget) {
8178 SDValue SrcVec, IndicesVec;
8179 // Check for a match of the permute source vector and permute index elements.
8180 // This is done by checking that the i-th build_vector operand is of the form:
8181 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
8182 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
8183 SDValue Op = V.getOperand(Idx);
8184 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8187 // If this is the first extract encountered in V, set the source vector,
// otherwise verify the extract is from the previously defined source
// vector.
8191 SrcVec = Op.getOperand(0);
8192 else if (SrcVec != Op.getOperand(0))
8194 SDValue ExtractedIndex = Op->getOperand(1);
8195 // Peek through extends.
8196 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
8197 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
8198 ExtractedIndex = ExtractedIndex.getOperand(0);
8199 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
8202 // If this is the first extract from the index vector candidate, set the
8203 // indices vector, otherwise verify the extract is from the previously
8204 // defined indices vector.
8206 IndicesVec = ExtractedIndex.getOperand(0);
8207 else if (IndicesVec != ExtractedIndex.getOperand(0))
8210 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
8211 if (!PermIdx || PermIdx->getZExtValue() != Idx)
8216 MVT VT = V.getSimpleValueType();
8217 return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
8221 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
8224 MVT VT = Op.getSimpleValueType();
8225 MVT EltVT = VT.getVectorElementType();
8226 unsigned NumElems = Op.getNumOperands();
8228 // Generate vectors for predicate vectors.
8229 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
8230 return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget);
8232 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
8233 return VectorConstant;
8235 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
8236 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
8238 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
8239 return HorizontalOp;
8240 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
8242 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
8245 unsigned EVTBits = EltVT.getSizeInBits();
8247 unsigned NumZero = 0;
8248 unsigned NumNonZero = 0;
8249 uint64_t NonZeros = 0;
8250 bool IsAllConstants = true;
8251 SmallSet<SDValue, 8> Values;
8252 unsigned NumConstants = NumElems;
8253 for (unsigned i = 0; i < NumElems; ++i) {
8254 SDValue Elt = Op.getOperand(i);
8258 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
8259 IsAllConstants = false;
8262 if (X86::isZeroNode(Elt))
8265 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
8266 NonZeros |= ((uint64_t)1 << i);
8271 // All undef vector. Return an UNDEF. All zero vectors were handled above.
8272 if (NumNonZero == 0)
8273 return DAG.getUNDEF(VT);
8275 // If we are inserting one variable into a vector of non-zero constants, try
8276 // to avoid loading each constant element as a scalar. Load the constants as a
8277 // vector and then insert the variable scalar element. If insertion is not
8278 // supported, we assume that we will fall back to a shuffle to get the scalar
8279 // blended with the constants. Insertion into a zero vector is handled as a
8280 // special-case somewhere below here.
8281 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
8282 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
8283 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
8284 // Create an all-constant vector. The variable element in the old
8285 // build vector is replaced by undef in the constant vector. Save the
8286 // variable scalar element and its index for use in the insertelement.
8287 LLVMContext &Context = *DAG.getContext();
8288 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
8289 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
8292 for (unsigned i = 0; i != NumElems; ++i) {
8293 SDValue Elt = Op.getOperand(i);
8294 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
8295 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
8296 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
8297 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
8298 else if (!Elt.isUndef()) {
8299 assert(!VarElt.getNode() && !InsIndex.getNode() &&
8300 "Expected one variable element in this vector");
8302 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
8305 Constant *CV = ConstantVector::get(ConstVecOps);
8306 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
// The constants we just created may not be legal (e.g., floating point). We
// must lower the vector right here because we cannot guarantee that we'll
// legalize it before loading it. This is also why we could not just create
// a new build vector here. If the build vector contains illegal constants,
// it could get split back up into a series of insert elements.
8313 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
8314 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
8315 MachineFunction &MF = DAG.getMachineFunction();
8316 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
8317 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
8318 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
8321 // Special case for single non-zero, non-undef, element.
8322 if (NumNonZero == 1) {
8323 unsigned Idx = countTrailingZeros(NonZeros);
8324 SDValue Item = Op.getOperand(Idx);
8326 // If we have a constant or non-constant insertion into the low element of
8327 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
8328 // the rest of the elements. This will be matched as movd/movq/movss/movsd
8329 // depending on what the source datatype is.
8332 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8334 if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
8335 (EltVT == MVT::i64 && Subtarget.is64Bit())) {
8336 assert((VT.is128BitVector() || VT.is256BitVector() ||
8337 VT.is512BitVector()) &&
8338 "Expected an SSE value type!");
8339 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8340 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
8341 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
// We can't directly insert an i8 or i16 into a vector, so zero extend
// it to i32 first.
8346 if (EltVT == MVT::i16 || EltVT == MVT::i8) {
8347 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
8348 if (VT.getSizeInBits() >= 256) {
8349 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
8350 if (Subtarget.hasAVX()) {
8351 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
8352 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8354 // Without AVX, we need to extend to a 128-bit vector and then
8355 // insert into the 256-bit vector.
8356 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8357 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
8358 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
8361 assert(VT.is128BitVector() && "Expected an SSE value type!");
8362 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
8363 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
8365 return DAG.getBitcast(VT, Item);
8369 // Is it a vector logical left shift?
8370 if (NumElems == 2 && Idx == 1 &&
8371 X86::isZeroNode(Op.getOperand(0)) &&
8372 !X86::isZeroNode(Op.getOperand(1))) {
8373 unsigned NumBits = VT.getSizeInBits();
8374 return getVShift(true, VT,
8375 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8376 VT, Op.getOperand(1)),
8377 NumBits/2, DAG, *this, dl);
8380 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
// Otherwise, if this is a vector with i32 or f32 elements, and the element
// is a non-constant being inserted into an element other than the low one,
// we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
// movd/movss) to move this into the low element, then shuffle it into
// place.
8388 if (EVTBits == 32) {
8389 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8390 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8394 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8395 if (Values.size() == 1) {
8396 if (EVTBits == 32) {
8397 // Instead of a shuffle like this:
8398 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8399 // Check if it's possible to issue this instead.
// shuffle (vload ptr), undef, <1, 1, 1, 1>
8401 unsigned Idx = countTrailingZeros(NonZeros);
8402 SDValue Item = Op.getOperand(Idx);
8403 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8404 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8409 // A vector full of immediates; various special cases are already
8410 // handled, so this is best done with a single constant-pool load.
8414 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8417 // See if we can use a vector load to get all of the elements.
8419 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8421 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8425 // If this is a splat of pairs of 32-bit elements, we can use a narrower
8426 // build_vector and broadcast it.
8427 // TODO: We could probably generalize this more.
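// e.g. a v8i32 build_vector <A, B, A, B, A, B, A, B> is rebuilt as the v4i32
// build_vector <A, B, undef, undef>, bitcast to v2i64, broadcast to v4i64 and
// bitcast back to v8i32.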
8428 if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) {
8429 SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1),
8430 DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) };
8431 auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) {
8432 // Make sure all the even/odd operands match.
8433 for (unsigned i = 2; i != NumElems; ++i)
8434 if (Ops[i % 2] != Op.getOperand(i))
8438 if (CanSplat(Op, NumElems, Ops)) {
8439 MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64;
8440 MVT NarrowVT = MVT::getVectorVT(EltVT, 4);
8441 // Create a new build vector and cast to v2i64/v2f64.
8442 SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2),
8443 DAG.getBuildVector(NarrowVT, dl, Ops));
8444 // Broadcast from v2i64/v2f64 and cast to final VT.
8445 MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems/2);
8446 return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT,
8451 // For AVX-length vectors, build the individual 128-bit pieces and use
8452 // shuffles to put them in place.
8453 if (VT.getSizeInBits() > 128) {
8454 MVT HVT = MVT::getVectorVT(EltVT, NumElems/2);
8456 // Build both the lower and upper subvector.
8458 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8459 SDValue Upper = DAG.getBuildVector(
8460 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8462 // Recreate the wider vector with the lower and upper part.
8463 return concatSubVectors(Lower, Upper, VT, NumElems, DAG, dl,
8464 VT.getSizeInBits() / 2);
8467 // Let legalizer expand 2-wide build_vectors.
8468 if (EVTBits == 64) {
8469 if (NumNonZero == 1) {
8470 // One half is zero or undef.
8471 unsigned Idx = countTrailingZeros(NonZeros);
8472 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8473 Op.getOperand(Idx));
8474 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8479 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8480 if (EVTBits == 8 && NumElems == 16)
8481 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8485 if (EVTBits == 16 && NumElems == 8)
8486 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8490 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8491 if (EVTBits == 32 && NumElems == 4)
8492 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8495 // If element VT is == 32 bits, turn it into a number of shuffles.
8496 if (NumElems == 4 && NumZero > 0) {
8497 SmallVector<SDValue, 8> Ops(NumElems);
8498 for (unsigned i = 0; i < 4; ++i) {
8499 bool isZero = !(NonZeros & (1ULL << i));
8501 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8503 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8506 for (unsigned i = 0; i < 2; ++i) {
8507 switch ((NonZeros >> (i*2)) & 0x3) {
8508 default: llvm_unreachable("Unexpected NonZero count");
8510 Ops[i] = Ops[i*2]; // Must be a zero vector.
8513 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8516 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8519 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8524 bool Reverse1 = (NonZeros & 0x3) == 2;
8525 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8529 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8530 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8532 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8535 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
// Check for a build vector that is mostly a shuffle plus a few insert elements.
8538 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8541 // For SSE 4.1, use insertps to put the high elements into the low element.
8542 if (Subtarget.hasSSE41()) {
8544 if (!Op.getOperand(0).isUndef())
8545 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8547 Result = DAG.getUNDEF(VT);
8549 for (unsigned i = 1; i < NumElems; ++i) {
8550 if (Op.getOperand(i).isUndef()) continue;
8551 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8552 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8557 // Otherwise, expand into a number of unpckl*, start by extending each of
8558 // our (non-undef) elements to the full vector width with the element in the
8559 // bottom slot of the vector (which generates no code for SSE).
8560 SmallVector<SDValue, 8> Ops(NumElems);
8561 for (unsigned i = 0; i < NumElems; ++i) {
8562 if (!Op.getOperand(i).isUndef())
8563 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8565 Ops[i] = DAG.getUNDEF(VT);
8568 // Next, we iteratively mix elements, e.g. for v4f32:
8569 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8570 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8571 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8572 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8573 // Generate scaled UNPCKL shuffle mask.
8574 SmallVector<int, 16> Mask;
8575 for(unsigned i = 0; i != Scale; ++i)
8577 for (unsigned i = 0; i != Scale; ++i)
8578 Mask.push_back(NumElems+i);
8579 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8581 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8582 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8587 // 256-bit AVX can use the vinsertf128 instruction
8588 // to create 256-bit vectors from two other 128-bit ones.
8589 // TODO: Detect subvector broadcast here instead of DAG combine?
8590 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG,
8591 const X86Subtarget &Subtarget) {
8593 MVT ResVT = Op.getSimpleValueType();
8595 assert((ResVT.is256BitVector() ||
8596 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8598 unsigned NumOperands = Op.getNumOperands();
8599 unsigned NumZero = 0;
8600 unsigned NumNonZero = 0;
8601 unsigned NonZeros = 0;
8602 for (unsigned i = 0; i != NumOperands; ++i) {
8603 SDValue SubVec = Op.getOperand(i);
8604 if (SubVec.isUndef())
8606 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8609 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8615 // If we have more than 2 non-zeros, build each half separately.
8616 if (NumNonZero > 2) {
8617 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8618 ResVT.getVectorNumElements()/2);
8619 ArrayRef<SDUse> Ops = Op->ops();
8620 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8621 Ops.slice(0, NumOperands/2));
8622 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8623 Ops.slice(NumOperands/2));
8624 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8627 // Otherwise, build it up through insert_subvectors.
8628 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8629 : DAG.getUNDEF(ResVT);
8631 MVT SubVT = Op.getOperand(0).getSimpleValueType();
8632 unsigned NumSubElems = SubVT.getVectorNumElements();
8633 for (unsigned i = 0; i != NumOperands; ++i) {
8634 if ((NonZeros & (1 << i)) == 0)
8637 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec,
8639 DAG.getIntPtrConstant(i * NumSubElems, dl));
8645 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8646 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8647 static bool isExpandWithZeros(const SDValue &Op) {
8648 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8649 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8651 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8652 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// the output register.
8661 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8662 unsigned Opc = Op.getOpcode();
8664 assert(Opc == ISD::CONCAT_VECTORS &&
8665 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8666 "Unexpected node to check for type promotion!");
// As long as we are concatenating zeros to the upper part of a previous node
// result, climb up the tree until a node with a different opcode is
// encountered.
8671 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8672 if (Opc == ISD::INSERT_SUBVECTOR) {
8673 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8674 Op.getConstantOperandVal(2) == 0)
8675 Op = Op.getOperand(1);
8678 } else { // Opc == ISD::CONCAT_VECTORS
8679 if (isExpandWithZeros(Op))
8680 Op = Op.getOperand(0);
8684 Opc = Op.getOpcode();
8687 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8688 // of a node that zeros the upper bits (its masked version).
8689 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8690 (Op.getOpcode() == ISD::AND &&
8691 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8692 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8699 // TODO: Merge this with LowerAVXCONCAT_VECTORS?
8700 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8701 const X86Subtarget &Subtarget,
8702 SelectionDAG & DAG) {
8704 MVT ResVT = Op.getSimpleValueType();
8705 unsigned NumOperands = Op.getNumOperands();
8707 assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
8708 "Unexpected number of operands in CONCAT_VECTORS");
// If this node promotes - by concatenating zeroes - the type of the result
// of a node whose instruction already zeroes all upper (irrelevant) bits of
// the output register, mark it as legal and catch the pattern in instruction
// selection to avoid emitting extra instructions (for zeroing the upper bits).
8714 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op))
8715 return widenSubVector(ResVT, Promoted, true, Subtarget, DAG, dl);
8717 unsigned NumZero = 0;
8718 unsigned NumNonZero = 0;
8719 uint64_t NonZeros = 0;
8720 for (unsigned i = 0; i != NumOperands; ++i) {
8721 SDValue SubVec = Op.getOperand(i);
8722 if (SubVec.isUndef())
8724 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
8727 assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
8728 NonZeros |= (uint64_t)1 << i;
8734 // If there are zero or one non-zeros we can handle this very simply.
8735 if (NumNonZero <= 1) {
8736 SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
8737 : DAG.getUNDEF(ResVT);
8740 unsigned Idx = countTrailingZeros(NonZeros);
8741 SDValue SubVec = Op.getOperand(Idx);
8742 unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
8743 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
8744 DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
8747 if (NumOperands > 2) {
8748 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8749 ResVT.getVectorNumElements()/2);
8750 ArrayRef<SDUse> Ops = Op->ops();
8751 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8752 Ops.slice(0, NumOperands/2));
8753 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
8754 Ops.slice(NumOperands/2));
8755 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8758 assert(NumNonZero == 2 && "Simple cases not handled?");
8760 if (ResVT.getVectorNumElements() >= 16)
8761 return Op; // The operation is legal with KUNPCK
8763 SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
8764 DAG.getUNDEF(ResVT), Op.getOperand(0),
8765 DAG.getIntPtrConstant(0, dl));
8766 unsigned NumElems = ResVT.getVectorNumElements();
8767 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
8768 DAG.getIntPtrConstant(NumElems/2, dl));
8771 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8772 const X86Subtarget &Subtarget,
8773 SelectionDAG &DAG) {
8774 MVT VT = Op.getSimpleValueType();
8775 if (VT.getVectorElementType() == MVT::i1)
8776 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8778 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8779 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8780 Op.getNumOperands() == 4)));
8782 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8783 // from two other 128-bit ones.
8785 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8786 return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
8789 //===----------------------------------------------------------------------===//
8790 // Vector shuffle lowering
8792 // This is an experimental code path for lowering vector shuffles on x86. It is
8793 // designed to handle arbitrary vector shuffles and blends, gracefully
8794 // degrading performance as necessary. It works hard to recognize idiomatic
8795 // shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
8798 //===----------------------------------------------------------------------===//
8800 /// \brief Tiny helper function to identify a no-op mask.
8802 /// This is a somewhat boring predicate function. It checks whether the mask
8803 /// array input, which is assumed to be a single-input shuffle mask of the kind
8804 /// used by the X86 shuffle instructions (not a fully general
8805 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8806 /// in-place shuffle are 'no-op's.
8807 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8808 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8809 assert(Mask[i] >= -1 && "Out of bound mask element!");
8810 if (Mask[i] >= 0 && Mask[i] != i)
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
8819 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8820 /// and we routinely test for these.
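/// For example, for v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> stays within its
/// 128-bit lanes, while <4, 5, 6, 7, 0, 1, 2, 3> (swapping the two halves)
/// crosses them.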
8821 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8822 int LaneSize = 128 / VT.getScalarSizeInBits();
8823 int Size = Mask.size();
8824 for (int i = 0; i < Size; ++i)
8825 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8830 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8832 /// This checks a shuffle mask to see if it is performing the same
8833 /// lane-relative shuffle in each sub-lane. This trivially implies
8834 /// that it is also not lane-crossing. It may however involve a blend from the
8835 /// same lane of a second vector.
8837 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8838 /// non-trivial to compute in the face of undef lanes. The representation is
8839 /// suitable for use with existing 128-bit shuffles as entries from the second
8840 /// vector have been remapped to [LaneSize, 2*LaneSize).
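/// For example, for v8f32 the unpcklps mask <0, 8, 1, 9, 4, 12, 5, 13> repeats
/// the same pattern in both 128-bit lanes, and \p RepeatedMask becomes
/// <0, 4, 1, 5>.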
8841 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8843 SmallVectorImpl<int> &RepeatedMask) {
8844 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8845 RepeatedMask.assign(LaneSize, -1);
8846 int Size = Mask.size();
8847 for (int i = 0; i < Size; ++i) {
8848 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8851 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8852 // This entry crosses lanes, so there is no way to model this shuffle.
8855 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8856 // Adjust second vector indices to start at LaneSize instead of Size.
8857 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8858 : Mask[i] % LaneSize + LaneSize;
8859 if (RepeatedMask[i % LaneSize] < 0)
8860 // This is the first non-undef entry in this slot of a 128-bit lane.
8861 RepeatedMask[i % LaneSize] = LocalM;
8862 else if (RepeatedMask[i % LaneSize] != LocalM)
8863 // Found a mismatch with the repeated mask.
8869 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8871 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8872 SmallVectorImpl<int> &RepeatedMask) {
8873 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8876 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8878 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8879 SmallVectorImpl<int> &RepeatedMask) {
8880 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8883 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8884 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8885 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8887 SmallVectorImpl<int> &RepeatedMask) {
8888 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8889 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8890 int Size = Mask.size();
8891 for (int i = 0; i < Size; ++i) {
8892 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8893 if (Mask[i] == SM_SentinelUndef)
8895 if (Mask[i] == SM_SentinelZero) {
8896 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8898 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8901 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8902 // This entry crosses lanes, so there is no way to model this shuffle.
8905 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8906 // Adjust second vector indices to start at LaneSize instead of Size.
8908 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8909 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8910 // This is the first non-undef entry in this slot of a 128-bit lane.
8911 RepeatedMask[i % LaneSize] = LocalM;
8912 else if (RepeatedMask[i % LaneSize] != LocalM)
8913 // Found a mismatch with the repeated mask.
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
8922 /// This is a fast way to test a shuffle mask against a fixed pattern:
8924 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
8926 /// It returns true if the mask is exactly as wide as the argument list, and
8927 /// each element of the mask is either -1 (signifying undef) or the value given
8928 /// in the argument.
8929 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8930 ArrayRef<int> ExpectedMask) {
8931 if (Mask.size() != ExpectedMask.size())
8934 int Size = Mask.size();
8936 // If the values are build vectors, we can look through them to find
8937 // equivalent inputs that make the shuffles equivalent.
8938 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8939 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8941 for (int i = 0; i < Size; ++i) {
8942 assert(Mask[i] >= -1 && "Out of bound mask element!");
8943 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8944 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8945 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8946 if (!MaskBV || !ExpectedBV ||
8947 MaskBV->getOperand(Mask[i] % Size) !=
8948 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8956 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8958 /// The masks must be exactly the same width.
8960 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8961 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8963 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8964 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8965 ArrayRef<int> ExpectedMask) {
8966 int Size = Mask.size();
8967 if (Size != (int)ExpectedMask.size())
8970 for (int i = 0; i < Size; ++i)
8971 if (Mask[i] == SM_SentinelUndef)
8973 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8975 else if (Mask[i] != ExpectedMask[i])
// Merges a general DAG shuffle mask and a zeroable bit mask into a target
// shuffle mask.
8983 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8984 const APInt &Zeroable) {
8985 int NumElts = Mask.size();
8986 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8988 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8989 for (int i = 0; i != NumElts; ++i) {
8991 if (M == SM_SentinelUndef)
8993 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8994 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
9001 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
9002 if (VT != MVT::v8i32 && VT != MVT::v8f32)
9005 SmallVector<int, 8> Unpcklwd;
9006 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
9007 /* Unary = */ false);
9008 SmallVector<int, 8> Unpckhwd;
9009 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
9010 /* Unary = */ false);
9011 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
9012 isTargetShuffleEquivalent(Mask, Unpckhwd));
9013 return IsUnpackwdMask;
9016 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
9018 /// This helper function produces an 8-bit shuffle immediate corresponding to
9019 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
9023 /// NB: We rely heavily on "undef" masks preserving the input lane.
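/// For example, the reversing mask <3, 2, 1, 0> produces the immediate
/// 0b00011011 (0x1B), i.e. 3 | (2 << 2) | (1 << 4) | (0 << 6).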
9024 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
9025 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
9026 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
9027 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
9028 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
9029 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
9032 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
9033 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
9034 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
9035 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
9039 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
9040 SelectionDAG &DAG) {
9041 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
9044 /// \brief Compute whether each element of a shuffle is zeroable.
9046 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
9047 /// Either it is an undef element in the shuffle mask, the element of the input
9048 /// referenced is undef, or the element of the input referenced is known to be
9049 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
9052 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
9053 SDValue V1, SDValue V2) {
9054 APInt Zeroable(Mask.size(), 0);
9055 V1 = peekThroughBitcasts(V1);
9056 V2 = peekThroughBitcasts(V2);
9058 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
9059 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
9061 int VectorSizeInBits = V1.getValueSizeInBits();
9062 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
9063 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
9065 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9067 // Handle the easy cases.
9068 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
9073 // Determine shuffle input and normalize the mask.
9074 SDValue V = M < Size ? V1 : V2;
9077 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
9078 if (V.getOpcode() != ISD::BUILD_VECTOR)
// If the BUILD_VECTOR has fewer elements than the shuffle mask, then the
// bitcasted portion of the (larger) source element must be UNDEF/ZERO.
9083 if ((Size % V.getNumOperands()) == 0) {
9084 int Scale = Size / V->getNumOperands();
9085 SDValue Op = V.getOperand(M / Scale);
9086 if (Op.isUndef() || X86::isZeroNode(Op))
9088 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
9089 APInt Val = Cst->getAPIntValue();
9090 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9091 Val = Val.getLoBits(ScalarSizeInBits);
9094 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
9095 APInt Val = Cst->getValueAPF().bitcastToAPInt();
9096 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
9097 Val = Val.getLoBits(ScalarSizeInBits);
// If the BUILD_VECTOR has more elements than the shuffle mask, then all the
// (smaller) source elements must be UNDEF or ZERO.
9106 if ((V.getNumOperands() % Size) == 0) {
9107 int Scale = V->getNumOperands() / Size;
9108 bool AllZeroable = true;
9109 for (int j = 0; j < Scale; ++j) {
9110 SDValue Op = V.getOperand((M * Scale) + j);
9111 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
// The shuffle result has the following form:
// 0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in
// ascending order.
// Each element of Zeroable corresponds to a particular element of Mask, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in increasing
// order. If such a sub-mask exists, the function returns true.
9129 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
9130 ArrayRef<int> Mask, const EVT &VectorType,
9131 bool &IsZeroSideLeft) {
9132 int NextElement = -1;
9133 // Check if the Mask's nonzero elements are in increasing order.
9134 for (int i = 0, e = Mask.size(); i < e; i++) {
// Check that the mask's zero elements are built only from zeros.
9136 assert(Mask[i] >= -1 && "Out of bound mask element!");
9141 // Find the lowest non zero element
9142 if (NextElement < 0) {
9143 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
9144 IsZeroSideLeft = NextElement != 0;
9146 // Exit if the mask's non zero elements are not in increasing order.
9147 if (NextElement != Mask[i])
9154 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
9155 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
9156 ArrayRef<int> Mask, SDValue V1,
9158 const APInt &Zeroable,
9159 const X86Subtarget &Subtarget,
9160 SelectionDAG &DAG) {
9161 int Size = Mask.size();
9162 int LaneSize = 128 / VT.getScalarSizeInBits();
9163 const int NumBytes = VT.getSizeInBits() / 8;
9164 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
9166 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
9167 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
9168 (Subtarget.hasBWI() && VT.is512BitVector()));
9170 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
9171 // Sign bit set in i8 mask means zero element.
9172 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
9175 for (int i = 0; i < NumBytes; ++i) {
9176 int M = Mask[i / NumEltBytes];
9178 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
9181 if (Zeroable[i / NumEltBytes]) {
9182 PSHUFBMask[i] = ZeroMask;
9186 // We can only use a single input of V1 or V2.
9187 SDValue SrcV = (M >= Size ? V2 : V1);
9193 // PSHUFB can't cross lanes, ensure this doesn't happen.
9194 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
9198 M = M * NumEltBytes + (i % NumEltBytes);
9199 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
9201 assert(V && "Failed to find a source input");
9203 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
9204 return DAG.getBitcast(
9205 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
9206 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
9209 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
9210 const X86Subtarget &Subtarget, SelectionDAG &DAG,
// X86 has a dedicated shuffle that can be lowered to VEXPAND.
9214 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
9215 const APInt &Zeroable,
9216 ArrayRef<int> Mask, SDValue &V1,
9217 SDValue &V2, SelectionDAG &DAG,
9218 const X86Subtarget &Subtarget) {
9219 bool IsLeftZeroSide = true;
9220 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
9223 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
9225 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9226 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
9227 unsigned NumElts = VT.getVectorNumElements();
9228 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
9229 "Unexpected number of vector elements");
9230 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
9231 Subtarget, DAG, DL);
9232 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
9233 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
9234 return DAG.getSelect(DL, VT, VMask,
9235 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
9239 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
9240 unsigned &UnpackOpcode, bool IsUnary,
9241 ArrayRef<int> TargetMask,
9242 const SDLoc &DL, SelectionDAG &DAG,
9243 const X86Subtarget &Subtarget) {
9244 int NumElts = VT.getVectorNumElements();
9246 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
9247 for (int i = 0; i != NumElts; i += 2) {
9248 int M1 = TargetMask[i + 0];
9249 int M2 = TargetMask[i + 1];
9250 Undef1 &= (SM_SentinelUndef == M1);
9251 Undef2 &= (SM_SentinelUndef == M2);
9252 Zero1 &= isUndefOrZero(M1);
9253 Zero2 &= isUndefOrZero(M2);
9255 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
9256 "Zeroable shuffle detected");
9258 // Attempt to match the target mask against the unpack lo/hi mask patterns.
9259 SmallVector<int, 64> Unpckl, Unpckh;
9260 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
9261 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9262 UnpackOpcode = X86ISD::UNPCKL;
9263 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9264 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
9268 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
9269 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9270 UnpackOpcode = X86ISD::UNPCKH;
9271 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
9272 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
// If a unary shuffle, attempt to match as an unpack lo/hi with zero.
9277 if (IsUnary && (Zero1 || Zero2)) {
9278 // Don't bother if we can blend instead.
9279 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
9280 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
9283 bool MatchLo = true, MatchHi = true;
9284 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
9285 int M = TargetMask[i];
9287 // Ignore if the input is known to be zero or the index is undef.
9288 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
9289 (M == SM_SentinelUndef))
9292 MatchLo &= (M == Unpckl[i]);
9293 MatchHi &= (M == Unpckh[i]);
9296 if (MatchLo || MatchHi) {
9297 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
9298 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9299 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
9304 // If a binary shuffle, commute and try again.
9306 ShuffleVectorSDNode::commuteMask(Unpckl);
9307 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
9308 UnpackOpcode = X86ISD::UNPCKL;
9313 ShuffleVectorSDNode::commuteMask(Unpckh);
9314 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
9315 UnpackOpcode = X86ISD::UNPCKH;
9324 // X86 has dedicated unpack instructions that can handle specific blend
9325 // operations: UNPCKH and UNPCKL.
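// For example, for v4i32 the UNPCKL mask is <0, 4, 1, 5> (interleaving the low
// halves of the two inputs) and the UNPCKH mask is <2, 6, 3, 7> (interleaving
// the high halves).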
9326 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
9327 ArrayRef<int> Mask, SDValue V1,
9328 SDValue V2, SelectionDAG &DAG) {
9329 SmallVector<int, 8> Unpckl;
9330 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
9331 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9332 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
9334 SmallVector<int, 8> Unpckh;
9335 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
9336 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
9337 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
9339 // Commute and try again.
9340 ShuffleVectorSDNode::commuteMask(Unpckl);
9341 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
9342 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
9344 ShuffleVectorSDNode::commuteMask(Unpckh);
9345 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
                                       SDValue &V2, unsigned &PackOpcode,
                                       ArrayRef<int> TargetMask,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  unsigned NumElts = VT.getVectorNumElements();
  unsigned BitSize = VT.getScalarSizeInBits();
  MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
  MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);

  auto MatchPACK = [&](SDValue N1, SDValue N2) {
    SDValue VV1 = DAG.getBitcast(PackVT, N1);
    SDValue VV2 = DAG.getBitcast(PackVT, N2);
    if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
        (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
      V1 = VV1;
      V2 = VV2;
      SrcVT = PackVT;
      PackOpcode = X86ISD::PACKSS;
      return true;
    }

    if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
      APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
      if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
          (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
        V1 = VV1;
        V2 = VV2;
        SrcVT = PackVT;
        PackOpcode = X86ISD::PACKUS;
        return true;
      }
    }

    return false;
  };

  // Try binary shuffle.
  SmallVector<int, 32> BinaryMask;
  createPackShuffleMask(VT, BinaryMask, false);
  if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
    if (MatchPACK(V1, V2))
      return true;

  // Try unary shuffle.
  SmallVector<int, 32> UnaryMask;
  createPackShuffleMask(VT, UnaryMask, true);
  if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
    if (MatchPACK(V1, V1))
      return true;

  return false;
}
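// Lower a shuffle as a PACKSS/PACKUS of the two (bitcast) inputs when the mask
// matches a pack pattern, e.g. the binary v16i8 pack of two v8i16 inputs uses
// the even-byte mask <0, 2, 4, ..., 30>.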
9407 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
9408 ArrayRef<int> Mask, SDValue V1,
9409 SDValue V2, SelectionDAG &DAG,
9410 const X86Subtarget &Subtarget) {
  MVT PackVT;
  unsigned PackOpcode;
  if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
                                 Subtarget))
    return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
                       DAG.getBitcast(PackVT, V2));

  return SDValue();
}
9421 /// \brief Try to emit a bitmask instruction for a shuffle.
9423 /// This handles cases where we can model a blend exactly as a bitmask due to
9424 /// one of the inputs being zeroable.
9425 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9426 SDValue V2, ArrayRef<int> Mask,
9427 const APInt &Zeroable,
9428 SelectionDAG &DAG) {
9429 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9430 MVT EltVT = VT.getVectorElementType();
9431 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9432 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
9454 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9456 /// This is used as a fallback approach when first class blend instructions are
9457 /// unavailable. Currently it is only suitable for integer vectors, but could
9458 /// be generalized for floating point vectors if desirable.
9459 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9460 SDValue V2, ArrayRef<int> Mask,
9461 SelectionDAG &DAG) {
9462 assert(VT.isInteger() && "Only supports integer vector types!");
9463 MVT EltVT = VT.getVectorElementType();
9464 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9465 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9466 SmallVector<SDValue, 16> MaskOps;
9467 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9468 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9469 return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

9473 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9474 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9475 // We have to cast V2 around.
9476 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9477 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9478 DAG.getBitcast(MaskVT, V1Mask),
9479 DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
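// Forward declaration: getVectorMaskingNode selects between Op and
// PreservedSrc under the given mask; it is defined later in this file
// alongside the masked-intrinsic lowering.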
9483 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9484 SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);

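// Attempt to match a shuffle mask as an immediate blend of V1 and V2, building
// the blend bit mask and recording (via ForceV1Zero/ForceV2Zero) when an input
// must be replaced by a real zero vector to make a zeroable lane blendable.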
static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}
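// Scale a blend mask to a wider element granularity by repeating each mask bit
// Scale times, e.g. BlendMask = 0b0101 with Size = 4 and Scale = 2 becomes
// 0b00110011.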
static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
                                            int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);

  return ScaledMask;
}
9541 /// \brief Try to emit a blend instruction for a shuffle.
9543 /// This doesn't do any checks for the availability of instructions for blending
9544 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9545 /// be matched in the backend with the type given. What it does check for is
9546 /// that the shuffle mask is a blend, or convertible into a blend with zero.
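/// In the resulting BLENDI immediate, bit i selects element i from V2 when set
/// and from V1 when clear.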
9547 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9548 SDValue V2, ArrayRef<int> Original,
9549 const APInt &Zeroable,
9550 const X86Subtarget &Subtarget,
9551 SelectionDAG &DAG) {
9552 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);
  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9586 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9587 V1 = DAG.getBitcast(BlendVT, V1);
9588 V2 = DAG.getBitcast(BlendVT, V2);
9589 return DAG.getBitcast(
9590 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
9598 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9599 V1 = DAG.getBitcast(MVT::v8i16, V1);
9600 V2 = DAG.getBitcast(MVT::v8i16, V2);
9601 return DAG.getBitcast(VT,
9602 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                          DAG.getConstant(BlendMask, DL, MVT::i8)));
  }
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
9609 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9610 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9611 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
9614 if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");
    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }
9633 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9634 if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;
9638 // Scale the blend by the number of bytes per element.
9639 int Scale = VT.getScalarSizeInBits() / 8;
    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9645 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9646 // mix of LLVM's code generator and the x86 backend. We tell the code
9647 // generator that boolean values in the elements of an x86 vector register
9648 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9649 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9650 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9651 // of the element (the remaining are ignored) and 0 in that high bit would
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
9687 /// \brief Try to lower as a blend of elements from two inputs followed by
9688 /// a single-input permutation.
9690 /// This matches the pattern where we can blend elements from two inputs and
9691 /// then reduce the shuffle to a single-input permutation.
9692 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9693 SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
9696 // We build up the blend mask while checking whether a blend is a viable way
9697 // to reduce the shuffle.
9698 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9699 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
9709 else if (BlendMask[Mask[i] % Size] != Mask[i])
9710 return SDValue(); // Can't blend in the needed input!
9712 PermuteMask[i] = Mask[i] % Size;
9715 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
9719 /// \brief Generic routine to decompose a shuffle and blend into independent
9720 /// blends and permutes.
9722 /// This matches the extremely common pattern for handling combined
9723 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
9733 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9734 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9735 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9736 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9737 if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
9740 } else if (Mask[i] >= Size) {
9741 V2Mask[i] = Mask[i] - Size;
9742 BlendMask[i] = i + Size;
9745 // Try to lower with the simpler initial blend strategy unless one of the
9746 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9747 // shuffle may be able to fold with a load or other benefit. However, when
9748 // we'll have to do 2x as many shuffles in order to achieve this, blending
9749 // first is a better strategy.
9750 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9751 if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

9755 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9756 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
9760 /// \brief Try to lower a vector shuffle as a rotation.
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15, 0, 1, 2]
  //   [-1, 12, 13, 14, -1, -1, 1, -1]
  //   [-1, -1, -1, -1, -1, -1, 1, 2]
  //   [ 3, 4, 5, 6, 7, 8, 9, 10]
  //   [-1, 4, 5, 6, -1, -1, 9, -1]
  //   [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);
    if (StartIdx == 0)
      // The identity rotation isn't interesting, stop.
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
9832 /// \brief Try to lower a vector shuffle as a byte rotation.
9834 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9835 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9836 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
9838 /// does not check for the profitability of lowering either as PALIGNR or
9839 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9840 /// This matches shuffle vectors that look like:
9842 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9844 /// Essentially it concatenates V1 and V2, shifts right by some number of
9845 /// elements, and takes the low elements as the result. Note that while this is
9846 /// specified as a *right shift* because x86 is little-endian, it is a *left
9847 /// rotate* of the vector lanes.
9848 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9849 ArrayRef<int> Mask) {
9850 // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

9863 // PALIGNR rotates bytes, so we need to scale the
9864 // rotation based on how many bytes are in the vector lane.
9865 int NumElts = RepeatedMask.size();
9866 int Scale = 16 / NumElts;
  return Rotation * Scale;
}
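/// Lower a shuffle as a byte rotation: PALIGNR when SSSE3 is available,
/// otherwise a PSLLDQ/PSRLDQ/POR sequence on 128-bit vectors.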
9870 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
9874 SelectionDAG &DAG) {
9875 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9877 SDValue Lo = V1, Hi = V2;
9878 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

9882 // Cast the inputs to i8 vector of correct length to match PALIGNR or
9884 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9885 Lo = DAG.getBitcast(ByteVT, Lo);
9886 Hi = DAG.getBitcast(ByteVT, Hi);
9888 // SSSE3 targets can use the palignr instruction.
9889 if (Subtarget.hasSSSE3()) {
9890 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9891 "512-bit PALIGNR requires BWI instructions");
9892 return DAG.getBitcast(
9893 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

9897 assert(VT.is128BitVector() &&
9898 "Rotate-based lowering only supports 128-bit lowering!");
9899 assert(Mask.size() <= 16 &&
9900 "Can shuffle at most 16 bytes in a 128-bit vector!");
9901 assert(ByteVT == MVT::v16i8 &&
9902 "SSE2 rotate lowering only needed for v16i8!");
9904 // Default SSE2 implementation
9905 int LoByteShift = 16 - ByteRotation;
9906 int HiByteShift = ByteRotation;
9908 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9909 DAG.getConstant(LoByteShift, DL, MVT::i8));
9910 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9911 DAG.getConstant(HiByteShift, DL, MVT::i8));
9912 return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
9916 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9918 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9919 /// rotation of the concatenation of two vectors; This routine will
/// try to generically lower a vector shuffle through such a pattern.
9922 /// Essentially it concatenates V1 and V2, shifts right by some number of
9923 /// elements, and takes the low elements as the result. Note that while this is
9924 /// specified as a *right shift* because x86 is little-endian, it is a *left
9925 /// rotate* of the vector lanes.
9926 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
9930 SelectionDAG &DAG) {
9931 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9932 "Only 32-bit and 64-bit elements are supported!");
9934 // 128/256-bit vectors are only supported with VLX.
9935 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9936 && "VLX required for 128/256-bit vectors");
  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

9943 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
9947 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9949 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9950 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9951 /// matches elements from one of the input vectors shuffled to the left or
9952 /// right with zeroable elements 'shifted in'. It handles both the strictly
9953 /// bit-wise element shifts and the byte shift across an entire 128-bit double
9956 /// PSHL : (little-endian) left bit shift.
9957 /// [ zz, 0, zz, 2 ]
9958 /// [ -1, 4, zz, -1 ]
9959 /// PSRL : (little-endian) right bit shift.
9961 /// [ -1, -1, 7, zz]
9962 /// PSLLDQ : (little-endian) left byte shift
9963 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9964 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9965 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9966 /// PSRLDQ : (little-endian) right byte shift
9967 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9968 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9969 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
9970 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9971 unsigned ScalarSizeInBits,
9972 ArrayRef<int> Mask, int MaskOffset,
9973 const APInt &Zeroable,
9974 const X86Subtarget &Subtarget) {
9975 int Size = Mask.size();
9976 unsigned SizeInBits = Size * ScalarSizeInBits;
  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
9988 for (int i = 0; i != Size; i += Scale) {
9989 unsigned Pos = Left ? i + Shift : i;
9990 unsigned Low = Left ? i : i + Shift;
9991 unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

9996 int ShiftEltBits = ScalarSizeInBits * Scale;
9997 bool ByteShift = ShiftEltBits > 64;
9998 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9999 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
10000 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
10002 // Normalize the scale for byte shifts to still produce an i64 element
10004 Scale = ByteShift ? Scale / 2 : Scale;
10006 // We need to round trip through the appropriate type for the shift.
10007 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
10008 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
10009 : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

10013 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
10014 // keep doubling the size of the integer elements up to that. We can
10015 // then shift the elements of the integer vector by whole multiples of
10016 // their width within the elements of the larger integer vector. Test each
10017 // multiple to see if we can find a match with the moved element indices
10018 // and that the shifted in elements are all zeroable.
10019 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
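// Try to lower a shuffle as a bit or byte shift of a single input, with the
// vacated positions filled by zeroable elements.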
10033 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
10034 SDValue V2, ArrayRef<int> Mask,
10035 const APInt &Zeroable,
10036 const X86Subtarget &Subtarget,
10037 SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
  SDValue V = V1;

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

10060 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
10061 "Illegal integer vector type");
10062 V = DAG.getBitcast(ShiftVT, V);
10063 V = DAG.getNode(Opcode, DL, ShiftVT, V,
10064 DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
10068 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
10069 // Remainder of lower half result is zero and upper half is all undef.
10070 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
10071 ArrayRef<int> Mask, uint64_t &BitLen,
10072 uint64_t &BitIdx, const APInt &Zeroable) {
10073 int Size = Mask.size();
10074 int HalfSize = Size / 2;
10075 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10076 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
10078 // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

10082 // Determine the extraction length from the part of the
10083 // lower half that isn't zeroable.
10084 int Len = HalfSize;
10085 for (; Len > 0; --Len)
10086 if (!Zeroable[Len - 1])
10088 assert(Len > 0 && "Zeroable shuffle mask");
10090 // Attempt to match first Len sequential elements from the lower half.
10093 for (int i = 0; i != Len; ++i) {
10095 if (M == SM_SentinelUndef)
10097 SDValue &V = (M < Size ? V1 : V2);
10100 // The extracted elements must start at a valid index and all mask
10101 // elements must be in the lower half.
10102 if (i > M || M >= HalfSize)
10105 if (Idx < 0 || (Src == V && Idx == (M - i))) {
10113 if (!Src || Idx < 0)
10116 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
10123 // INSERTQ: Extract lowest Len elements from lower half of second source and
10124 // insert over first source, starting at Idx.
10125 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
10126 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
10127 ArrayRef<int> Mask, uint64_t &BitLen,
10128 uint64_t &BitIdx) {
10129 int Size = Mask.size();
10130 int HalfSize = Size / 2;
10131 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
10133 // Upper half must be undefined.
10134 if (!isUndefInRange(Mask, HalfSize, HalfSize))
10137 for (int Idx = 0; Idx != HalfSize; ++Idx) {
10140 // Attempt to match first source from mask before insertion point.
10141 if (isUndefInRange(Mask, 0, Idx)) {
10143 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
10145 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
10151 // Extend the extraction length looking to match both the insertion of
10152 // the second source and the remaining elements of the first.
10153 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
10155 int Len = Hi - Idx;
10157 // Match insertion.
10158 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
10160 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
10166 // Match the remaining elements of the lower half.
10167 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
10169 } else if ((!Base || (Base == V1)) &&
10170 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
10172 } else if ((!Base || (Base == V2)) &&
10173 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
10180 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
10181 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
10191 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
10192 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
10193 SDValue V2, ArrayRef<int> Mask,
10194 const APInt &Zeroable,
10195 SelectionDAG &DAG) {
10196 uint64_t BitLen, BitIdx;
10197 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
10198 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
10199 DAG.getConstant(BitLen, DL, MVT::i8),
10200 DAG.getConstant(BitIdx, DL, MVT::i8));
10202 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
10203 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
10204 V2 ? V2 : DAG.getUNDEF(VT),
10205 DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
10211 /// \brief Lower a vector shuffle as a zero or any extension.
10213 /// Given a specific number of elements, element bit width, and extension
10214 /// stride, produce either a zero or any extension based on the available
10215 /// features of the subtarget. The extended elements are consecutive and
/// begin at an element index in the input, which can be offset; to avoid
/// excess shuffling the offset must either be in the bottom lane or at the
/// start of a higher lane. All extended elements must be from the same input
/// vector.
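/// For example, a v8i16 zero extension with Scale == 2 matches the
/// little-endian mask <0, zz, 1, zz, 2, zz, 3, zz> (zz = zeroable).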
10220 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10221 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
10222 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10223 assert(Scale > 1 && "Need a scale to extend.");
10224 int EltBits = VT.getScalarSizeInBits();
10225 int NumElements = VT.getVectorNumElements();
10226 int NumEltsPerLane = 128 / EltBits;
10227 int OffsetLane = Offset / NumEltsPerLane;
10228 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
10229 "Only 8, 16, and 32 bit elements can be extended.");
10230 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
10231 assert(0 <= Offset && "Extension offset must be positive.");
10232 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
10233 "Extension offset must be in the first lane or start an upper lane.");
10235 // Check that an index is in same lane as the base offset.
10236 auto SafeOffset = [&](int Idx) {
10237 return OffsetLane == (Idx / NumEltsPerLane);
10240 // Shift along an input so that the offset base moves to the first element.
10241 auto ShuffleOffset = [&](SDValue V) {
10245 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10246 for (int i = 0; i * Scale < NumElements; ++i) {
10247 int SrcIdx = i + Offset;
10248 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
10250 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
10253 // Found a valid zext mask! Try various lowering strategies based on the
10254 // input type and available ISA extensions.
10255 if (Subtarget.hasSSE41()) {
10256 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
10257 // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
10260 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
10261 NumElements / Scale);
10262 InputV = ShuffleOffset(InputV);
10263 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
10264 return DAG.getBitcast(VT, InputV);
10267 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
10269 // For any extends we can cheat for larger element sizes and use shuffle
10270 // instructions that can fold with a load and/or copy.
10271 if (AnyExt && EltBits == 32) {
10272 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
10274 return DAG.getBitcast(
10275 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10276 DAG.getBitcast(MVT::v4i32, InputV),
10277 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10279 if (AnyExt && EltBits == 16 && Scale > 2) {
10280 int PSHUFDMask[4] = {Offset / 2, -1,
10281 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
10282 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
10283 DAG.getBitcast(MVT::v4i32, InputV),
10284 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10285 int PSHUFWMask[4] = {1, -1, -1, -1};
10286 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
10287 return DAG.getBitcast(
10288 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
10289 DAG.getBitcast(MVT::v8i16, InputV),
10290 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
10293 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
10295 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
10296 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
10297 assert(VT.is128BitVector() && "Unexpected vector width!");
10299 int LoIdx = Offset * EltBits;
10300 SDValue Lo = DAG.getBitcast(
10301 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10302 DAG.getConstant(EltBits, DL, MVT::i8),
10303 DAG.getConstant(LoIdx, DL, MVT::i8)));
10305 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
10306 !SafeOffset(Offset + 1))
10307 return DAG.getBitcast(VT, Lo);
10309 int HiIdx = (Offset + 1) * EltBits;
10310 SDValue Hi = DAG.getBitcast(
10311 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
10312 DAG.getConstant(EltBits, DL, MVT::i8),
10313 DAG.getConstant(HiIdx, DL, MVT::i8)));
10314 return DAG.getBitcast(VT,
10315 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
10318 // If this would require more than 2 unpack instructions to expand, use
10319 // pshufb when available. We can only use more than 2 unpack instructions
10320 // when zero extending i8 elements which also makes it easier to use pshufb.
10321 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
10322 assert(NumElements == 16 && "Unexpected byte vector width!");
10323 SDValue PSHUFBMask[16];
10324 for (int i = 0; i < 16; ++i) {
10325 int Idx = Offset + (i / Scale);
10326 PSHUFBMask[i] = DAG.getConstant(
10327 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
10329 InputV = DAG.getBitcast(MVT::v16i8, InputV);
10330 return DAG.getBitcast(
10331 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
10332 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
10335 // If we are extending from an offset, ensure we start on a boundary that
10336 // we can unpack from.
10337 int AlignToUnpack = Offset % (NumElements / Scale);
10338 if (AlignToUnpack) {
10339 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
10340 for (int i = AlignToUnpack; i < NumElements; ++i)
10341 ShMask[i - AlignToUnpack] = i;
10342 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
10343 Offset -= AlignToUnpack;
  // Otherwise emit a sequence of unpacks.
  do {
10348 unsigned UnpackLoHi = X86ISD::UNPCKL;
10349 if (Offset >= (NumElements / 2)) {
10350 UnpackLoHi = X86ISD::UNPCKH;
10351 Offset -= (NumElements / 2);
10354 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
10355 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
10356 : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
10366 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
10368 /// This routine will try to do everything in its power to cleverly lower
10369 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
10370 /// check for the profitability of this lowering, it tries to aggressively
10371 /// match this pattern. It will use all of the micro-architectural details it
10372 /// can to emit an efficient lowering. It handles both blends with all-zero
10373 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
10374 /// masking out later).
10376 /// The reason we have dedicated lowering for zext-style shuffles is that they
10377 /// are both incredibly common and often quite performance sensitive.
10378 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
10379 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10380 const APInt &Zeroable, const X86Subtarget &Subtarget,
10381 SelectionDAG &DAG) {
10382 int Bits = VT.getSizeInBits();
10383 int NumLanes = Bits / 128;
10384 int NumElements = VT.getVectorNumElements();
10385 int NumEltsPerLane = NumElements / NumLanes;
10386 assert(VT.getScalarSizeInBits() <= 32 &&
10387 "Exceeds 32-bit integer zero extension limit");
10388 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
10390 // Define a helper function to check a particular ext-scale and lower to it if
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
10401 if (i % Scale != 0) {
10402 // Each of the extended elements need to be zeroable.
10406 // We no longer are in the anyext case.
10411 // Each of the base elements needs to be consecutive indices into the
10412 // same input vector.
10413 SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
10418 } else if (InputV != V)
10419 return SDValue(); // Flip-flopping inputs.
10421 // Offset must start in the lowest 128-bit lane or at the start of an
10423 // FIXME: Is it ever worth allowing a negative base offset?
10424 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
10425 (Offset % NumEltsPerLane) == 0))
10428 // If we are offsetting, all referenced entries must come from the same
10430 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
10433 if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }
10438 // If we fail to find an input, we have a zero-shuffle which should always
10439 // have already been handled.
10440 // FIXME: Maybe handle this here in case during blending we end up with one?
10444 // If we are offsetting, don't extend if we only match a single input, we
10445 // can always do better by using a basic PSHUF or PUNPCK.
10446 if (Offset != 0 && Matches < 2)
10449 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
10450 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
10453 // The widest scale possible for extending is to a 64-bit integer.
10454 assert(Bits % 64 == 0 &&
10455 "The number of bits in a vector must be divisible by 64 on x86!");
10456 int NumExtElements = Bits / 64;
10458 // Each iteration, try extending the elements half as much, but into twice as
10460 for (; NumExtElements < NumElements; NumExtElements *= 2) {
10461 assert(NumElements % NumExtElements == 0 &&
10462 "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }
10467 // General extends failed, but 128-bit vectors may be able to use MOVQ.
10471 // Returns one of the source operands if the shuffle can be reduced to a
10472 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
10473 auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
10485 V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
10494 /// \brief Try to get a scalar value for a specific element of a vector.
10496 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
10497 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
10498 SelectionDAG &DAG) {
10499 MVT VT = V.getSimpleValueType();
10500 MVT EltVT = VT.getVectorElementType();
10501 V = peekThroughBitcasts(V);
10503 // If the bitcasts shift the element size, we can't extract an equivalent
10504 // element from it.
10505 MVT NewVT = V.getSimpleValueType();
10506 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
10509 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10510 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
10511 // Ensure the scalar operand is the same size as the destination.
10512 // FIXME: Add support for scalar truncation where possible.
10513 SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
10521 /// \brief Helper to test for a load that can be folded with x86 shuffles.
10523 /// This is particularly important because the set of instructions varies
10524 /// significantly based on whether the operand is a load or not.
10525 static bool isShuffleFoldableLoad(SDValue V) {
10526 V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
10530 /// \brief Try to lower insertion of a single element into a zero vector.
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
10534 static SDValue lowerVectorShuffleAsElementInsertion(
10535 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10536 const APInt &Zeroable, const X86Subtarget &Subtarget,
10537 SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
10545 for (int i = 0, Size = Mask.size(); i < Size; ++i)
10546 if (i != V2Index && !Zeroable[i]) {
10547 IsV1Zeroable = false;
10551 // Check for a single input from a SCALAR_TO_VECTOR node.
10552 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
10553 // all the smarts here sunk into that routine. However, the current
10554 // lowering of BUILD_VECTOR makes that nearly impossible until the old
10555 // vector shuffle lowering is dead.
10556 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
10558 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
10559 // We need to zext the scalar if it is smaller than an i32.
10560 V2S = DAG.getBitcast(EltVT, V2S);
10561 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
10562 // Using zext to expand a narrow element won't work for non-zero
10567 // Zero-extend directly to i32.
10568 ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
10569 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
10571 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
10572 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
10573 EltVT == MVT::i16) {
10574 // Either not inserting from the low element of the input or the input
10575 // element size is too small to use VZEXT_MOVL to clear the high bits.
10579 if (!IsV1Zeroable) {
10580 // If V1 can't be treated as a zero vector we have fewer options to lower
10581 // this. We can't support integer vectors or non-zero targets cheaply, and
10582 // the V1 elements can't be permuted in any way.
10583 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
10584 if (!VT.isFloatingPoint() || V2Index != 0)
10586 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
10587 V1Mask[V2Index] = -1;
10588 if (!isNoopShuffleMask(V1Mask))
10590 if (!VT.is128BitVector())
10593 // Otherwise, use MOVSD or MOVSS.
10594 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
10595 "Only two types of floating point element types to handle!");
10596 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
10600 // This lowering only works for the low element with floating point vectors.
10601 if (VT.isFloatingPoint() && V2Index != 0)
10604 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
10606 V2 = DAG.getBitcast(VT, V2);
10608 if (V2Index != 0) {
10609 // If we have 4 or fewer lanes we can cheaply shuffle the element into
10610 // the desired position. Otherwise it is more efficient to do a vector
10611 // shift left. We know that we can do a vector shift left because all
10612 // the inputs are zero.
10613 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
10614 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
10615 V2Shuffle[V2Index] = 0;
10616 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
10618 V2 = DAG.getBitcast(MVT::v16i8, V2);
10620 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
10621 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
10622 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single (truncated) integer element,
10631 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
10633 /// This assumes we have AVX2.
10634 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
10635 SDValue V0, int BroadcastIdx,
10636 const X86Subtarget &Subtarget,
10637 SelectionDAG &DAG) {
10638 assert(Subtarget.hasAVX2() &&
10639 "We can only lower integer broadcasts with AVX2!");
10641 EVT EltVT = VT.getVectorElementType();
10642 EVT V0VT = V0.getValueType();
10644 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
10645 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
10647 EVT V0EltVT = V0VT.getVectorElementType();
10648 if (!V0EltVT.isInteger())
10651 const unsigned EltSize = EltVT.getSizeInBits();
10652 const unsigned V0EltSize = V0EltVT.getSizeInBits();
10654 // This is only a truncation if the original element type is larger.
10655 if (V0EltSize <= EltSize)
10658 assert(((V0EltSize % EltSize) == 0) &&
10659 "Scalar type sizes must all be powers of 2 on x86!");
10661 const unsigned V0Opc = V0.getOpcode();
10662 const unsigned Scale = V0EltSize / EltSize;
10663 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
10665 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
10666 V0Opc != ISD::BUILD_VECTOR)
10669 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
10671 // If we're extracting non-least-significant bits, shift so we can truncate.
10672 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
10673 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
10674 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
10675 if (const int OffsetIdx = BroadcastIdx % Scale)
10676 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
10677 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
10679 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
10683 /// \brief Try to lower broadcast of a single element.
10685 /// For convenience, this code also bundles all of the subtarget feature set
10686 /// filtering. While a little annoying to re-dispatch on type here, there isn't
10687 /// a convenient way to factor it out.
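/// For example, with AVX a splat of element 0 of a v4f32 (shuffle mask
/// <0, 0, 0, 0>) becomes a single VBROADCASTSS.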
10688 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
10689 SDValue V1, SDValue V2,
10690 ArrayRef<int> Mask,
10691 const X86Subtarget &Subtarget,
10692 SelectionDAG &DAG) {
10693 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10694 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10695 (Subtarget.hasAVX2() && VT.isInteger())))
10698 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10699 // we can only broadcast from a register with AVX2.
10700 unsigned NumElts = Mask.size();
10701 unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
10703 : X86ISD::VBROADCAST;
10704 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10706 // Check that the mask is a broadcast.
10707 int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
10718 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10719 "a sorted mask where the broadcast "
10722 // Go up the chain of (vector) values to find a scalar load that we can
10723 // combine with the broadcast.
10726 switch (V.getOpcode()) {
10727 case ISD::BITCAST: {
10728 // Peek through bitcasts as long as BroadcastIdx can be adjusted.
10729 SDValue VSrc = V.getOperand(0);
10730 unsigned NumEltBits = V.getScalarValueSizeInBits();
10731 unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
10732 if ((NumEltBits % NumSrcBits) == 0)
10733 BroadcastIdx *= (NumEltBits / NumSrcBits);
10734 else if ((NumSrcBits % NumEltBits) == 0 &&
10735 (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
10736 BroadcastIdx /= (NumSrcBits / NumEltBits);
10742 case ISD::CONCAT_VECTORS: {
10743 int OperandSize = Mask.size() / V.getNumOperands();
10744 V = V.getOperand(BroadcastIdx / OperandSize);
10745 BroadcastIdx %= OperandSize;
10748 case ISD::INSERT_SUBVECTOR: {
10749 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10750 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10754 int BeginIdx = (int)ConstantIdx->getZExtValue();
10756 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10757 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10758 BroadcastIdx -= BeginIdx;
10769 // Ensure the source vector and BroadcastIdx are for a suitable type.
10770 if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
10771 unsigned NumEltBits = VT.getScalarSizeInBits();
10772 unsigned NumSrcBits = V.getScalarValueSizeInBits();
10773 if ((NumSrcBits % NumEltBits) == 0)
10774 BroadcastIdx *= (NumSrcBits / NumEltBits);
10775 else if ((NumEltBits % NumSrcBits) == 0 &&
10776 (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
10777 BroadcastIdx /= (NumEltBits / NumSrcBits);
10781 unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
10782 MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
10783 V = DAG.getBitcast(SrcVT, V);
10786 // Check if this is a broadcast of a scalar. We special case lowering
10787 // for scalars so that we can more effectively fold with loads.
10788 // First, look through bitcast: if the original value has a larger element
10789 // type than the shuffle, the broadcast element is in essence truncated.
10790 // Make that explicit to ease folding.
10791 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10792 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10793 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10794 return TruncBroadcast;
10796 MVT BroadcastVT = VT;
10798 // Peek through any bitcast (only useful for loads).
10799 SDValue BC = peekThroughBitcasts(V);
10801 // Also check the simpler case, where we can directly reuse the scalar.
10802 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10803 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10804 V = V.getOperand(BroadcastIdx);
10806 // If we can't broadcast from a register, check that the input is a load.
10807 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10809 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10810 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10811 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10812 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10813 Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
10818 // If we are broadcasting a load that is only used by the shuffle
10819 // then we can reduce the vector load to the broadcasted scalar load.
10820 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10821 SDValue BaseAddr = Ld->getOperand(1);
10822 EVT SVT = BroadcastVT.getScalarType();
10823 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10824 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10825 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10826 DAG.getMachineFunction().getMachineMemOperand(
10827 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10828 DAG.makeEquivalentMemoryOrdering(Ld, V);
10829 } else if (!BroadcastFromReg) {
10830 // We can't broadcast from a vector register.
10832 } else if (BroadcastIdx != 0) {
10833 // We can only broadcast from the zero-element of a vector register,
10834 // but it can be advantageous to broadcast from the zero-element of a
10836 if (!VT.is256BitVector() && !VT.is512BitVector())
10839 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10840 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10843 // Only broadcast the zero-element of a 128-bit subvector.
10844 unsigned EltSize = VT.getScalarSizeInBits();
10845 if (((BroadcastIdx * EltSize) % 128) != 0)
10848 // The shuffle input might have been a bitcast we looked through; look at
10849 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10850 // later bitcast it to BroadcastVT.
10851 assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10852 "Unexpected vector element size");
10853 assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
10854 "Unexpected vector size");
10855 V = extract128BitVector(V, BroadcastIdx, DAG, DL);
10858 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10859 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10860 DAG.getBitcast(MVT::f64, V));
10862 // Bitcast back to the same scalar type as BroadcastVT.
10863 MVT SrcVT = V.getSimpleValueType();
10864 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10865 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10866 "Unexpected vector element size");
10867 if (SrcVT.isVector()) {
10868 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10869 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10871 SrcVT = BroadcastVT.getScalarType();
10873 V = DAG.getBitcast(SrcVT, V);
10876 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10877 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10878 V = DAG.getBitcast(MVT::f64, V);
10879 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10880 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10883 // We only support broadcasting from 128-bit vectors to minimize the
10884 // number of patterns we need to deal with in isel. So extract down to
10885 // 128-bits, removing as many bitcasts as possible.
10886 if (SrcVT.getSizeInBits() > 128) {
10887 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
10888 128 / SrcVT.getScalarSizeInBits());
10889 V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
10890 V = DAG.getBitcast(ExtVT, V);
  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
10896 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10897 // INSERTPS when the V1 elements are already in the correct locations
10898 // because otherwise we can just always use two SHUFPS instructions which
10899 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10900 // perform INSERTPS if a single V1 element is out of place and all V2
10901 // elements are zeroable.
10902 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10903 unsigned &InsertPSMask,
10904 const APInt &Zeroable,
10905 ArrayRef<int> Mask,
10906 SelectionDAG &DAG) {
10907 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10908 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10909 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10911 // Attempt to match INSERTPS with one element from VA or VB being
10912 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10914 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10915 ArrayRef<int> CandidateMask) {
10916 unsigned ZMask = 0;
10917 int VADstIndex = -1;
10918 int VBDstIndex = -1;
10919 bool VAUsedInPlace = false;
10921 for (int i = 0; i < 4; ++i) {
10922 // Synthesize a zero mask from the zeroable elements (includes undefs).
10928 // Flag if we use any VA inputs in place.
10929 if (i == CandidateMask[i]) {
10930 VAUsedInPlace = true;
10934 // We can only insert a single non-zeroable element.
10935 if (VADstIndex >= 0 || VBDstIndex >= 0)
10938 if (CandidateMask[i] < 4) {
10939 // VA input out of place for insertion.
10942 // VB input for insertion.
10947 // Don't bother if we have no (non-zeroable) element for insertion.
10948 if (VADstIndex < 0 && VBDstIndex < 0)
10951 // Determine element insertion src/dst indices. The src index is from the
10952 // start of the inserted vector, not the start of the concatenated vector.
10953 unsigned VBSrcIndex = 0;
10954 if (VADstIndex >= 0) {
10955 // If we have a VA input out of place, we use VA as the V2 element
10956 // insertion and don't use the original V2 at all.
10957 VBSrcIndex = CandidateMask[VADstIndex];
10958 VBDstIndex = VADstIndex;
10961 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10964 // If no V1 inputs are used in place, then the result is created only from
10965 // the zero mask and the V2 insertion - so remove V1 dependency.
10966 if (!VAUsedInPlace)
10967 VA = DAG.getUNDEF(MVT::v4f32);
10969 // Update V1, V2 and InsertPSMask accordingly.
10970 V1 = VA;
10971 V2 = VB;
10973 // Insert the V2 element into the desired position.
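// The INSERTPS immediate packs the source element index into bits [7:6], the
// destination element index into bits [5:4], and the zero mask into bits [3:0].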
10974 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10975 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10976 return true;
10977 };
10979 if (matchAsInsertPS(V1, V2, Mask))
10980 return true;
10982 // Commute and try again.
10983 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10984 ShuffleVectorSDNode::commuteMask(CommutedMask);
10985 if (matchAsInsertPS(V2, V1, CommutedMask))
10986 return true;
10988 return false;
10989 }
10991 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10992 SDValue V2, ArrayRef<int> Mask,
10993 const APInt &Zeroable,
10994 SelectionDAG &DAG) {
10995 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10996 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10998 // Attempt to match the insertps pattern.
10999 unsigned InsertPSMask;
11000 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
11001 return SDValue();
11003 // Insert the V2 element into the desired position.
11004 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
11005 DAG.getConstant(InsertPSMask, DL, MVT::i8));
11006 }
11008 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
11009 /// UNPCK instruction.
11011 /// This specifically targets cases where we end up with alternating between
11012 /// the two inputs, and so can permute them into something that feeds a single
11013 /// UNPCK instruction. Note that this routine only targets integer vectors
11014 /// because for floating point vectors we have a generalized SHUFPS lowering
11015 /// strategy that handles everything that doesn't *exactly* match an unpack,
11016 /// making this clever lowering unnecessary.
11017 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
11018 SDValue V1, SDValue V2,
11019 ArrayRef<int> Mask,
11020 SelectionDAG &DAG) {
11021 assert(!VT.isFloatingPoint() &&
11022 "This routine only supports integer vectors.");
11023 assert(VT.is128BitVector() &&
11024 "This routine only works on 128-bit vectors.");
11025 assert(!V2.isUndef() &&
11026 "This routine should only be used when blending two inputs.");
11027 assert(Mask.size() >= 2 && "Single element masks are invalid.");
11029 int Size = Mask.size();
11031 int NumLoInputs =
11032 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
11033 int NumHiInputs =
11034 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
11036 bool UnpackLo = NumLoInputs >= NumHiInputs;
11038 auto TryUnpack = [&](int ScalarSize, int Scale) {
11039 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
11040 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
11042 for (int i = 0; i < Size; ++i) {
11043 if (Mask[i] < 0)
11044 continue;
11046 // Each element of the unpack contains Scale elements from this mask.
11047 int UnpackIdx = i / Scale;
11049 // We only handle the case where V1 feeds the first slots of the unpack.
11050 // We rely on canonicalization to ensure this is the case.
11051 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
11052 return SDValue();
11054 // Setup the mask for this input. The indexing is tricky as we have to
11055 // handle the unpack stride.
11056 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
11057 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
11058 Mask[i] % Size;
11059 }
11061 // If we will have to shuffle both inputs to use the unpack, check whether
11062 // we can just unpack first and shuffle the result. If so, skip this unpack.
11063 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
11064 !isNoopShuffleMask(V2Mask))
11065 return SDValue();
11067 // Shuffle the inputs into place.
11068 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
11069 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
11071 // Cast the inputs to the type we will use to unpack them.
11072 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
11073 V1 = DAG.getBitcast(UnpackVT, V1);
11074 V2 = DAG.getBitcast(UnpackVT, V2);
11076 // Unpack the inputs and cast the result back to the desired type.
11077 return DAG.getBitcast(
11078 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11079 UnpackVT, V1, V2));
11080 };
11082 // We try each unpack from the largest to the smallest to try and find one
11083 // that fits this mask.
11084 int OrigScalarSize = VT.getScalarSizeInBits();
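// For a v4i32 shuffle, for example, this tries a 64-bit unpack (Scale == 2)
// first and then falls back to the native 32-bit unpack (Scale == 1).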
11085 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
11086 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
11087 return Unpack;
11089 // If none of the unpack-rooted lowerings worked (or were profitable) try an
11090 // initial unpack.
11091 if (NumLoInputs == 0 || NumHiInputs == 0) {
11092 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
11093 "We have to have *some* inputs!");
11094 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
11096 // FIXME: We could consider the total complexity of the permute of each
11097 // possible unpacking. Or at the least we should consider how many
11098 // half-crossings are created.
11099 // FIXME: We could consider commuting the unpacks.
11101 SmallVector<int, 32> PermMask((unsigned)Size, -1);
11102 for (int i = 0; i < Size; ++i) {
11103 if (Mask[i] < 0)
11104 continue;
11106 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
11108 PermMask[i] =
11109 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
11110 }
11111 return DAG.getVectorShuffle(
11112 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
11113 DL, VT, V1, V2),
11114 DAG.getUNDEF(VT), PermMask);
11115 }
11117 return SDValue();
11118 }
11120 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
11122 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
11123 /// support for floating point shuffles but not integer shuffles. These
11124 /// instructions will incur a domain crossing penalty on some chips though so
11125 /// it is better to avoid lowering through this for integer vectors where
11126 /// possible.
11127 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11128 const APInt &Zeroable,
11129 SDValue V1, SDValue V2,
11130 const X86Subtarget &Subtarget,
11131 SelectionDAG &DAG) {
11132 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11133 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
11134 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11136 if (V2.isUndef()) {
11137 // Check for being able to broadcast a single element.
11138 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11139 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
11140 return Broadcast;
11142 // Straight shuffle of a single input vector. Simulate this by using the
11143 // single input as both of the "inputs" to this instruction.
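// The SHUFPD immediate uses one bit per result element: bit 0 selects the
// lane taken from the first source and bit 1 the lane taken from the second
// (both are V1 in this single-input case).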
11144 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
11146 if (Subtarget.hasAVX()) {
11147 // If we have AVX, we can use VPERMILPD which will allow folding a load
11148 // into the shuffle.
11149 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
11150 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11151 }
11153 return DAG.getNode(
11154 X86ISD::SHUFP, DL, MVT::v2f64,
11155 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11156 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
11157 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11158 }
11159 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
11160 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
11162 // If we have a single input, insert that into V1 if we can do so cheaply.
11163 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
11164 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11165 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11166 return Insertion;
11167 // Try inverting the insertion since for v2 masks it is easy to do and we
11168 // can't reliably sort the mask one way or the other.
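// XOR-ing a mask element with 2 flips it between the V1 range [0,2) and the
// V2 range [2,4), which is exactly what swapping the operands requires.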
11169 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
11170 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
11171 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11172 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11173 return Insertion;
11174 }
11176 // Try to use one of the special instruction patterns to handle two common
11177 // blend patterns if a zero-blend above didn't work.
11178 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
11179 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
11180 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
11181 // We can either use a special instruction to load over the low double or
11182 // to move just the low double.
11183 return DAG.getNode(
11184 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
11185 DL, MVT::v2f64, V2,
11186 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
11188 if (Subtarget.hasSSE41())
11189 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
11190 Zeroable, Subtarget, DAG))
11191 return Blend;
11193 // Use dedicated unpack instructions for masks that match their pattern.
11194 if (SDValue V =
11195 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
11196 return V;
11198 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
11199 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
11200 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
11201 }
11203 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
11205 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
11206 /// the integer unit to minimize domain crossing penalties. However, for blends
11207 /// it falls back to the floating point shuffle operation with appropriate bit
11208 /// casting.
11209 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11210 const APInt &Zeroable,
11211 SDValue V1, SDValue V2,
11212 const X86Subtarget &Subtarget,
11213 SelectionDAG &DAG) {
11214 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11215 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
11216 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
11218 if (V2.isUndef()) {
11219 // Check for being able to broadcast a single element.
11220 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11221 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11222 return Broadcast;
11224 // Straight shuffle of a single input vector. For everything from SSE2
11225 // onward this has a single fast instruction with no scary immediates.
11226 // We have to map the mask as it is actually a v4i32 shuffle instruction.
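// Each v2i64 lane covers two adjacent v4i32 lanes, so a 64-bit mask element m
// expands to the pair {2*m, 2*m+1}; undef elements are clamped to lane 0 since
// any value is acceptable for them.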
11227 V1 = DAG.getBitcast(MVT::v4i32, V1);
11228 int WidenedMask[4] = {
11229 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
11230 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
11231 return DAG.getBitcast(
11232 MVT::v2i64,
11233 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11234 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
11235 }
11236 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
11237 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
11238 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
11239 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
11241 // Try to use shift instructions.
11242 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
11243 Zeroable, Subtarget, DAG))
11244 return Shift;
11246 // When loading a scalar and then shuffling it into a vector we can often do
11247 // the insertion cheaply.
11248 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11249 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
11250 return Insertion;
11251 // Try inverting the insertion since for v2 masks it is easy to do and we
11252 // can't reliably sort the mask one way or the other.
11253 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
11254 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
11255 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
11256 return Insertion;
11258 // We have different paths for blend lowering, but they all must use the
11259 // *exact* same predicate.
11260 bool IsBlendSupported = Subtarget.hasSSE41();
11261 if (IsBlendSupported)
11262 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
11263 Zeroable, Subtarget, DAG))
11264 return Blend;
11266 // Use dedicated unpack instructions for masks that match their pattern.
11267 if (SDValue V =
11268 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
11269 return V;
11271 // Try to use byte rotation instructions.
11272 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11273 if (Subtarget.hasSSSE3()) {
11274 if (Subtarget.hasVLX())
11275 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
11276 Mask, Subtarget, DAG))
11277 return Rotate;
11279 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11280 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
11281 return Rotate;
11282 }
11284 // If we have direct support for blends, we should lower by decomposing into
11285 // a permute. That will be faster than the domain cross.
11286 if (IsBlendSupported)
11287 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
11288 Mask, DAG);
11290 // We implement this with SHUFPD which is pretty lame because it will likely
11291 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
11292 // However, all the alternatives are still more cycles and newer chips don't
11293 // have this problem. It would be really nice if x86 had better shuffles here.
11294 V1 = DAG.getBitcast(MVT::v2f64, V1);
11295 V2 = DAG.getBitcast(MVT::v2f64, V2);
11296 return DAG.getBitcast(MVT::v2i64,
11297 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
11298 }
11300 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
11302 /// This is used to disable more specialized lowerings when the shufps lowering
11303 /// will happen to be efficient.
11304 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
11305 // This routine only handles 128-bit shufps.
11306 assert(Mask.size() == 4 && "Unsupported mask size!");
11307 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
11308 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
11309 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
11310 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
11312 // To lower with a single SHUFPS we need to have the low half and high half
11313 // each requiring a single input.
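// For example, <0, 1, 6, 7> draws its low half from one input and its high
// half from the other and is a single SHUFPS, whereas <0, 4, 2, 6> mixes both
// inputs within each half and is not.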
11314 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
11315 return false;
11316 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
11317 return false;
11319 return true;
11320 }
11322 /// \brief Lower a vector shuffle using the SHUFPS instruction.
11324 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
11325 /// It makes no assumptions about whether this is the *best* lowering, it simply
11326 /// uses it.
11327 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
11328 ArrayRef<int> Mask, SDValue V1,
11329 SDValue V2, SelectionDAG &DAG) {
11330 SDValue LowV = V1, HighV = V2;
11331 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
11333 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11335 if (NumV2Elements == 1) {
11336 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
11338 // Compute the index adjacent to V2Index and in the same half by toggling
11339 // the low bit.
11340 int V2AdjIndex = V2Index ^ 1;
11342 if (Mask[V2AdjIndex] < 0) {
11343 // Handles all the cases where we have a single V2 element and an undef.
11344 // This will only ever happen in the high lanes because we commute the
11345 // vector otherwise.
11346 if (V2Index < 2)
11347 std::swap(LowV, HighV);
11348 NewMask[V2Index] -= 4;
11349 } else {
11350 // Handle the case where the V2 element ends up adjacent to a V1 element.
11351 // To make this work, blend them together as the first step.
11352 int V1Index = V2AdjIndex;
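// This blend puts the required V2 element into lane 0 of the result and
// carries the adjacent V1 element along into lane 2; lanes 1 and 3 are
// don't-cares.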
11353 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
11354 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
11355 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11357 // Now proceed to reconstruct the final blend as we have the necessary
11358 // high or low half formed.
11359 if (V2Index < 2) {
11360 LowV = V2;
11361 HighV = V1;
11362 } else {
11363 HighV = V2;
11364 }
11365 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
11366 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
11367 }
11368 } else if (NumV2Elements == 2) {
11369 if (Mask[0] < 4 && Mask[1] < 4) {
11370 // Handle the easy case where we have V1 in the low lanes and V2 in the
11371 // high lanes.
11372 NewMask[2] -= 4;
11373 NewMask[3] -= 4;
11374 } else if (Mask[2] < 4 && Mask[3] < 4) {
11375 // We also handle the reversed case because this utility may get called
11376 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
11377 // arrange things in the right direction.
11378 NewMask[0] -= 4;
11379 NewMask[1] -= 4;
11380 HighV = V1;
11381 LowV = V2;
11382 } else {
11383 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
11384 // trying to place elements directly, just blend them and set up the final
11385 // shuffle to place them.
11387 // The first two blend mask elements are for V1, the second two are for
11388 // V2.
11389 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
11390 Mask[2] < 4 ? Mask[2] : Mask[3],
11391 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
11392 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
11393 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11394 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
11396 // Now we do a normal shuffle of V1 by giving V1 as both operands to
11397 // the shuffle.
11398 LowV = HighV = V1;
11399 NewMask[0] = Mask[0] < 4 ? 0 : 2;
11400 NewMask[1] = Mask[0] < 4 ? 2 : 0;
11401 NewMask[2] = Mask[2] < 4 ? 1 : 3;
11402 NewMask[3] = Mask[2] < 4 ? 3 : 1;
11403 }
11404 }
11405 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
11406 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
11407 }
11409 /// \brief Lower 4-lane 32-bit floating point shuffles.
11411 /// Uses instructions exclusively from the floating point unit to minimize
11412 /// domain crossing penalties, as these are sufficient to implement all v4f32
11413 /// shuffles.
11414 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11415 const APInt &Zeroable,
11416 SDValue V1, SDValue V2,
11417 const X86Subtarget &Subtarget,
11418 SelectionDAG &DAG) {
11419 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11420 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
11421 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11423 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11425 if (NumV2Elements == 0) {
11426 // Check for being able to broadcast a single element.
11427 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11428 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
11429 return Broadcast;
11431 // Use even/odd duplicate instructions for masks that match their pattern.
11432 if (Subtarget.hasSSE3()) {
11433 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
11434 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
11435 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11436 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11437 }
11439 if (Subtarget.hasAVX()) {
11440 // If we have AVX, we can use VPERMILPS which will allow folding a load
11441 // into the shuffle.
11442 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11443 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11444 }
11446 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11447 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11448 if (!Subtarget.hasSSE2()) {
11449 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11450 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11451 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11452 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11453 }
11455 // Otherwise, use a straight shuffle of a single input vector. We pass the
11456 // input vector to both operands to simulate this with a SHUFPS.
11457 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11458 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11459 }
11461 // There are special ways we can lower some single-element blends. However, we
11462 // have custom ways we can lower more complex single-element blends below that
11463 // we defer to if both this and BLENDPS fail to match, so restrict this to
11464 // when the V2 input is targeting element 0 of the mask -- that is the fast
11465 // case here.
11466 if (NumV2Elements == 1 && Mask[0] >= 4)
11467 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11468 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11469 return V;
11471 if (Subtarget.hasSSE41()) {
11472 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11473 Zeroable, Subtarget, DAG))
11474 return Blend;
11476 // Use INSERTPS if we can complete the shuffle efficiently.
11477 if (SDValue V =
11478 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11479 return V;
11481 if (!isSingleSHUFPSMask(Mask))
11482 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11483 DL, MVT::v4f32, V1, V2, Mask, DAG))
11484 return BlendPerm;
11485 }
11487 // Use low/high mov instructions. These are only valid in SSE1 because
11488 // otherwise they are widened to v2f64 and never get here.
11489 if (!Subtarget.hasSSE2()) {
11490 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11491 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11492 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11493 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11494 }
11496 // Use dedicated unpack instructions for masks that match their pattern.
11497 if (SDValue V =
11498 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11499 return V;
11501 // Otherwise fall back to a SHUFPS lowering strategy.
11502 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11503 }
11505 /// \brief Lower 4-lane i32 vector shuffles.
11507 /// We try to handle these with integer-domain shuffles where we can, but for
11508 /// blends we use the floating point domain blend instructions.
11509 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11510 const APInt &Zeroable,
11511 SDValue V1, SDValue V2,
11512 const X86Subtarget &Subtarget,
11513 SelectionDAG &DAG) {
11514 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11515 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11516 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11518 // Whenever we can lower this as a zext, that instruction is strictly faster
11519 // than any alternative. It also allows us to fold memory operands into the
11520 // shuffle in many cases.
11521 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11522 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11523 return ZExt;
11525 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11527 if (NumV2Elements == 0) {
11528 // Check for being able to broadcast a single element.
11529 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11530 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11531 return Broadcast;
11533 // Straight shuffle of a single input vector. For everything from SSE2
11534 // onward this has a single fast instruction with no scary immediates.
11535 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11536 // but we aren't actually going to use the UNPCK instruction because doing
11537 // so prevents folding a load into this instruction or making a copy.
11538 const int UnpackLoMask[] = {0, 0, 1, 1};
11539 const int UnpackHiMask[] = {2, 2, 3, 3};
11540 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11541 Mask = UnpackLoMask;
11542 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11543 Mask = UnpackHiMask;
11545 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11546 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11547 }
11549 // Try to use shift instructions.
11550 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11551 Zeroable, Subtarget, DAG))
11552 return Shift;
11554 // There are special ways we can lower some single-element blends.
11555 if (NumV2Elements == 1)
11556 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11557 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11558 return V;
11560 // We have different paths for blend lowering, but they all must use the
11561 // *exact* same predicate.
11562 bool IsBlendSupported = Subtarget.hasSSE41();
11563 if (IsBlendSupported)
11564 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11565 Zeroable, Subtarget, DAG))
11566 return Blend;
11568 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11569 Zeroable, DAG))
11570 return Masked;
11572 // Use dedicated unpack instructions for masks that match their pattern.
11573 if (SDValue V =
11574 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11575 return V;
11577 // Try to use byte rotation instructions.
11578 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11579 if (Subtarget.hasSSSE3()) {
11580 if (Subtarget.hasVLX())
11581 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11582 Mask, Subtarget, DAG))
11583 return Rotate;
11585 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11586 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11587 return Rotate;
11588 }
11590 // Assume that a single SHUFPS is faster than an alternative sequence of
11591 // multiple instructions (even if the CPU has a domain penalty).
11592 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11593 if (!isSingleSHUFPSMask(Mask)) {
11594 // If we have direct support for blends, we should lower by decomposing into
11595 // a permute. That will be faster than the domain cross.
11596 if (IsBlendSupported)
11597 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11598 Mask, DAG);
11600 // Try to lower by permuting the inputs into an unpack instruction.
11601 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11602 DL, MVT::v4i32, V1, V2, Mask, DAG))
11603 return Unpack;
11604 }
11606 // We implement this with SHUFPS because it can blend from two vectors.
11607 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11608 // up the inputs, bypassing domain shift penalties that we would incur if we
11609 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11610 // relevant.
11611 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11612 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11613 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11614 return DAG.getBitcast(MVT::v4i32, ShufPS);
11615 }
11617 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11618 /// shuffle lowering, and the most complex part.
11620 /// The lowering strategy is to try to form pairs of input lanes which are
11621 /// targeted at the same half of the final vector, and then use a dword shuffle
11622 /// to place them onto the right half, and finally unpack the paired lanes into
11623 /// their final position.
11625 /// The exact breakdown of how to form these dword pairs and align them on the
11626 /// correct sides is really tricky. See the comments within the function for
11627 /// more of the details.
11629 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11630 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11631 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11632 /// vector, form the analogous 128-bit 8-element Mask.
11633 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11634 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11635 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11636 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11637 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11639 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11640 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11641 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11643 // Attempt to directly match PSHUFLW or PSHUFHW.
11644 if (isUndefOrInRange(LoMask, 0, 4) &&
11645 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
11646 return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11647 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11648 }
11649 if (isUndefOrInRange(HiMask, 4, 8) &&
11650 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
11651 for (int i = 0; i != 4; ++i)
11652 HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4));
11653 return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11654 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11655 }
11657 SmallVector<int, 4> LoInputs;
11658 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11659 std::sort(LoInputs.begin(), LoInputs.end());
11660 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11661 SmallVector<int, 4> HiInputs;
11662 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11663 std::sort(HiInputs.begin(), HiInputs.end());
11664 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11665 int NumLToL =
11666 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11667 int NumHToL = LoInputs.size() - NumLToL;
11668 int NumLToH =
11669 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11670 int NumHToH = HiInputs.size() - NumLToH;
11671 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11672 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11673 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11674 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
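// The naming encodes source-half-to-destination-half: HToLInputs, for example,
// are elements living in the high half of the source that are needed in the
// low half of the result.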
11676 // If we are shuffling values from one half - check how many different DWORD
11677 // pairs we need to create. If only 1 or 2 then we can perform this as a
11678 // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below.
11679 auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask,
11680 ArrayRef<int> PSHUFDMask, unsigned ShufWOp) {
11681 V = DAG.getNode(ShufWOp, DL, VT, V,
11682 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11683 V = DAG.getBitcast(PSHUFDVT, V);
11684 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11685 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11686 return DAG.getBitcast(VT, V);
11687 };
11689 if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) {
11690 int PSHUFDMask[4] = { -1, -1, -1, -1 };
11691 SmallVector<std::pair<int, int>, 4> DWordPairs;
11692 int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2);
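// All inputs come from a single source half here: if it is the low half the
// pairs are built by PSHUFLW into dwords 0-1 (offset 0), otherwise PSHUFHW
// builds them into dwords 2-3 (offset 2).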
11694 // Collect the different DWORD pairs.
11695 for (int DWord = 0; DWord != 4; ++DWord) {
11696 int M0 = Mask[2 * DWord + 0];
11697 int M1 = Mask[2 * DWord + 1];
11698 M0 = (M0 >= 0 ? M0 % 4 : M0);
11699 M1 = (M1 >= 0 ? M1 % 4 : M1);
11700 if (M0 < 0 && M1 < 0)
11701 continue;
11703 bool Match = false;
11704 for (int j = 0, e = DWordPairs.size(); j < e; ++j) {
11705 auto &DWordPair = DWordPairs[j];
11706 if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) &&
11707 (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) {
11708 DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first);
11709 DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second);
11710 PSHUFDMask[DWord] = DOffset + j;
11711 Match = true;
11712 break;
11713 }
11714 }
11715 if (!Match) {
11716 PSHUFDMask[DWord] = DOffset + DWordPairs.size();
11717 DWordPairs.push_back(std::make_pair(M0, M1));
11718 }
11719 }
11721 if (DWordPairs.size() <= 2) {
11722 DWordPairs.resize(2, std::make_pair(-1, -1));
11723 int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
11724 DWordPairs[1].first, DWordPairs[1].second};
11725 if ((NumHToL + NumHToH) == 0)
11726 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
11727 if ((NumLToL + NumLToH) == 0)
11728 return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW);
11729 }
11730 }
11732 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11733 // such inputs we can swap two of the dwords across the half mark and end up
11734 // with <=2 inputs to each half in each half. Once there, we can fall through
11735 // to the generic code below. For example:
11737 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11738 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11740 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11741 // and an existing 2-into-2 on the other half. In this case we may have to
11742 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11743 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11744 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11745 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11746 // half than the one we target for fixing) will be fixed when we re-enter this
11747 // path. We will also combine away any sequence of PSHUFD instructions that
11748 // result into a single instruction. Here is an example of the tricky case:
11750 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11751 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11753 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11755 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11756 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11758 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11759 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11761 // The result is fine to be handled by the generic logic.
11762 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11763 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11764 int AOffset, int BOffset) {
11765 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11766 "Must call this with A having 3 or 1 inputs from the A half.");
11767 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11768 "Must call this with B having 1 or 3 inputs from the B half.");
11769 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11770 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11772 bool ThreeAInputs = AToAInputs.size() == 3;
11774 // Compute the index of dword with only one word among the three inputs in
11775 // a half by taking the sum of the half with three inputs and subtracting
11776 // the sum of the actual three inputs. The difference is the remaining
11777 // word.
11778 int ADWord, BDWord;
11779 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11780 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11781 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11782 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11783 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11784 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11785 int TripleNonInputIdx =
11786 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11787 TripleDWord = TripleNonInputIdx / 2;
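// For example, with an offset of 0 and the three inputs {0, 1, 3}, the sum
// 0+1+2+3 = 6 minus (0 + 1 + 3) identifies word 2 as the word that is not an
// input, so TripleDWord lands on dword 1.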
11789 // We use xor with one to compute the adjacent DWord to whichever one the
11790 // single input is in.
11791 OneInputDWord = (OneInput / 2) ^ 1;
11793 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11794 // and BToA inputs. If there is also such a problem with the BToB and AToB
11795 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11796 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11797 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11798 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11799 // Compute how many inputs will be flipped by swapping these DWords. We
11800 // need to balance this to ensure we don't form a 3-1 shuffle in the other
11801 // half.
11803 int NumFlippedAToBInputs =
11804 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11805 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11806 int NumFlippedBToBInputs =
11807 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11808 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11809 if ((NumFlippedAToBInputs == 1 &&
11810 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11811 (NumFlippedBToBInputs == 1 &&
11812 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11813 // We choose whether to fix the A half or B half based on whether that
11814 // half has zero flipped inputs. At zero, we may not be able to fix it
11815 // with that half. We also bias towards fixing the B half because that
11816 // will more commonly be the high half, and we have to bias one way.
11817 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11818 ArrayRef<int> Inputs) {
11819 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11820 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11821 // Determine whether the free index is in the flipped dword or the
11822 // unflipped dword based on where the pinned index is. We use this bit
11823 // in an xor to conditionally select the adjacent dword.
11824 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11825 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11826 if (IsFixIdxInput == IsFixFreeIdxInput)
11827 FixFreeIdx += 1;
11828 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11829 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11830 "We need to be changing the number of flipped inputs!");
11831 int PSHUFHalfMask[] = {0, 1, 2, 3};
11832 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11833 V = DAG.getNode(
11834 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11835 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11836 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11838 for (int &M : Mask)
11839 if (M >= 0 && M == FixIdx)
11840 M = FixFreeIdx;
11841 else if (M >= 0 && M == FixFreeIdx)
11842 M = FixIdx;
11843 };
11844 if (NumFlippedBToBInputs != 0) {
11845 int BPinnedIdx =
11846 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11847 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11848 } else {
11849 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11850 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11851 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11852 }
11853 }
11854 }
11856 int PSHUFDMask[] = {0, 1, 2, 3};
11857 PSHUFDMask[ADWord] = BDWord;
11858 PSHUFDMask[BDWord] = ADWord;
11859 V = DAG.getBitcast(
11860 VT,
11861 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11862 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11864 // Adjust the mask to match the new locations of A and B.
11865 for (int &M : Mask)
11866 if (M >= 0 && M/2 == ADWord)
11867 M = 2 * BDWord + M % 2;
11868 else if (M >= 0 && M/2 == BDWord)
11869 M = 2 * ADWord + M % 2;
11871 // Recurse back into this routine to re-compute state now that this isn't
11872 // a 3 and 1 problem.
11873 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11874 DAG);
11875 };
11876 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11877 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11878 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11879 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11881 // At this point there are at most two inputs to the low and high halves from
11882 // each half. That means the inputs can always be grouped into dwords and
11883 // those dwords can then be moved to the correct half with a dword shuffle.
11884 // We use at most one low and one high word shuffle to collect these paired
11885 // inputs into dwords, and finally a dword shuffle to place them.
11886 int PSHUFLMask[4] = {-1, -1, -1, -1};
11887 int PSHUFHMask[4] = {-1, -1, -1, -1};
11888 int PSHUFDMask[4] = {-1, -1, -1, -1};
11890 // First fix the masks for all the inputs that are staying in their
11891 // original halves. This will then dictate the targets of the cross-half
11892 // shuffles.
11893 auto fixInPlaceInputs =
11894 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11895 MutableArrayRef<int> SourceHalfMask,
11896 MutableArrayRef<int> HalfMask, int HalfOffset) {
11897 if (InPlaceInputs.empty())
11898 return;
11899 if (InPlaceInputs.size() == 1) {
11900 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11901 InPlaceInputs[0] - HalfOffset;
11902 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11903 return;
11904 }
11905 if (IncomingInputs.empty()) {
11906 // Just fix all of the in place inputs.
11907 for (int Input : InPlaceInputs) {
11908 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11909 PSHUFDMask[Input / 2] = Input / 2;
11910 }
11911 return;
11912 }
11914 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11915 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11916 InPlaceInputs[0] - HalfOffset;
11917 // Put the second input next to the first so that they are packed into
11918 // a dword. We find the adjacent index by toggling the low bit.
11919 int AdjIndex = InPlaceInputs[0] ^ 1;
11920 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11921 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11922 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11923 };
11924 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11925 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11927 // Now gather the cross-half inputs and place them into a free dword of
11928 // their target half.
11929 // FIXME: This operation could almost certainly be simplified dramatically to
11930 // look more like the 3-1 fixing operation.
11931 auto moveInputsToRightHalf = [&PSHUFDMask](
11932 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11933 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11934 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11935 int DestOffset) {
11936 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11937 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11938 };
11939 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11940 int Word) {
11941 int LowWord = Word & ~1;
11942 int HighWord = Word | 1;
11943 return isWordClobbered(SourceHalfMask, LowWord) ||
11944 isWordClobbered(SourceHalfMask, HighWord);
11945 };
11947 if (IncomingInputs.empty())
11948 return;
11950 if (ExistingInputs.empty()) {
11951 // Map any dwords with inputs from them into the right half.
11952 for (int Input : IncomingInputs) {
11953 // If the source half mask maps over the inputs, turn those into
11954 // swaps and use the swapped lane.
11955 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11956 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11957 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11958 Input - SourceOffset;
11959 // We have to swap the uses in our half mask in one sweep.
11960 for (int &M : HalfMask)
11961 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11962 M = Input;
11963 else if (M == Input)
11964 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11965 } else {
11966 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11967 Input - SourceOffset &&
11968 "Previous placement doesn't match!");
11969 }
11970 // Note that this correctly re-maps both when we do a swap and when
11971 // we observe the other side of the swap above. We rely on that to
11972 // avoid swapping the members of the input list directly.
11973 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11974 }
11976 // Map the input's dword into the correct half.
11977 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11978 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11979 else
11980 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11981 Input / 2 &&
11982 "Previous placement doesn't match!");
11983 }
11985 // And just directly shift any other-half mask elements to be same-half
11986 // as we will have mirrored the dword containing the element into the
11987 // same position within that half.
11988 for (int &M : HalfMask)
11989 if (M >= SourceOffset && M < SourceOffset + 4) {
11990 M = M - SourceOffset + DestOffset;
11991 assert(M >= 0 && "This should never wrap below zero!");
11992 }
11994 return;
11995 }
11996 // Ensure we have the input in a viable dword of its current half. This
11997 // is particularly tricky because the original position may be clobbered
11998 // by inputs being moved and *staying* in that half.
11999 if (IncomingInputs.size() == 1) {
12000 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12001 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
12002 SourceOffset;
12003 SourceHalfMask[InputFixed - SourceOffset] =
12004 IncomingInputs[0] - SourceOffset;
12005 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
12006 InputFixed);
12007 IncomingInputs[0] = InputFixed;
12008 }
12009 } else if (IncomingInputs.size() == 2) {
12010 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
12011 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
12012 // We have two non-adjacent or clobbered inputs we need to extract from
12013 // the source half. To do this, we need to map them into some adjacent
12014 // dword slot in the source mask.
12015 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
12016 IncomingInputs[1] - SourceOffset};
12018 // If there is a free slot in the source half mask adjacent to one of
12019 // the inputs, place the other input in it. We use (Index XOR 1) to
12020 // compute an adjacent index.
12021 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
12022 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
12023 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
12024 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12025 InputsFixed[1] = InputsFixed[0] ^ 1;
12026 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
12027 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
12028 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
12029 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
12030 InputsFixed[0] = InputsFixed[1] ^ 1;
12031 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
12032 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
12033 // The two inputs are in the same DWord but it is clobbered and the
12034 // adjacent DWord isn't used at all. Move both inputs to the free
12036 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
12037 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
12038 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
12039 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
12040 } else {
12041 // The only way we hit this point is if there is no clobbering
12042 // (because there are no off-half inputs to this half) and there is no
12043 // free slot adjacent to one of the inputs. In this case, we have to
12044 // swap an input with a non-input.
12045 for (int i = 0; i < 4; ++i)
12046 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
12047 "We can't handle any clobbers here!");
12048 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
12049 "Cannot have adjacent inputs here!");
12051 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
12052 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
12054 // We also have to update the final source mask in this case because
12055 // it may need to undo the above swap.
12056 for (int &M : FinalSourceHalfMask)
12057 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
12058 M = InputsFixed[1] + SourceOffset;
12059 else if (M == InputsFixed[1] + SourceOffset)
12060 M = (InputsFixed[0] ^ 1) + SourceOffset;
12062 InputsFixed[1] = InputsFixed[0] ^ 1;
12063 }
12065 // Point everything at the fixed inputs.
12066 for (int &M : HalfMask)
12067 if (M == IncomingInputs[0])
12068 M = InputsFixed[0] + SourceOffset;
12069 else if (M == IncomingInputs[1])
12070 M = InputsFixed[1] + SourceOffset;
12072 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
12073 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
12074 }
12075 } else {
12076 llvm_unreachable("Unhandled input size!");
12077 }
12079 // Now hoist the DWord down to the right half.
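// Pick the first dword of the destination half that PSHUFDMask has not
// already claimed.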
12080 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
12081 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
12082 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
12083 for (int &M : HalfMask)
12084 for (int Input : IncomingInputs)
12085 if (M == Input)
12086 M = FreeDWord * 2 + Input % 2;
12087 };
12088 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
12089 /*SourceOffset*/ 4, /*DestOffset*/ 0);
12090 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
12091 /*SourceOffset*/ 0, /*DestOffset*/ 4);
12093 // Now enact all the shuffles we've computed to move the inputs into their
12094 // target halves.
12095 if (!isNoopShuffleMask(PSHUFLMask))
12096 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12097 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
12098 if (!isNoopShuffleMask(PSHUFHMask))
12099 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12100 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
12101 if (!isNoopShuffleMask(PSHUFDMask))
12102 V = DAG.getBitcast(
12103 VT,
12104 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
12105 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12107 // At this point, each half should contain all its inputs, and we can then
12108 // just shuffle them into their final position.
12109 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
12110 "Failed to lift all the high half inputs to the low mask!");
12111 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
12112 "Failed to lift all the low half inputs to the high mask!");
12114 // Do a half shuffle for the low mask.
12115 if (!isNoopShuffleMask(LoMask))
12116 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
12117 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
12119 // Do a half shuffle with the high mask after shifting its values down.
12120 for (int &M : HiMask)
12121 if (M >= 4)
12122 M -= 4;
12123 if (!isNoopShuffleMask(HiMask))
12124 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
12125 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
12127 return V;
12128 }
12130 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
12131 /// blend if only one input is used.
12132 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
12133 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12134 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
12135 bool &V2InUse) {
12136 SDValue V1Mask[16];
12137 SDValue V2Mask[16];
12138 V1InUse = false;
12139 V2InUse = false;
12141 int Size = Mask.size();
12142 int Scale = 16 / Size;
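// Each element of the incoming mask covers Scale consecutive bytes of the
// v16i8 PSHUFB control vectors built below.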
12143 for (int i = 0; i < 16; ++i) {
12144 if (Mask[i / Scale] < 0) {
12145 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
12146 } else {
12147 const int ZeroMask = 0x80;
12148 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
12150 int V2Idx = Mask[i / Scale] < Size
12151 ? ZeroMask
12152 : (Mask[i / Scale] - Size) * Scale + i % Scale;
12153 if (Zeroable[i / Scale])
12154 V1Idx = V2Idx = ZeroMask;
12155 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
12156 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
12157 V1InUse |= (ZeroMask != V1Idx);
12158 V2InUse |= (ZeroMask != V2Idx);
12159 }
12160 }
12162 if (V1InUse)
12163 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12164 DAG.getBitcast(MVT::v16i8, V1),
12165 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
12166 if (V2InUse)
12167 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
12168 DAG.getBitcast(MVT::v16i8, V2),
12169 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
12171 // If we need shuffled inputs from both, blend the two.
12172 SDValue V;
12173 if (V1InUse && V2InUse)
12174 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
12175 else
12176 V = V1InUse ? V1 : V2;
12178 // Cast the result back to the correct type.
12179 return DAG.getBitcast(VT, V);
12180 }
12182 /// \brief Generic lowering of 8-lane i16 shuffles.
12184 /// This handles both single-input shuffles and combined shuffle/blends with
12185 /// two inputs. The single input shuffles are immediately delegated to
12186 /// a dedicated lowering routine.
12188 /// The blends are lowered in one of three fundamental ways. If there are few
12189 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
12190 /// of the input is significantly cheaper when lowered as an interleaving of
12191 /// the two inputs, try to interleave them. Otherwise, blend the low and high
12192 /// halves of the inputs separately (making them have relatively few inputs)
12193 /// and then concatenate them.
12194 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12195 const APInt &Zeroable,
12196 SDValue V1, SDValue V2,
12197 const X86Subtarget &Subtarget,
12198 SelectionDAG &DAG) {
12199 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12200 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
12201 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12203 // Whenever we can lower this as a zext, that instruction is strictly faster
12204 // than any alternative.
12205 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12206 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12207 return ZExt;
12209 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
12211 if (NumV2Inputs == 0) {
12212 // Check for being able to broadcast a single element.
12213 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12214 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12215 return Broadcast;
12217 // Try to use shift instructions.
12218 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
12219 Zeroable, Subtarget, DAG))
12220 return Shift;
12222 // Use dedicated unpack instructions for masks that match their pattern.
12223 if (SDValue V =
12224 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12225 return V;
12227 // Use dedicated pack instructions for masks that match their pattern.
12228 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
12229 DAG, Subtarget))
12230 return V;
12232 // Try to use byte rotation instructions.
12233 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
12234 Mask, Subtarget, DAG))
12235 return Rotate;
12237 // Make a copy of the mask so it can be modified.
12238 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
12239 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
12240 MutableMask, Subtarget,
12241 DAG);
12242 }
12244 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
12245 "All single-input shuffles should be canonicalized to be V1-input "
12246 "shuffles.");
12248 // Try to use shift instructions.
12249 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
12250 Zeroable, Subtarget, DAG))
12251 return Shift;
12253 // See if we can use SSE4A Extraction / Insertion.
12254 if (Subtarget.hasSSE4A())
12255 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
12256 Zeroable, DAG))
12257 return V;
12259 // There are special ways we can lower some single-element blends.
12260 if (NumV2Inputs == 1)
12261 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12262 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12263 return V;
12265 // We have different paths for blend lowering, but they all must use the
12266 // *exact* same predicate.
12267 bool IsBlendSupported = Subtarget.hasSSE41();
12268 if (IsBlendSupported)
12269 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
12270 Zeroable, Subtarget, DAG))
12271 return Blend;
12273 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
12274 Zeroable, DAG))
12275 return Masked;
12277 // Use dedicated unpack instructions for masks that match their pattern.
12278 if (SDValue V =
12279 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
12280 return V;
12282 // Use dedicated pack instructions for masks that match their pattern.
12283 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
12284 Subtarget))
12285 return V;
12287 // Try to use byte rotation instructions.
12288 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12289 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
12290 return Rotate;
12292 if (SDValue BitBlend =
12293 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
12294 return BitBlend;
12296 // Try to lower by permuting the inputs into an unpack instruction.
12297 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
12298 V2, Mask, DAG))
12299 return Unpack;
12301 // If we can't directly blend but can use PSHUFB, that will be better as it
12302 // can both shuffle and set up the inefficient blend.
12303 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
12304 bool V1InUse, V2InUse;
12305 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
12306 Zeroable, DAG, V1InUse, V2InUse);
12307 }
12309 // We can always bit-blend if we have to so the fallback strategy is to
12310 // decompose into single-input permutes and blends.
12311 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
12312 Mask, DAG);
12313 }
12315 /// \brief Check whether a compaction lowering can be done by dropping even
12316 /// elements and compute how many times even elements must be dropped.
12318 /// This handles shuffles which take every Nth element where N is a power of
12319 /// two. Example shuffle masks:
12321 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
12322 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
12323 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
12324 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
12325 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
12326 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
12328 /// Any of these lanes can of course be undef.
12330 /// This routine only supports N <= 3.
12331 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
12332 /// for improvement.
12334 /// \returns N above, or the number of times even elements must be dropped if
12335 /// there is such a number. Otherwise returns zero.
12336 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
12337 bool IsSingleInput) {
12338 // The modulus for the shuffle vector entries is based on whether this is
12339 // a single input or not.
12340 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
12341 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
12342 "We should only be called with masks with a power-of-2 size!");
12344 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
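// For a 16-element two-input shuffle, for example, ShuffleModulus is 32, so a
// mask element at position i must equal (i << N) & 31 to stay viable for the
// 2^N stride.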
12346 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
12347 // and 2^3 simultaneously. This is because we may have ambiguity with
12348 // partially undef inputs.
12349 bool ViableForN[3] = {true, true, true};
12351 for (int i = 0, e = Mask.size(); i < e; ++i) {
12352 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
12353 // want.
12354 if (Mask[i] < 0)
12355 continue;
12357 bool IsAnyViable = false;
12358 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12359 if (ViableForN[j]) {
12360 uint64_t N = j + 1;
12362 // The shuffle mask must be equal to (i * 2^N) % M.
12363 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
12364 IsAnyViable = true;
12365 else
12366 ViableForN[j] = false;
12367 }
12368 // Early exit if we exhaust the possible powers of two.
12369 if (!IsAnyViable)
12370 break;
12371 }
12373 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
12374 if (ViableForN[j])
12375 return j + 1;
12377 // Return 0 as there is no viable power of two.
12378 return 0;
12379 }
12381 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12382 ArrayRef<int> Mask, SDValue V1,
12383 SDValue V2, SelectionDAG &DAG) {
12384 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12385 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12387 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12388 if (V2.isUndef())
12389 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12391 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12392 }
12394 /// \brief Generic lowering of v16i8 shuffles.
12396 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
12397 /// detect any complexity reducing interleaving. If that doesn't help, it uses
12398 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
12399 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
12400 /// back together.
12401 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12402 const APInt &Zeroable,
12403 SDValue V1, SDValue V2,
12404 const X86Subtarget &Subtarget,
12405 SelectionDAG &DAG) {
12406 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12407 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
12408 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12410 // Try to use shift instructions.
12411 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
12412 Zeroable, Subtarget, DAG))
12413 return Shift;
12415 // Try to use byte rotation instructions.
12416 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12417 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12418 return Rotate;
12420 // Use dedicated pack instructions for masks that match their pattern.
12421 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
12422 Subtarget))
12423 return V;
12425 // Try to use a zext lowering.
12426 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12427 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12428 return ZExt;
12430 // See if we can use SSE4A Extraction / Insertion.
12431 if (Subtarget.hasSSE4A())
12432 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
12433 Zeroable, DAG))
12434 return V;
12436 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
12438 // For single-input shuffles, there are some nicer lowering tricks we can use.
12439 if (NumV2Elements == 0) {
12440 // Check for being able to broadcast a single element.
12441 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12442 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
12445 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
12446 // Notably, this handles splat and partial-splat shuffles more efficiently.
12447 // However, it only makes sense if the pre-duplication shuffle simplifies
12448 // things significantly. Currently, this means we need to be able to
12449 // express the pre-duplication shuffle as an i16 shuffle.
12451 // FIXME: We should check for other patterns which can be widened into an
12452 // i16 shuffle as well.
12453 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
12454 for (int i = 0; i < 16; i += 2)
12455 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
12456 return false;
12457 return true;
12458 };
12460 auto tryToWidenViaDuplication = [&]() -> SDValue {
12461 if (!canWidenViaDuplication(Mask))
12462 return SDValue();
12463 SmallVector<int, 4> LoInputs;
12464 copy_if(Mask, std::back_inserter(LoInputs),
12465 [](int M) { return M >= 0 && M < 8; });
12466 std::sort(LoInputs.begin(), LoInputs.end());
12467 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
12468 LoInputs.end());
12469 SmallVector<int, 4> HiInputs;
12470 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
12471 std::sort(HiInputs.begin(), HiInputs.end());
12472 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
12473 HiInputs.end());
12475 bool TargetLo = LoInputs.size() >= HiInputs.size();
12476 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
12477 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
12479 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
12480 SmallDenseMap<int, int, 8> LaneMap;
12481 for (int I : InPlaceInputs) {
12482 PreDupI16Shuffle[I/2] = I/2;
12483 LaneMap[I] = I;
12484 }
12485 int j = TargetLo ? 0 : 4, je = j + 4;
12486 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
12487 // Check if j is already a shuffle of this input. This happens when
12488 // there are two adjacent bytes after we move the low one.
12489 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
12490 // If we haven't yet mapped the input, search for a slot into which
12491 // we can map it.
12492 while (j < je && PreDupI16Shuffle[j] >= 0)
12493 ++j;
12495 if (j == je)
12496 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
12497 return SDValue();
12499 // Map this input with the i16 shuffle.
12500 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12501 }
12503 // Update the lane map based on the mapping we ended up with.
12504 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12505 }
12506 V1 = DAG.getBitcast(
12507 MVT::v16i8,
12508 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12509 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12511 // Unpack the bytes to form the i16s that will be shuffled into place.
12512 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12513 MVT::v16i8, V1, V1);
12515 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12516 for (int i = 0; i < 16; ++i)
12517 if (Mask[i] >= 0) {
12518 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12519 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12520 if (PostDupI16Shuffle[i / 2] < 0)
12521 PostDupI16Shuffle[i / 2] = MappedMask;
12522 else
12523 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12524 "Conflicting entries in the original shuffle!");
12525 }
12526 return DAG.getBitcast(
12527 MVT::v16i8,
12528 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12529 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12530 };
12531 if (SDValue V = tryToWidenViaDuplication())
12532 return V;
12533 }
12535 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12536 Zeroable, DAG))
12537 return Masked;
12539 // Use dedicated unpack instructions for masks that match their pattern.
12540 if (SDValue V =
12541 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12542 return V;
12544 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12545 // with PSHUFB. It is important to do this before we attempt to generate any
12546 // blends but after all of the single-input lowerings. If the single input
12547 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12548 // want to preserve that and we can DAG combine any longer sequences into
12549 // a PSHUFB in the end. But once we start blending from multiple inputs,
12550 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12551 // and there are *very* few patterns that would actually be faster than the
12552 // PSHUFB approach because of its ability to zero lanes.
12554 // FIXME: The only exceptions to the above are blends which are exact
12555 // interleavings with direct instructions supporting them. We currently don't
12556 // handle those well here.
12557 if (Subtarget.hasSSSE3()) {
12558 bool V1InUse = false;
12559 bool V2InUse = false;
12561 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12562 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12564 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12565 // do so. This avoids using them to handle blends-with-zero which is
12566 // important as a single pshufb is significantly faster for that.
12567 if (V1InUse && V2InUse) {
12568 if (Subtarget.hasSSE41())
12569 if (SDValue Blend = lowerVectorShuffleAsBlend(
12570 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12571 return Blend;
12573 // We can use an unpack to do the blending rather than an or in some
12574 // cases. Even though the or may be (very minorly) more efficient, we
12575 // prefer this lowering because there are common cases where part of
12576 // the complexity of the shuffles goes away when we do the final blend as
12577 // an unpack.
12578 // FIXME: It might be worth trying to detect if the unpack-feeding
12579 // shuffles will both be pshufb, in which case we shouldn't bother with
12580 // this.
12581 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12582 DL, MVT::v16i8, V1, V2, Mask, DAG))
12583 return Unpack;
12585 // If we have VBMI we can use one VPERM instead of multiple PSHUFBs.
12586 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
12587 return lowerVectorShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, DAG);
12588 }
12590 return PSHUFB;
12591 }
12593 // There are special ways we can lower some single-element blends.
12594 if (NumV2Elements == 1)
12595 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12596 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12597 return V;
12599 if (SDValue BitBlend =
12600 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12601 return BitBlend;
12603 // Check whether a compaction lowering can be done. This handles shuffles
12604 // which take every Nth element for some even N. See the helper function for
12605 // details.
12606 //
12607 // We special case these as they can be particularly efficiently handled with
12608 // the PACKUSWB instruction on x86 and they show up in common patterns of
12609 // rearranging bytes to truncate wide elements.
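// For example, the two-input mask <0, 2, 4, ..., 30> has NumEvenDrops == 1:
// each input is ANDed with a v8i16 splat of 0x00FF to clear the dropped
// bytes, and a single PACKUS of V1 and V2 then produces the truncated result.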
12610 bool IsSingleInput = V2.isUndef();
12611 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12612 // NumEvenDrops is the power of two stride of the elements. Another way of
12613 // thinking about it is that we need to drop the even elements this many
12614 // times to get the original input.
12616 // First we need to zero all the dropped bytes.
12617 assert(NumEvenDrops <= 3 &&
12618 "No support for dropping even elements more than 3 times.");
12619 // We use the mask type to pick which bytes are preserved based on how many
12620 // elements are dropped.
12621 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12622 SDValue ByteClearMask = DAG.getBitcast(
12623 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12624 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12625 if (!IsSingleInput)
12626 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12628 // Now pack things back together.
12629 V1 = DAG.getBitcast(MVT::v8i16, V1);
12630 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12631 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12632 for (int i = 1; i < NumEvenDrops; ++i) {
12633 Result = DAG.getBitcast(MVT::v8i16, Result);
12634 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12635 }
12637 return Result;
12638 }
12640 // Handle multi-input cases by blending single-input shuffles.
12641 if (NumV2Elements > 0)
12642 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12643 Mask, DAG);
12645 // The fallback path for single-input shuffles widens this into two v8i16
12646 // vectors with unpacks, shuffles those, and then pulls them back together
12647 // with a pack.
12649 SDValue V = V1;
12650 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12651 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12652 for (int i = 0; i < 16; ++i)
12653 if (Mask[i] >= 0)
12654 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12656 SDValue VLoHalf, VHiHalf;
12657 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12658 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12660 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12661 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12662 // Use a mask to drop the high bytes.
12663 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12664 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12665 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12667 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12668 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12670 // Squash the masks to point directly into VLoHalf.
12671 for (int &M : LoBlendMask)
12672 if (M >= 0)
12673 M /= 2;
12674 for (int &M : HiBlendMask)
12675 if (M >= 0)
12676 M /= 2;
12677 } else {
12678 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12679 // VHiHalf so that we can blend them as i16s.
12680 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12682 VLoHalf = DAG.getBitcast(
12683 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12684 VHiHalf = DAG.getBitcast(
12685 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12686 }
12688 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12689 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12691 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12694 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12696 /// This routine breaks down the specific type of 128-bit shuffle and
12697 /// dispatches to the lowering routines accordingly.
12698 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12699 MVT VT, SDValue V1, SDValue V2,
12700 const APInt &Zeroable,
12701 const X86Subtarget &Subtarget,
12702 SelectionDAG &DAG) {
12703 switch (VT.SimpleTy) {
12704 case MVT::v2i64:
12705 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12706 case MVT::v2f64:
12707 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12708 case MVT::v4i32:
12709 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12710 case MVT::v4f32:
12711 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12712 case MVT::v8i16:
12713 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12714 case MVT::v16i8:
12715 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12717 default:
12718 llvm_unreachable("Unimplemented!");
12719 }
12720 }
12722 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
12724 /// This routine just extracts two subvectors, shuffles them independently, and
12725 /// then concatenates them back together. This should work effectively with all
12726 /// AVX vector shuffle types.
12727 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12728 SDValue V2, ArrayRef<int> Mask,
12729 SelectionDAG &DAG) {
12730 assert(VT.getSizeInBits() >= 256 &&
12731 "Only for 256-bit or wider vector shuffles!");
12732 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12733 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12735 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12736 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12738 int NumElements = VT.getVectorNumElements();
12739 int SplitNumElements = NumElements / 2;
12740 MVT ScalarVT = VT.getVectorElementType();
12741 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12743 // Rather than splitting build-vectors, just build two narrower build
12744 // vectors. This helps shuffling with splats and zeros.
12745 auto SplitVector = [&](SDValue V) {
12746 V = peekThroughBitcasts(V);
12748 MVT OrigVT = V.getSimpleValueType();
12749 int OrigNumElements = OrigVT.getVectorNumElements();
12750 int OrigSplitNumElements = OrigNumElements / 2;
12751 MVT OrigScalarVT = OrigVT.getVectorElementType();
12752 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12754 SDValue LoV, HiV;
12756 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12757 if (!BV) {
12758 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12759 DAG.getIntPtrConstant(0, DL));
12760 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12761 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12762 } else {
12764 SmallVector<SDValue, 16> LoOps, HiOps;
12765 for (int i = 0; i < OrigSplitNumElements; ++i) {
12766 LoOps.push_back(BV->getOperand(i));
12767 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12768 }
12769 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12770 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12771 }
12772 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12773 DAG.getBitcast(SplitVT, HiV));
12774 };
12776 SDValue LoV1, HiV1, LoV2, HiV2;
12777 std::tie(LoV1, HiV1) = SplitVector(V1);
12778 std::tie(LoV2, HiV2) = SplitVector(V2);
12780 // Now create two 4-way blends of these half-width vectors.
12781 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12782 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12783 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12784 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12785 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12786 for (int i = 0; i < SplitNumElements; ++i) {
12787 int M = HalfMask[i];
12788 if (M >= NumElements) {
12789 if (M >= NumElements + SplitNumElements)
12790 UseHiV2 = true;
12791 else
12792 UseLoV2 = true;
12793 V2BlendMask[i] = M - NumElements;
12794 BlendMask[i] = SplitNumElements + i;
12795 } else if (M >= 0) {
12796 if (M >= SplitNumElements)
12797 UseHiV1 = true;
12798 else
12799 UseLoV1 = true;
12800 V1BlendMask[i] = M;
12801 BlendMask[i] = i;
12802 }
12803 }
12805 // Because the lowering happens after all combining takes place, we need to
12806 // manually combine these blend masks as much as possible so that we create
12807 // a minimal number of high-level vector shuffle nodes.
12809 // First try just blending the halves of V1 or V2.
12810 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12811 return DAG.getUNDEF(SplitVT);
12812 if (!UseLoV2 && !UseHiV2)
12813 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12814 if (!UseLoV1 && !UseHiV1)
12815 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12817 SDValue V1Blend, V2Blend;
12818 if (UseLoV1 && UseHiV1) {
12819 V1Blend =
12820 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12821 } else {
12822 // We only use half of V1 so map the usage down into the final blend mask.
12823 V1Blend = UseLoV1 ? LoV1 : HiV1;
12824 for (int i = 0; i < SplitNumElements; ++i)
12825 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12826 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12827 }
12828 if (UseLoV2 && UseHiV2) {
12829 V2Blend =
12830 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12831 } else {
12832 // We only use half of V2 so map the usage down into the final blend mask.
12833 V2Blend = UseLoV2 ? LoV2 : HiV2;
12834 for (int i = 0; i < SplitNumElements; ++i)
12835 if (BlendMask[i] >= SplitNumElements)
12836 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12837 }
12838 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12839 };
12840 SDValue Lo = HalfBlend(LoMask);
12841 SDValue Hi = HalfBlend(HiMask);
12842 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12845 /// \brief Either split a vector in halves or decompose the shuffles and the
12846 /// blend.
12848 /// This is provided as a good fallback for many lowerings of non-single-input
12849 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12850 /// between splitting the shuffle into 128-bit components and stitching those
12851 /// back together vs. extracting the single-input shuffles and blending those
12852 /// results.
12853 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12854 SDValue V1, SDValue V2,
12855 ArrayRef<int> Mask,
12856 SelectionDAG &DAG) {
12857 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12858 "shuffles as it could then recurse on itself.");
12859 int Size = Mask.size();
12861 // If this can be modeled as a broadcast of two elements followed by a blend,
12862 // prefer that lowering. This is especially important because broadcasts can
12863 // often fold with memory operands.
12864 auto DoBothBroadcast = [&] {
12865 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12866 for (int M : Mask)
12867 if (M >= Size) {
12868 if (V2BroadcastIdx < 0)
12869 V2BroadcastIdx = M - Size;
12870 else if (M - Size != V2BroadcastIdx)
12871 return false;
12872 } else if (M >= 0) {
12873 if (V1BroadcastIdx < 0)
12874 V1BroadcastIdx = M;
12875 else if (M != V1BroadcastIdx)
12876 return false;
12877 }
12878 return true;
12879 };
12880 if (DoBothBroadcast())
12881 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12882 DAG);
12884 // If the inputs all stem from a single 128-bit lane of each input, then we
12885 // split them rather than blending because the split will decompose to
12886 // unusually few instructions.
12887 int LaneCount = VT.getSizeInBits() / 128;
12888 int LaneSize = Size / LaneCount;
12889 SmallBitVector LaneInputs[2];
12890 LaneInputs[0].resize(LaneCount, false);
12891 LaneInputs[1].resize(LaneCount, false);
12892 for (int i = 0; i < Size; ++i)
12893 if (Mask[i] >= 0)
12894 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12895 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12896 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12898 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12899 // that the decomposed single-input shuffles don't end up here.
12900 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12903 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12904 /// a permutation and blend of those lanes.
12906 /// This essentially blends the out-of-lane inputs to each lane into the lane
12907 /// from a permuted copy of the vector. This lowering strategy results in four
12908 /// instructions in the worst case for a single-input cross lane shuffle which
12909 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12910 /// of. Special cases for each particular shuffle pattern should be handled
12911 /// prior to trying this lowering.
12912 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12913 SDValue V1, SDValue V2,
12914 ArrayRef<int> Mask,
12915 SelectionDAG &DAG,
12916 const X86Subtarget &Subtarget) {
12917 // FIXME: This should probably be generalized for 512-bit vectors as well.
12918 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12919 int Size = Mask.size();
12920 int LaneSize = Size / 2;
12922 // If there are only inputs from one 128-bit lane, splitting will in fact be
12923 // less expensive. The flags track whether the given lane contains an element
12924 // that crosses to another lane.
12925 if (!Subtarget.hasAVX2()) {
12926 bool LaneCrossing[2] = {false, false};
12927 for (int i = 0; i < Size; ++i)
12928 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12929 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12930 if (!LaneCrossing[0] || !LaneCrossing[1])
12931 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12933 bool LaneUsed[2] = {false, false};
12934 for (int i = 0; i < Size; ++i)
12935 if (Mask[i] >= 0)
12936 LaneUsed[(Mask[i] / LaneSize)] = true;
12937 if (!LaneUsed[0] || !LaneUsed[1])
12938 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12939 }
12941 assert(V2.isUndef() &&
12942 "This last part of this routine only works on single input shuffles");
12944 SmallVector<int, 32> FlippedBlendMask(Size);
12945 for (int i = 0; i < Size; ++i)
12946 FlippedBlendMask[i] =
12947 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12948 ? Mask[i]
12949 : Mask[i] % LaneSize +
12950 (i / LaneSize) * LaneSize + Size);
12952 // Flip the vector, and blend the results which should now be in-lane.
12953 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12954 SDValue Flipped = DAG.getBitcast(PVT, V1);
12955 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12956 { 2, 3, 0, 1 });
12957 Flipped = DAG.getBitcast(VT, Flipped);
12958 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12961 /// \brief Handle lowering 2-lane 128-bit shuffles.
12962 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12963 SDValue V2, ArrayRef<int> Mask,
12964 const APInt &Zeroable,
12965 const X86Subtarget &Subtarget,
12966 SelectionDAG &DAG) {
12967 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12968 if (Subtarget.hasAVX2() && V2.isUndef())
12969 return SDValue();
12971 SmallVector<int, 4> WidenedMask;
12972 if (!canWidenShuffleElements(Mask, WidenedMask))
12973 return SDValue();
12975 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12976 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12978 // Try to use an insert into a zero vector.
12979 if (WidenedMask[0] == 0 && IsHighZero) {
12980 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12981 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12982 DAG.getIntPtrConstant(0, DL));
12983 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
12984 getZeroVector(VT, Subtarget, DAG, DL), LoV,
12985 DAG.getIntPtrConstant(0, DL));
12988 // TODO: If minimizing size and one of the inputs is a zero vector and the
12989 // zero vector has only one use, we could use a VPERM2X128 to save the
12990 // instruction bytes needed to explicitly generate the zero vector.
12992 // Blends are faster and handle all the non-lane-crossing cases.
12993 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12994 Zeroable, Subtarget, DAG))
12995 return Blend;
12997 // If either input operand is a zero vector, use VPERM2X128 because its mask
12998 // allows us to replace the zero input with an implicit zero.
12999 if (!IsLowZero && !IsHighZero) {
13000 // Check for patterns which can be matched with a single insert of a 128-bit
13002 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
13003 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
13005 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
13006 // this will likely become vinsertf128 which can't fold a 256-bit memop.
13007 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
13008 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13009 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13010 OnlyUsesV1 ? V1 : V2,
13011 DAG.getIntPtrConstant(0, DL));
13012 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
13013 DAG.getIntPtrConstant(2, DL));
13014 }
13015 }
13017 // Try to use SHUF128 if possible.
13018 if (Subtarget.hasVLX()) {
13019 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
13020 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
13021 ((WidenedMask[1] % 2) << 1);
13022 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
13023 DAG.getConstant(PermMask, DL, MVT::i8));
13024 }
13025 }
13026 }
13028 // Otherwise form a 128-bit permutation. After accounting for undefs,
13029 // convert the 64-bit shuffle mask selection values into 128-bit
13030 // selection bits by dividing the indexes by 2 and shifting into positions
13031 // defined by a vperm2*128 instruction's immediate control byte.
13033 // The immediate permute control byte looks like this:
13034 // [1:0] - select 128 bits from sources for low half of destination
13035 // [2] - ignore
13036 // [3] - zero low half of destination
13037 // [5:4] - select 128 bits from sources for high half of destination
13038 // [6] - ignore
13039 // [7] - zero high half of destination
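// For example, a widened mask <1, 2> selects the upper half of V1 and the
// lower half of V2, giving PermMask = (1 << 0) | (2 << 4) = 0x21; a zeroable
// high half would instead set bit 7 via the 0x80 term below.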
13041 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
13043 unsigned PermMask = 0;
13044 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
13045 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
13047 // Check the immediate mask and replace unused sources with undef.
13048 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
13049 V1 = DAG.getUNDEF(VT);
13050 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
13051 V2 = DAG.getUNDEF(VT);
13053 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
13054 DAG.getConstant(PermMask, DL, MVT::i8));
13057 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
13058 /// shuffling each lane.
13060 /// This will only succeed when the result of fixing the 128-bit lanes results
13061 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
13062 /// each 128-bit lane. This handles many cases where we can quickly blend away
13063 /// the lane crosses early and then use simpler shuffles within each lane.
13065 /// FIXME: It might be worthwhile at some point to support this without
13066 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
13067 /// in x86 only floating point has interesting non-repeating shuffles, and even
13068 /// those are still *marginally* more expensive.
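/// For example, a v8f32 mask <0, 1, 2, 3, 12, 13, 14, 15> selects the low
/// 128-bit lane of V1 and the high lane of V2 with a single v4f64 lane shuffle
/// <0, 1, 6, 7>, after which the remaining in-lane shuffle is the identity.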
13069 static SDValue lowerVectorShuffleByMerging128BitLanes(
13070 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13071 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13072 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
13074 int Size = Mask.size();
13075 int LaneSize = 128 / VT.getScalarSizeInBits();
13076 int NumLanes = Size / LaneSize;
13077 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
13079 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
13080 // check whether the in-128-bit lane shuffles share a repeating pattern.
13081 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
13082 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
13083 for (int i = 0; i < Size; ++i) {
13084 if (Mask[i] < 0)
13085 continue;
13087 int j = i / LaneSize;
13089 if (Lanes[j] < 0) {
13090 // First entry we've seen for this lane.
13091 Lanes[j] = Mask[i] / LaneSize;
13092 } else if (Lanes[j] != Mask[i] / LaneSize) {
13093 // This doesn't match the lane selected previously!
13094 return SDValue();
13095 }
13097 // Check that within each lane we have a consistent shuffle mask.
13098 int k = i % LaneSize;
13099 if (InLaneMask[k] < 0) {
13100 InLaneMask[k] = Mask[i] % LaneSize;
13101 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
13102 // This doesn't fit a repeating in-lane mask.
13103 return SDValue();
13104 }
13105 }
13107 // First shuffle the lanes into place.
13108 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
13109 VT.getSizeInBits() / 64);
13110 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
13111 for (int i = 0; i < NumLanes; ++i)
13112 if (Lanes[i] >= 0) {
13113 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
13114 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
13115 }
13117 V1 = DAG.getBitcast(LaneVT, V1);
13118 V2 = DAG.getBitcast(LaneVT, V2);
13119 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
13121 // Cast it back to the type we actually want.
13122 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
13124 // Now do a simple shuffle that isn't lane crossing.
13125 SmallVector<int, 8> NewMask((unsigned)Size, -1);
13126 for (int i = 0; i < Size; ++i)
13127 if (Mask[i] >= 0)
13128 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
13129 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
13130 "Must not introduce lane crosses at this point!");
13132 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
13135 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
13136 /// This allows for fast cases such as subvector extraction/insertion
13137 /// or shuffling smaller vector types which can lower more efficiently.
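/// For example, the v8f32 mask <2, 3, 10, 11, u, u, u, u> (upper half undef)
/// only touches the low halves of V1 and V2, so it can be lowered as a v4f32
/// shuffle of those two halves followed by a subvector insertion.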
13138 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
13139 SDValue V1, SDValue V2,
13140 ArrayRef<int> Mask,
13141 const X86Subtarget &Subtarget,
13142 SelectionDAG &DAG) {
13143 assert((VT.is256BitVector() || VT.is512BitVector()) &&
13144 "Expected 256-bit or 512-bit vector");
13146 unsigned NumElts = VT.getVectorNumElements();
13147 unsigned HalfNumElts = NumElts / 2;
13148 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
13150 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
13151 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
13152 if (!UndefLower && !UndefUpper)
13155 // Upper half is undef and lower half is whole upper subvector.
13156 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
13157 if (UndefUpper &&
13158 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
13159 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13160 DAG.getIntPtrConstant(HalfNumElts, DL));
13161 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13162 DAG.getIntPtrConstant(0, DL));
13165 // Lower half is undef and upper half is whole lower subvector.
13166 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
13167 if (UndefLower &&
13168 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
13169 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
13170 DAG.getIntPtrConstant(0, DL));
13171 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
13172 DAG.getIntPtrConstant(HalfNumElts, DL));
13175 // If the shuffle only uses two of the four halves of the input operands,
13176 // then extract them and perform the 'half' shuffle at half width.
13177 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
13178 int HalfIdx1 = -1, HalfIdx2 = -1;
13179 SmallVector<int, 8> HalfMask(HalfNumElts);
13180 unsigned Offset = UndefLower ? HalfNumElts : 0;
13181 for (unsigned i = 0; i != HalfNumElts; ++i) {
13182 int M = Mask[i + Offset];
13183 if (M < 0) {
13184 HalfMask[i] = M;
13185 continue;
13186 }
13188 // Determine which of the 4 half vectors this element is from.
13189 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
13190 int HalfIdx = M / HalfNumElts;
13192 // Determine the element index into its half vector source.
13193 int HalfElt = M % HalfNumElts;
13195 // We can shuffle with up to 2 half vectors, set the new 'half'
13196 // shuffle mask accordingly.
13197 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
13198 HalfMask[i] = HalfElt;
13199 HalfIdx1 = HalfIdx;
13200 continue;
13201 }
13202 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
13203 HalfMask[i] = HalfElt + HalfNumElts;
13204 HalfIdx2 = HalfIdx;
13205 continue;
13206 }
13208 // Too many half vectors referenced.
13209 return SDValue();
13210 }
13211 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
13213 // Only shuffle the halves of the inputs when useful.
13214 int NumLowerHalves =
13215 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
13216 int NumUpperHalves =
13217 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
13219 // uuuuXXXX - don't extract uppers just to insert again.
13220 if (UndefLower && NumUpperHalves != 0)
13221 return SDValue();
13223 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
13224 if (UndefUpper && NumUpperHalves == 2)
13225 return SDValue();
13227 // AVX2 - XXXXuuuu - always extract lowers.
13228 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
13229 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
13230 if (VT == MVT::v4f64 || VT == MVT::v4i64)
13231 return SDValue();
13232 // AVX2 supports variable 32-bit element cross-lane shuffles.
13233 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
13234 // XXXXuuuu - don't extract lowers and uppers.
13235 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
13236 return SDValue();
13237 }
13238 }
13240 // AVX512 - XXXXuuuu - always extract lowers.
13241 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
13242 return SDValue();
13244 auto GetHalfVector = [&](int HalfIdx) {
13245 if (HalfIdx < 0)
13246 return DAG.getUNDEF(HalfVT);
13247 SDValue V = (HalfIdx < 2 ? V1 : V2);
13248 HalfIdx = (HalfIdx % 2) * HalfNumElts;
13249 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
13250 DAG.getIntPtrConstant(HalfIdx, DL));
13253 SDValue Half1 = GetHalfVector(HalfIdx1);
13254 SDValue Half2 = GetHalfVector(HalfIdx2);
13255 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
13256 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
13257 DAG.getIntPtrConstant(Offset, DL));
13260 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
13261 /// given mask.
13263 /// This returns true if the elements from a particular input are already in the
13264 /// slot required by the given mask and require no permutation.
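/// For example, with a 4-element mask <0, 5, 2, 7> both inputs are already in
/// place, whereas <1, 5, 2, 7> is not in place for input 0 because element 1
/// of that input would have to move.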
13265 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
13266 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
13267 int Size = Mask.size();
13268 for (int i = 0; i < Size; ++i)
13269 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
13270 return false;
13272 return true;
13273 }
13275 /// Handle case where shuffle sources are coming from the same 128-bit lane and
13276 /// every lane can be represented as the same repeating mask - allowing us to
13277 /// shuffle the sources with the repeating shuffle and then permute the result
13278 /// to the destination lanes.
13279 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
13280 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
13281 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
13282 int NumElts = VT.getVectorNumElements();
13283 int NumLanes = VT.getSizeInBits() / 128;
13284 int NumLaneElts = NumElts / NumLanes;
13286 // On AVX2 we may be able to just shuffle the lowest elements and then
13287 // broadcast the result.
13288 if (Subtarget.hasAVX2()) {
13289 for (unsigned BroadcastSize : {16, 32, 64}) {
13290 if (BroadcastSize <= VT.getScalarSizeInBits())
13291 continue;
13292 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
13294 // Attempt to match a repeating pattern every NumBroadcastElts,
13295 // accounting for UNDEFs but only references the lowest 128-bit
13296 // lane of the inputs.
13297 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
13298 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13299 for (int j = 0; j != NumBroadcastElts; ++j) {
13300 int M = Mask[i + j];
13301 if (M < 0)
13302 continue;
13303 int &R = RepeatMask[j];
13304 if (0 != ((M % NumElts) / NumLaneElts))
13305 return false;
13306 if (0 <= R && R != M)
13307 return false;
13308 R = M;
13309 }
13311 return true;
13312 };
13313 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
13314 if (!FindRepeatingBroadcastMask(RepeatMask))
13315 continue;
13317 // Shuffle the (lowest) repeated elements in place for broadcast.
13318 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
13320 // Shuffle the actual broadcast.
13321 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
13322 for (int i = 0; i != NumElts; i += NumBroadcastElts)
13323 for (int j = 0; j != NumBroadcastElts; ++j)
13324 BroadcastMask[i + j] = j;
13325 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
13326 BroadcastMask);
13327 }
13328 }
13330 // Bail if the shuffle mask doesn't cross 128-bit lanes.
13331 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
13332 return SDValue();
13334 // Bail if we already have a repeated lane shuffle mask.
13335 SmallVector<int, 8> RepeatedShuffleMask;
13336 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
13337 return SDValue();
13339 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
13340 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
13341 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
13342 int NumSubLanes = NumLanes * SubLaneScale;
13343 int NumSubLaneElts = NumLaneElts / SubLaneScale;
13345 // Check that all the sources are coming from the same lane and see if we can
13346 // form a repeating shuffle mask (local to each sub-lane). At the same time,
13347 // determine the source sub-lane for each destination sub-lane.
13348 int TopSrcSubLane = -1;
13349 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
13350 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
13351 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
13352 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
13354 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
13355 // Extract the sub-lane mask, check that it all comes from the same lane
13356 // and normalize the mask entries to come from the first lane.
13358 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
13359 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13360 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
13363 int Lane = (M % NumElts) / NumLaneElts;
13364 if ((0 <= SrcLane) && (SrcLane != Lane))
13367 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
13368 SubLaneMask[Elt] = LocalM;
13371 // Whole sub-lane is UNDEF.
13375 // Attempt to match against the candidate repeated sub-lane masks.
13376 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
13377 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
13378 for (int i = 0; i != NumSubLaneElts; ++i) {
13379 if (M1[i] < 0 || M2[i] < 0)
13381 if (M1[i] != M2[i])
13387 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
13388 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
13391 // Merge the sub-lane mask into the matching repeated sub-lane mask.
13392 for (int i = 0; i != NumSubLaneElts; ++i) {
13393 int M = SubLaneMask[i];
13396 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
13397 "Unexpected mask element");
13398 RepeatedSubLaneMask[i] = M;
13401 // Track the top most source sub-lane - by setting the remaining to UNDEF
13402 // we can greatly simplify shuffle matching.
13403 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
13404 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
13405 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
13409 // Bail if we failed to find a matching repeated sub-lane mask.
13410 if (Dst2SrcSubLanes[DstSubLane] < 0)
13413 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
13414 "Unexpected source lane");
13416 // Create a repeating shuffle mask for the entire vector.
13417 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
13418 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
13419 int Lane = SubLane / SubLaneScale;
13420 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
13421 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
13422 int M = RepeatedSubLaneMask[Elt];
13425 int Idx = (SubLane * NumSubLaneElts) + Elt;
13426 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
13429 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
13431 // Shuffle each source sub-lane to its destination.
13432 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
13433 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
13434 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
13435 if (SrcSubLane < 0)
13437 for (int j = 0; j != NumSubLaneElts; ++j)
13438 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
13441 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
13445 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
13446 unsigned &ShuffleImm,
13447 ArrayRef<int> Mask) {
13448 int NumElts = VT.getVectorNumElements();
13449 assert(VT.getScalarSizeInBits() == 64 &&
13450 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
13451 "Unexpected data type for VSHUFPD");
13453 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
13454 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7.
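// For example, for v4f64 the mask <0, 5, 2, 7> fits the ShufpdMask pattern
// below and encodes as ShuffleImm = 0b1010.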
13455 ShuffleImm = 0;
13456 bool ShufpdMask = true;
13457 bool CommutableMask = true;
13458 for (int i = 0; i < NumElts; ++i) {
13459 if (Mask[i] == SM_SentinelUndef)
13460 continue;
13463 int Val = (i & 6) + NumElts * (i & 1);
13464 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
13465 if (Mask[i] < Val || Mask[i] > Val + 1)
13466 ShufpdMask = false;
13467 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
13468 CommutableMask = false;
13469 ShuffleImm |= (Mask[i] % 2) << i;
13470 }
13472 if (ShufpdMask)
13473 return true;
13474 if (CommutableMask) {
13475 std::swap(V1, V2);
13476 return true;
13477 }
13479 return false;
13480 }
13482 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
13483 ArrayRef<int> Mask, SDValue V1,
13484 SDValue V2, SelectionDAG &DAG) {
13485 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
13486 "Unexpected data type for VSHUFPD");
13488 unsigned Immediate = 0;
13489 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
13490 return SDValue();
13492 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
13493 DAG.getConstant(Immediate, DL, MVT::i8));
13496 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
13498 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13499 /// isn't available.
13500 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13501 const APInt &Zeroable,
13502 SDValue V1, SDValue V2,
13503 const X86Subtarget &Subtarget,
13504 SelectionDAG &DAG) {
13505 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13506 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13507 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13509 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13510 Zeroable, Subtarget, DAG))
13513 if (V2.isUndef()) {
13514 // Check for being able to broadcast a single element.
13515 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13516 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13519 // Use low duplicate instructions for masks that match their pattern.
13520 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13521 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13523 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13524 // Non-half-crossing single input shuffles can be lowered with an
13525 // interleaved permutation.
13526 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13527 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13528 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13529 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13532 // With AVX2 we have direct support for this permutation.
13533 if (Subtarget.hasAVX2())
13534 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13535 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13537 // Try to create an in-lane repeating shuffle mask and then shuffle the
13538 // results into the target lanes.
13539 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13540 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13543 // Otherwise, fall back.
13544 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13548 // Use dedicated unpack instructions for masks that match their pattern.
13550 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13553 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13554 Zeroable, Subtarget, DAG))
13557 // Check if the blend happens to exactly fit that of SHUFPD.
13559 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13562 // Try to create an in-lane repeating shuffle mask and then shuffle the
13563 // results into the target lanes.
13564 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13565 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13568 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13569 // shuffle. However, if we have AVX2 and either input is already in place,
13570 // we will be able to shuffle the other input even across lanes in a single
13571 // instruction, so skip this pattern.
13572 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13573 isShuffleMaskInputInPlace(1, Mask))))
13574 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13575 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13577 // If we have VLX support, we can use VEXPAND.
13578 if (Subtarget.hasVLX())
13579 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13580 V1, V2, DAG, Subtarget))
13583 // If we have AVX2 then we always want to lower with a blend because at v4 we
13584 // can fully permute the elements.
13585 if (Subtarget.hasAVX2())
13586 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13587 Mask, DAG);
13589 // Otherwise fall back on generic lowering.
13590 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13593 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13595 /// This routine is only called when we have AVX2 and thus a reasonable
13596 /// instruction set for v4i64 shuffling.
13597 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13598 const APInt &Zeroable,
13599 SDValue V1, SDValue V2,
13600 const X86Subtarget &Subtarget,
13601 SelectionDAG &DAG) {
13602 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13603 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13604 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13605 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13607 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13608 Zeroable, Subtarget, DAG))
13611 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13612 Zeroable, Subtarget, DAG))
13615 // Check for being able to broadcast a single element.
13616 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13617 Mask, Subtarget, DAG))
13620 if (V2.isUndef()) {
13621 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13622 // can use lower latency instructions that will operate on both lanes.
13623 SmallVector<int, 2> RepeatedMask;
13624 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
13625 SmallVector<int, 4> PSHUFDMask;
13626 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13627 return DAG.getBitcast(
13629 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13630 DAG.getBitcast(MVT::v8i32, V1),
13631 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13634 // AVX2 provides a direct instruction for permuting a single input across
13636 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13637 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13640 // Try to use shift instructions.
13641 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13642 Zeroable, Subtarget, DAG))
13645 // If we have VLX support, we can use VALIGN or VEXPAND.
13646 if (Subtarget.hasVLX()) {
13647 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13648 Mask, Subtarget, DAG))
13651 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13652 V1, V2, DAG, Subtarget))
13656 // Try to use PALIGNR.
13657 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13658 Mask, Subtarget, DAG))
13661 // Use dedicated unpack instructions for masks that match their pattern.
13663 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13666 // Try to create an in-lane repeating shuffle mask and then shuffle the
13667 // results into the target lanes.
13668 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13669 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13672 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13673 // shuffle. However, if we have AVX2 and either input is already in place,
13674 // we will be able to shuffle the other input even across lanes in a single
13675 // instruction, so skip this pattern.
13676 if (!isShuffleMaskInputInPlace(0, Mask) &&
13677 !isShuffleMaskInputInPlace(1, Mask))
13678 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13679 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13682 // Otherwise fall back on generic blend lowering.
13683 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13684 Mask, DAG);
13685 }
13687 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13689 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13690 /// isn't available.
13691 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13692 const APInt &Zeroable,
13693 SDValue V1, SDValue V2,
13694 const X86Subtarget &Subtarget,
13695 SelectionDAG &DAG) {
13696 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13697 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13698 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13700 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13701 Zeroable, Subtarget, DAG))
13704 // Check for being able to broadcast a single element.
13705 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13706 Mask, Subtarget, DAG))
13709 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13710 // options to efficiently lower the shuffle.
13711 SmallVector<int, 4> RepeatedMask;
13712 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13713 assert(RepeatedMask.size() == 4 &&
13714 "Repeated masks must be half the mask width!");
13716 // Use even/odd duplicate instructions for masks that match their pattern.
13717 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13718 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13719 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13720 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13722 if (V2.isUndef())
13723 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13724 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13726 // Use dedicated unpack instructions for masks that match their pattern.
13727 if (SDValue V =
13728 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13729 return V;
13731 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13732 // have already handled any direct blends.
13733 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13734 }
13736 // Try to create an in-lane repeating shuffle mask and then shuffle the
13737 // results into the target lanes.
13738 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13739 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13742 // If we have a single input shuffle with different shuffle patterns in the
13743 // two 128-bit lanes use the variable mask to VPERMILPS.
13744 if (V2.isUndef()) {
13745 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13746 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13747 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13749 if (Subtarget.hasAVX2())
13750 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13752 // Otherwise, fall back.
13753 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13754 DAG, Subtarget);
13755 }
13757 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13758 // shuffle.
13759 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13760 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13761 return Result;
13762 // If we have VLX support, we can use VEXPAND.
13763 if (Subtarget.hasVLX())
13764 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13765 V1, V2, DAG, Subtarget))
13766 return V;
13768 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13769 // since after split we get a more efficient code using vpunpcklwd and
13770 // vpunpckhwd instrs than vblend.
13771 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13772 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13773 Mask, DAG))
13774 return V;
13776 // If we have AVX2 then we always want to lower with a blend because at v8 we
13777 // can fully permute the elements.
13778 if (Subtarget.hasAVX2())
13779 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13780 Mask, DAG);
13782 // Otherwise fall back on generic lowering.
13783 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13786 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13788 /// This routine is only called when we have AVX2 and thus a reasonable
13789 /// instruction set for v8i32 shuffling.
13790 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13791 const APInt &Zeroable,
13792 SDValue V1, SDValue V2,
13793 const X86Subtarget &Subtarget,
13794 SelectionDAG &DAG) {
13795 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13796 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13797 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13798 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13800 // Whenever we can lower this as a zext, that instruction is strictly faster
13801 // than any alternative. It also allows us to fold memory operands into the
13802 // shuffle in many cases.
13803 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13804 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13807 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13808 // since after split we get a more efficient code than vblend by using
13809 // vpunpcklwd and vpunpckhwd instrs.
13810 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13811 !Subtarget.hasAVX512())
13813 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13816 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13817 Zeroable, Subtarget, DAG))
13820 // Check for being able to broadcast a single element.
13821 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13822 Mask, Subtarget, DAG))
13825 // If the shuffle mask is repeated in each 128-bit lane we can use more
13826 // efficient instructions that mirror the shuffles across the two 128-bit
13828 SmallVector<int, 4> RepeatedMask;
13829 bool Is128BitLaneRepeatedShuffle =
13830 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13831 if (Is128BitLaneRepeatedShuffle) {
13832 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13834 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13835 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13837 // Use dedicated unpack instructions for masks that match their pattern.
13839 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13843 // Try to use shift instructions.
13844 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13845 Zeroable, Subtarget, DAG))
13848 // If we have VLX support, we can use VALIGN or EXPAND.
13849 if (Subtarget.hasVLX()) {
13850 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13851 Mask, Subtarget, DAG))
13854 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13855 V1, V2, DAG, Subtarget))
13859 // Try to use byte rotation instructions.
13860 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13861 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13864 // Try to create an in-lane repeating shuffle mask and then shuffle the
13865 // results into the target lanes.
13866 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13867 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13870 // If the shuffle patterns aren't repeated but it is a single input, directly
13871 // generate a cross-lane VPERMD instruction.
13872 if (V2.isUndef()) {
13873 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13874 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13877 // Assume that a single SHUFPS is faster than an alternative sequence of
13878 // multiple instructions (even if the CPU has a domain penalty).
13879 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13880 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13881 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13882 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13883 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13884 CastV1, CastV2, DAG);
13885 return DAG.getBitcast(MVT::v8i32, ShufPS);
13888 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13890 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13891 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13894 // Otherwise fall back on generic blend lowering.
13895 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13899 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13901 /// This routine is only called when we have AVX2 and thus a reasonable
13902 /// instruction set for v16i16 shuffling.
13903 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13904 const APInt &Zeroable,
13905 SDValue V1, SDValue V2,
13906 const X86Subtarget &Subtarget,
13907 SelectionDAG &DAG) {
13908 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13909 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13910 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13911 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13913 // Whenever we can lower this as a zext, that instruction is strictly faster
13914 // than any alternative. It also allows us to fold memory operands into the
13915 // shuffle in many cases.
13916 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13917 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13920 // Check for being able to broadcast a single element.
13921 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13922 Mask, Subtarget, DAG))
13925 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13926 Zeroable, Subtarget, DAG))
13929 // Use dedicated unpack instructions for masks that match their pattern.
13931 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13934 // Use dedicated pack instructions for masks that match their pattern.
13935 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13939 // Try to use shift instructions.
13940 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13941 Zeroable, Subtarget, DAG))
13944 // Try to use byte rotation instructions.
13945 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13946 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13949 // Try to create an in-lane repeating shuffle mask and then shuffle the
13950 // results into the target lanes.
13951 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13952 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13955 if (V2.isUndef()) {
13956 // There are no generalized cross-lane shuffle operations available on i16 element types.
13958 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13959 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13960 Mask, DAG, Subtarget);
13962 SmallVector<int, 8> RepeatedMask;
13963 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13964 // As this is a single-input shuffle, the repeated mask should be
13965 // a strictly valid v8i16 mask that we can pass through to the v8i16
13966 // lowering to handle even the v16 case.
13967 return lowerV8I16GeneralSingleInputVectorShuffle(
13968 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13972 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13973 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13976 // AVX512BWVL can lower to VPERMW.
13977 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13978 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13980 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13982 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13983 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13986 // Otherwise fall back on generic lowering.
13987 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13990 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13992 /// This routine is only called when we have AVX2 and thus a reasonable
13993 /// instruction set for v32i8 shuffling.
13994 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13995 const APInt &Zeroable,
13996 SDValue V1, SDValue V2,
13997 const X86Subtarget &Subtarget,
13998 SelectionDAG &DAG) {
13999 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14000 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
14001 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14002 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
14004 // Whenever we can lower this as a zext, that instruction is strictly faster
14005 // than any alternative. It also allows us to fold memory operands into the
14006 // shuffle in many cases.
14007 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14008 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14011 // Check for being able to broadcast a single element.
14012 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
14013 Mask, Subtarget, DAG))
14016 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
14017 Zeroable, Subtarget, DAG))
14020 // Use dedicated unpack instructions for masks that match their pattern.
14022 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
14025 // Use dedicated pack instructions for masks that match their pattern.
14026 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
14030 // Try to use shift instructions.
14031 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
14032 Zeroable, Subtarget, DAG))
14035 // Try to use byte rotation instructions.
14036 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14037 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14040 // Try to create an in-lane repeating shuffle mask and then shuffle the
14041 // results into the target lanes.
14042 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14043 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14046 // There are no generalized cross-lane shuffle operations available on i8 element types.
14048 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
14049 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
14052 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14053 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14056 // AVX512VBMIVL can lower to VPERMB.
14057 if (Subtarget.hasVBMI() && Subtarget.hasVLX())
14058 return lowerVectorShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, DAG);
14060 // Try to simplify this by merging 128-bit lanes to enable a lane-based
14062 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
14063 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
14066 // Otherwise fall back on generic lowering.
14067 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
14070 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
14072 /// This routine either breaks down the specific type of a 256-bit x86 vector
14073 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
14074 /// together based on the available instructions.
14075 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14076 MVT VT, SDValue V1, SDValue V2,
14077 const APInt &Zeroable,
14078 const X86Subtarget &Subtarget,
14079 SelectionDAG &DAG) {
14080 // If we have a single input to the zero element, insert that into V1 if we
14081 // can do so cheaply.
14082 int NumElts = VT.getVectorNumElements();
14083 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14085 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14086 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14087 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14090 // Handle special cases where the lower or upper half is UNDEF.
14092 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14095 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
14096 // can check for those subtargets here and avoid much of the subtarget
14097 // querying in the per-vector-type lowering routines. With AVX1 we have
14098 // essentially *zero* ability to manipulate a 256-bit vector with integer
14099 // types. Since we'll use floating point types there eventually, just
14100 // immediately cast everything to a float and operate entirely in that domain.
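// (For example, an AVX1-only v8i32 shuffle is re-lowered as v8f32 and can then
// use the floating-point shuffles such as VSHUFPS/VBLENDPS.)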
14101 if (VT.isInteger() && !Subtarget.hasAVX2()) {
14102 int ElementBits = VT.getScalarSizeInBits();
14103 if (ElementBits < 32) {
14104 // No floating point type available, if we can't use the bit operations
14105 // for masking/blending then decompose into 128-bit vectors.
14107 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
14109 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
14111 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
14114 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
14115 VT.getVectorNumElements());
14116 V1 = DAG.getBitcast(FpVT, V1);
14117 V2 = DAG.getBitcast(FpVT, V2);
14118 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
14121 switch (VT.SimpleTy) {
14123 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14125 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14127 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14129 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14131 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14133 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14136 llvm_unreachable("Not a valid 256-bit x86 vector type!");
14140 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
14141 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
14142 ArrayRef<int> Mask,
14143 const APInt &Zeroable,
14144 SDValue V1, SDValue V2,
14145 const X86Subtarget &Subtarget,
14146 SelectionDAG &DAG) {
14147 assert(VT.getScalarSizeInBits() == 64 &&
14148 "Unexpected element type size for 128bit shuffle.");
14150 // Handling a 256-bit vector would require VLX, and in that case
14151 // lowerV2X128VectorShuffle() is most probably the better solution.
14152 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
14154 SmallVector<int, 4> WidenedMask;
14155 if (!canWidenShuffleElements(Mask, WidenedMask))
14158 // Try to use an insert into a zero vector.
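// Zeroable has one bit per 64-bit element here, so 0xf0 covers the upper 256
// bits and 0x0c covers the second 128-bit chunk.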
14159 if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
14160 (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
14161 unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
14162 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
14163 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
14164 DAG.getIntPtrConstant(0, DL));
14165 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14166 getZeroVector(VT, Subtarget, DAG, DL), LoV,
14167 DAG.getIntPtrConstant(0, DL));
14170 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
14172 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
14173 {0, 1, 2, 3, 0, 1, 2, 3});
14174 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
14175 {0, 1, 2, 3, 8, 9, 10, 11})) {
14176 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
14177 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
14178 OnlyUsesV1 ? V1 : V2,
14179 DAG.getIntPtrConstant(0, DL));
14180 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
14181 DAG.getIntPtrConstant(4, DL));
14184 assert(WidenedMask.size() == 4);
14186 // See if this is an insertion of the lower 128-bits of V2 into V1.
14187 bool IsInsert = true;
14189 for (int i = 0; i < 4; ++i) {
14190 assert(WidenedMask[i] >= -1);
14191 if (WidenedMask[i] < 0)
14194 // Make sure all V1 subvectors are in place.
14195 if (WidenedMask[i] < 4) {
14196 if (WidenedMask[i] != i) {
14201 // Make sure we only have a single V2 index and it's the lowest 128 bits.
14202 if (V2Index >= 0 || WidenedMask[i] != 4) {
14209 if (IsInsert && V2Index >= 0) {
14210 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
14211 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
14212 DAG.getIntPtrConstant(0, DL));
14213 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
14216 // Try to lower to vshuf64x2/vshuf32x4.
14217 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
14218 unsigned PermMask = 0;
14219 // Ensure the elements came from the same Op.
14220 for (int i = 0; i < 4; ++i) {
14221 assert(WidenedMask[i] >= -1);
14222 if (WidenedMask[i] < 0)
14225 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
14226 unsigned OpIndex = i / 2;
14227 if (Ops[OpIndex].isUndef())
14229 else if (Ops[OpIndex] != Op)
14232 // Convert the 128-bit shuffle mask selection values into 128-bit selection
14233 // bits defined by a vshuf64x2 instruction's immediate control byte.
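// E.g. the widened mask <0, 1, 4, 5> takes the low 256 bits of V1 followed by
// the low 256 bits of V2 and encodes as 0b01000100 (0x44).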
14234 PermMask |= (WidenedMask[i] % 4) << (i * 2);
14237 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
14238 DAG.getConstant(PermMask, DL, MVT::i8));
14241 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
14242 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14243 const APInt &Zeroable,
14244 SDValue V1, SDValue V2,
14245 const X86Subtarget &Subtarget,
14246 SelectionDAG &DAG) {
14247 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14248 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
14249 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14251 if (V2.isUndef()) {
14252 // Use low duplicate instructions for masks that match their pattern.
14253 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
14254 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
14256 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
14257 // Non-half-crossing single input shuffles can be lowered with an
14258 // interleaved permutation.
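// Bit i of the immediate picks the high (odd) element of its 128-bit lane for
// destination element i; e.g. the in-lane swap <1, 0, 3, 2, 5, 4, 7, 6>
// encodes as 0b01010101 (0x55).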
14259 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
14260 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
14261 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
14262 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
14263 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
14264 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
14267 SmallVector<int, 4> RepeatedMask;
14268 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
14269 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
14270 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14273 if (SDValue Shuf128 =
14274 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, Zeroable, V1, V2,
14278 if (SDValue Unpck =
14279 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
14282 // Check if the blend happens to exactly fit that of SHUFPD.
14284 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
14287 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
14288 V2, DAG, Subtarget))
14291 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
14292 Zeroable, Subtarget, DAG))
14295 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
14298 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
14299 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14300 const APInt &Zeroable,
14301 SDValue V1, SDValue V2,
14302 const X86Subtarget &Subtarget,
14303 SelectionDAG &DAG) {
14304 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14305 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
14306 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14308 // If the shuffle mask is repeated in each 128-bit lane, we have many more
14309 // options to efficiently lower the shuffle.
14310 SmallVector<int, 4> RepeatedMask;
14311 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
14312 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14314 // Use even/odd duplicate instructions for masks that match their pattern.
14315 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
14316 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
14317 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
14318 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
14321 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
14322 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14324 // Use dedicated unpack instructions for masks that match their pattern.
14325 if (SDValue Unpck =
14326 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
14329 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
14330 Zeroable, Subtarget, DAG))
14333 // Otherwise, fall back to a SHUFPS sequence.
14334 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
14337 // If we have a single input shuffle with different shuffle patterns in the
14338 // 128-bit lanes but no lane crossings, use a variable mask VPERMILPS.
14339 if (V2.isUndef() &&
14340 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
14341 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
14342 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
14345 // If we have AVX512F support, we can use VEXPAND.
14346 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
14347 V1, V2, DAG, Subtarget))
14350 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
14353 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
14354 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14355 const APInt &Zeroable,
14356 SDValue V1, SDValue V2,
14357 const X86Subtarget &Subtarget,
14358 SelectionDAG &DAG) {
14359 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14360 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
14361 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
14363 if (V2.isUndef()) {
14364 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
14365 // can use lower latency instructions that will operate on all four 128-bit lanes.
14367 SmallVector<int, 2> Repeated128Mask;
14368 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
14369 SmallVector<int, 4> PSHUFDMask;
14370 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
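// E.g. the repeated 128-bit-lane mask <1, 0> scales to the v16i32 PSHUFD
// mask <2, 3, 0, 1>.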
14371 return DAG.getBitcast(
14373 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
14374 DAG.getBitcast(MVT::v16i32, V1),
14375 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
14378 SmallVector<int, 4> Repeated256Mask;
14379 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
14380 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
14381 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
14384 if (SDValue Shuf128 =
14385 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, Zeroable,
14386 V1, V2, Subtarget, DAG))
14389 // Try to use shift instructions.
14390 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
14391 Zeroable, Subtarget, DAG))
14394 // Try to use VALIGN.
14395 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
14396 Mask, Subtarget, DAG))
14399 // Try to use PALIGNR.
14400 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
14401 Mask, Subtarget, DAG))
14404 if (SDValue Unpck =
14405 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
14407 // If we have AVX512F support, we can use VEXPAND.
14408 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
14409 V2, DAG, Subtarget))
14412 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
14413 Zeroable, Subtarget, DAG))
14416 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
14419 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
14420 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14421 const APInt &Zeroable,
14422 SDValue V1, SDValue V2,
14423 const X86Subtarget &Subtarget,
14424 SelectionDAG &DAG) {
14425 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14426 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
14427 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
14429 // Whenever we can lower this as a zext, that instruction is strictly faster
14430 // than any alternative. It also allows us to fold memory operands into the
14431 // shuffle in many cases.
14432 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14433 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
14436 // If the shuffle mask is repeated in each 128-bit lane we can use more
14437 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
14439 SmallVector<int, 4> RepeatedMask;
14440 bool Is128BitLaneRepeatedShuffle =
14441 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
14442 if (Is128BitLaneRepeatedShuffle) {
14443 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
14445 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
14446 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
14448 // Use dedicated unpack instructions for masks that match their pattern.
14450 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
14454 // Try to use shift instructions.
14455 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
14456 Zeroable, Subtarget, DAG))
14459 // Try to use VALIGN.
14460 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
14461 Mask, Subtarget, DAG))
14464 // Try to use byte rotation instructions.
14465 if (Subtarget.hasBWI())
14466 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14467 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
14470 // Assume that a single SHUFPS is faster than using a permv shuffle.
14471 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
14472 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
14473 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
14474 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
14475 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
14476 CastV1, CastV2, DAG);
14477 return DAG.getBitcast(MVT::v16i32, ShufPS);
14479 // If we have AVX512F support, we can use VEXPAND.
14480 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
14481 V1, V2, DAG, Subtarget))
14484 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
14485 Zeroable, Subtarget, DAG))
14487 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
14490 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
14491 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14492 const APInt &Zeroable,
14493 SDValue V1, SDValue V2,
14494 const X86Subtarget &Subtarget,
14495 SelectionDAG &DAG) {
14496 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14497 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
14498 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
14499 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
14501 // Whenever we can lower this as a zext, that instruction is strictly faster
14502 // than any alternative. It also allows us to fold memory operands into the
14503 // shuffle in many cases.
14504 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14505 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
14508 // Use dedicated unpack instructions for masks that match their pattern.
14510 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
14513 // Try to use shift instructions.
14514 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
14515 Zeroable, Subtarget, DAG))
14518 // Try to use byte rotation instructions.
14519 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14520 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
14523 if (V2.isUndef()) {
14524 SmallVector<int, 8> RepeatedMask;
14525 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14526 // As this is a single-input shuffle, the repeated mask should be
14527 // a strictly valid v8i16 mask that we can pass through to the v8i16
14528 // lowering to handle even the v32 case.
14529 return lowerV8I16GeneralSingleInputVectorShuffle(
14530 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14534 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14535 Zeroable, Subtarget, DAG))
14538 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14539 DL, MVT::v32i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
14542 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14545 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14546 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14547 const APInt &Zeroable,
14548 SDValue V1, SDValue V2,
14549 const X86Subtarget &Subtarget,
14550 SelectionDAG &DAG) {
14551 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14552 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14553 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14554 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14556 // Whenever we can lower this as a zext, that instruction is strictly faster
14557 // than any alternative. It also allows us to fold memory operands into the
14558 // shuffle in many cases.
14559 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14560 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14563 // Use dedicated unpack instructions for masks that match their pattern.
14565 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14568 // Try to use shift instructions.
14569 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14570 Zeroable, Subtarget, DAG))
14573 // Try to use byte rotation instructions.
14574 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14575 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14578 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14579 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14582 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14583 if (Subtarget.hasVBMI())
14584 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14586 // Try to create an in-lane repeating shuffle mask and then shuffle the
14587 // results into the target lanes.
14588 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14589 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14592 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14593 Zeroable, Subtarget, DAG))
14596 // FIXME: Implement direct support for this type!
14597 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14600 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14602 /// This routine either breaks down the specific type of a 512-bit x86 vector
14603 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14604 /// together based on the available instructions.
14605 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14606 MVT VT, SDValue V1, SDValue V2,
14607 const APInt &Zeroable,
14608 const X86Subtarget &Subtarget,
14609 SelectionDAG &DAG) {
14610 assert(Subtarget.hasAVX512() &&
14611 "Cannot lower 512-bit vectors w/ basic ISA!");
14613 // If we have a single input to the zero element, insert that into V1 if we
14614 // can do so cheaply.
14615 int NumElts = Mask.size();
14616 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14618 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14619 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14620 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14623 // Handle special cases where the lower or upper half is UNDEF.
14625 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14628 // Check for being able to broadcast a single element.
14629 if (SDValue Broadcast =
14630 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14633 // Dispatch to each element type for lowering. If we don't have support for
14634 // specific element type shuffles at 512 bits, immediately split them and
14635 // lower them. Each lowering routine of a given type is allowed to assume that
14636 // the requisite ISA extensions for that element type are available.
14637 switch (VT.SimpleTy) {
14639 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14641 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14643 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14645 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14647 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14649 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14652 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14656 // Lower vXi1 vector shuffles.
14657 // There is no dedicated instruction on AVX-512 that shuffles the masks.
14658 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
14659 // vector, shuffle, and then truncate it back.
14660 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14661 MVT VT, SDValue V1, SDValue V2,
14662 const APInt &Zeroable,
14663 const X86Subtarget &Subtarget,
14664 SelectionDAG &DAG) {
14665 unsigned NumElts = Mask.size();
14667 // Try to recognize shuffles that are just padding a subvector with zeros.
14668 unsigned SubvecElts = 0;
14669 for (int i = 0; i != (int)NumElts; ++i) {
14670 if (Mask[i] >= 0 && Mask[i] != i)
14675 assert(SubvecElts != NumElts && "Identity shuffle?");
14677 // Clip to a power of 2.
14678 SubvecElts = PowerOf2Floor(SubvecElts);
14680 // Make sure the number of zeroable bits in the top at least covers the bits
14681 // not covered by the subvector.
14682 if (Zeroable.countLeadingOnes() >= (NumElts - SubvecElts)) {
14683 MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts);
14684 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT,
14685 V1, DAG.getIntPtrConstant(0, DL));
14686 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
14687 getZeroVector(VT, Subtarget, DAG, DL),
14688 Extract, DAG.getIntPtrConstant(0, DL));
14692 assert(Subtarget.hasAVX512() &&
14693 "Cannot lower 512-bit vectors w/o basic ISA!");
14695 switch (VT.SimpleTy) {
14697 llvm_unreachable("Expected a vector of i1 elements");
14699 ExtVT = MVT::v2i64;
14702 ExtVT = MVT::v4i32;
14705 // Take a 512-bit type (more shuffles on KNL). If we have VLX, use a 256-bit operation instead.
14707 ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64;
14710 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14711 // 256-bit operation available.
14712 ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16;
14715 // Take 512-bit type, unless we are avoiding 512-bit types and have the
14716 // 256-bit operation available.
14717 assert(Subtarget.hasBWI() && "Expected AVX512BW support");
14718 ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
14721 ExtVT = MVT::v64i8;
14725 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14726 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14728 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14729 // Since i1 was sign extended, we can use X86ISD::CVT2MASK.
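// (The compare below against zero with predicate 6, signed greater-than,
// selects exactly the lanes whose sign bit is set.)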
14730 int NumElems = VT.getVectorNumElements();
14731 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14732 (Subtarget.hasDQI() && (NumElems < 32)))
14733 return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, ExtVT),
14734 Shuffle, DAG.getConstant(6, DL, MVT::i8));
14736 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14739 /// Helper function that returns true if the shuffle mask should be
14740 /// commuted to improve canonicalization.
14741 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14742 int NumElements = Mask.size();
14744 int NumV1Elements = 0, NumV2Elements = 0;
14748 else if (M < NumElements)
14753 // Commute the shuffle as needed such that more elements come from V1 than
14754 // V2. This allows us to match the shuffle pattern strictly on how many
14755 // elements come from V1 without handling the symmetric cases.
14756 if (NumV2Elements > NumV1Elements)
14759 assert(NumV1Elements > 0 && "No V1 indices");
14761 if (NumV2Elements == 0)
14764 // When the number of V1 and V2 elements are the same, try to minimize the
14765 // number of uses of V2 in the low half of the vector. When that is tied,
14766 // ensure that the sum of indices for V1 is equal to or lower than the sum
14767 // of indices for V2. When those are equal, try to ensure that the number of odd
14768 // indices for V1 is lower than the number of odd indices for V2.
14769 if (NumV1Elements == NumV2Elements) {
14770 int LowV1Elements = 0, LowV2Elements = 0;
14771 for (int M : Mask.slice(0, NumElements / 2))
14772 if (M >= NumElements)
14776 if (LowV2Elements > LowV1Elements)
14778 if (LowV2Elements == LowV1Elements) {
14779 int SumV1Indices = 0, SumV2Indices = 0;
14780 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14781 if (Mask[i] >= NumElements)
14783 else if (Mask[i] >= 0)
14785 if (SumV2Indices < SumV1Indices)
14787 if (SumV2Indices == SumV1Indices) {
14788 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14789 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14790 if (Mask[i] >= NumElements)
14791 NumV2OddIndices += i % 2;
14792 else if (Mask[i] >= 0)
14793 NumV1OddIndices += i % 2;
14794 if (NumV2OddIndices < NumV1OddIndices)
14803 /// \brief Top-level lowering for x86 vector shuffles.
14805 /// This handles decomposition, canonicalization, and lowering of all x86
14806 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14807 /// above in helper routines. The canonicalization attempts to widen shuffles
14808 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14809 /// s.t. only one of the two inputs needs to be tested, etc.
14810 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14811 SelectionDAG &DAG) {
14812 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14813 ArrayRef<int> Mask = SVOp->getMask();
14814 SDValue V1 = Op.getOperand(0);
14815 SDValue V2 = Op.getOperand(1);
14816 MVT VT = Op.getSimpleValueType();
14817 int NumElements = VT.getVectorNumElements();
14819 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14821 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14822 "Can't lower MMX shuffles");
14824 bool V1IsUndef = V1.isUndef();
14825 bool V2IsUndef = V2.isUndef();
14826 if (V1IsUndef && V2IsUndef)
14827 return DAG.getUNDEF(VT);
14829 // When we create a shuffle node we put the UNDEF node as the second operand,
14830 // but in some cases the first operand may be transformed to UNDEF.
14831 // In this case we should just commute the node.
14833 return DAG.getCommutedVectorShuffle(*SVOp);
14835 // Check for non-undef masks pointing at an undef vector and make the masks
14836 // undef as well. This makes it easier to match the shuffle based solely on the mask.
14840 if (M >= NumElements) {
14841 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14842 for (int &M : NewMask)
14843 if (M >= NumElements)
14845 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14848 // Check for illegal shuffle mask element index values.
14849 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14850 assert(llvm::all_of(Mask,
14851 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14852 "Out of bounds shuffle index");
14854 // We actually see shuffles that are entirely re-arrangements of a set of
14855 // zero inputs. This mostly happens while decomposing complex shuffles into
14856 // simple ones. Directly lower these as a buildvector of zeros.
14857 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14858 if (Zeroable.isAllOnesValue())
14859 return getZeroVector(VT, Subtarget, DAG, DL);
14861 // Try to collapse shuffles into using a vector type with fewer elements but
14862 // wider element types. We cap this to not form integers or floating point
14863 // elements wider than 64 bits, but it might be interesting to form i128
14864 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
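// E.g. the v4i32 mask <0, 1, 6, 7> widens to the v2i64 mask <0, 3>.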
14865 SmallVector<int, 16> WidenedMask;
14866 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14867 canWidenShuffleElements(Mask, WidenedMask)) {
14868 MVT NewEltVT = VT.isFloatingPoint()
14869 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14870 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14871 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14872 // Make sure that the new vector type is legal. For example, v2f64 isn't legal without SSE2.
14874 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14875 V1 = DAG.getBitcast(NewVT, V1);
14876 V2 = DAG.getBitcast(NewVT, V2);
14877 return DAG.getBitcast(
14878 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14882 // Commute the shuffle if it will improve canonicalization.
14883 if (canonicalizeShuffleMaskWithCommute(Mask))
14884 return DAG.getCommutedVectorShuffle(*SVOp);
14886 // For each vector width, delegate to a specialized lowering routine.
14887 if (VT.is128BitVector())
14888 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14891 if (VT.is256BitVector())
14892 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14895 if (VT.is512BitVector())
14896 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14900 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14903 llvm_unreachable("Unimplemented!");
14906 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14907 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14908 const X86Subtarget &Subtarget,
14909 SelectionDAG &DAG) {
14910 SDValue Cond = Op.getOperand(0);
14911 SDValue LHS = Op.getOperand(1);
14912 SDValue RHS = Op.getOperand(2);
14914 MVT VT = Op.getSimpleValueType();
14916 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14918 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14920 // Only non-legal VSELECTs reach this lowering, convert those into generic
14921 // shuffles and re-use the shuffle lowering path for blends.
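// E.g. a v4i32 VSELECT whose constant condition is <-1, 0, -1, 0> becomes the
// shuffle mask <0, 5, 2, 7>: true lanes read from LHS, false/undef lanes from
// RHS.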
14922 SmallVector<int, 32> Mask;
14923 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14924 SDValue CondElt = CondBV->getOperand(i);
14926 // We can't map undef to undef here. They have different meanings. Treat
14927 // it the same as zero.
14928 if (CondElt.isUndef() || isNullConstant(CondElt))
14932 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14935 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14936 // A vselect where all conditions and data are constants can be optimized into
14937 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14938 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14939 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14940 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14943 // Try to lower this to a blend-style vector shuffle. This can handle all
14944 // constant condition cases.
14945 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14948 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14949 // with patterns on the mask registers on AVX-512.
14950 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14953 // Variable blends are only legal from SSE4.1 onward.
14954 if (!Subtarget.hasSSE41())
14958 MVT VT = Op.getSimpleValueType();
14960 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14961 // into an i1 condition so that we can use the mask-based 512-bit blend instructions.
14963 if (VT.getSizeInBits() == 512) {
14964 SDValue Cond = Op.getOperand(0);
14965 // The vNi1 condition case should be handled above as it can be trivially lowered.
14967 assert(Cond.getValueType().getScalarSizeInBits() ==
14968 VT.getScalarSizeInBits() &&
14969 "Should have a size-matched integer condition!");
14970 // Build a mask by testing the condition against zero.
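// (CMPM predicate 4 is 'not equal', so the mask is Cond != 0.)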
14971 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14972 SDValue Mask = DAG.getNode(X86ISD::CMPM, dl, MaskVT, Cond,
14973 getZeroVector(VT, Subtarget, DAG, dl),
14974 DAG.getConstant(4, dl, MVT::i8));
14975 // Now return a new VSELECT using the mask.
14976 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14979 // Only some types will be legal on some subtargets. If we can emit a legal
14980 // VSELECT-matching blend, return Op, but if we need to expand, return
14982 switch (VT.SimpleTy) {
14984 // Most of the vector types have blends past SSE4.1.
14988 // The byte blends for AVX vectors were introduced only in AVX2.
14989 if (Subtarget.hasAVX2())
14996 // FIXME: We should custom lower this by fixing the condition and using i8
15002 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
15003 MVT VT = Op.getSimpleValueType();
15006 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
15009 if (VT.getSizeInBits() == 8) {
15010 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
15011 Op.getOperand(0), Op.getOperand(1));
15012 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15015 if (VT == MVT::f32) {
15016 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
15017 // the result back to an FR32 register. It's only worth matching if the
15018 // result has a single use which is a store or a bitcast to i32. And in
15019 // the case of a store, it's not worth it if the index is a constant 0,
15020 // because a MOVSSmr can be used instead, which is smaller and faster.
15021 if (!Op.hasOneUse())
15023 SDNode *User = *Op.getNode()->use_begin();
15024 if ((User->getOpcode() != ISD::STORE ||
15025 isNullConstant(Op.getOperand(1))) &&
15026 (User->getOpcode() != ISD::BITCAST ||
15027 User->getValueType(0) != MVT::i32))
15029 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15030 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
15032 return DAG.getBitcast(MVT::f32, Extract);
15035 if (VT == MVT::i32 || VT == MVT::i64) {
15036 // ExtractPS/pextrq work with a constant index.
15037 if (isa<ConstantSDNode>(Op.getOperand(1)))
15044 /// Extract one bit from mask vector, like v16i1 or v8i1.
15045 /// AVX-512 feature.
15046 static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
15047 const X86Subtarget &Subtarget) {
15048 SDValue Vec = Op.getOperand(0);
15050 MVT VecVT = Vec.getSimpleValueType();
15051 SDValue Idx = Op.getOperand(1);
15052 MVT EltVT = Op.getSimpleValueType();
15054 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
15055 "Unexpected vector type in ExtractBitFromMaskVector");
15057 // A variable index can't be handled in mask registers;
15058 // extend the vector to VR512/VR128.
15059 if (!isa<ConstantSDNode>(Idx)) {
15060 unsigned NumElts = VecVT.getVectorNumElements();
15061 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
15062 // than extending to 128/256-bit.
15063 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15064 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15065 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec);
15066 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx);
15067 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
15070 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15072 // If the kshift instructions of the correct width aren't natively supported
15073 // then we need to promote the vector to the native size to get the correct
15074 // zeroing behavior.
15075 if (VecVT.getVectorNumElements() < 16) {
15076 VecVT = MVT::v16i1;
15077 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
15078 DAG.getUNDEF(VecVT), Vec,
15079 DAG.getIntPtrConstant(0, dl));
15082 // Extracts from element 0 are always allowed.
15084 // Use kshiftr instruction to move to the lower element.
15085 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
15086 DAG.getConstant(IdxVal, dl, MVT::i8));
15089 // Shrink to v16i1 since that's always legal.
15090 if (VecVT.getVectorNumElements() > 16) {
15091 VecVT = MVT::v16i1;
15092 Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Vec,
15093 DAG.getIntPtrConstant(0, dl));
15096 // Convert to a bitcast+aext/trunc.
15097 MVT CastVT = MVT::getIntegerVT(VecVT.getVectorNumElements());
15098 return DAG.getAnyExtOrTrunc(DAG.getBitcast(CastVT, Vec), dl, EltVT);
15102 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
15103 SelectionDAG &DAG) const {
15105 SDValue Vec = Op.getOperand(0);
15106 MVT VecVT = Vec.getSimpleValueType();
15107 SDValue Idx = Op.getOperand(1);
15109 if (VecVT.getVectorElementType() == MVT::i1)
15110 return ExtractBitFromMaskVector(Op, DAG, Subtarget);
15112 if (!isa<ConstantSDNode>(Idx)) {
15113 // It's more profitable to go through memory (1 cycle throughput)
15114 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
15115 // The IACA tool was used to get the performance estimate
15116 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
15118 // example : extractelement <16 x i8> %a, i32 %i
15120 // Block Throughput: 3.00 Cycles
15121 // Throughput Bottleneck: Port5
15123 // | Num Of | Ports pressure in cycles | |
15124 // | Uops | 0 - DV | 5 | 6 | 7 | |
15125 // ---------------------------------------------
15126 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
15127 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
15128 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
15129 // Total Num Of Uops: 4
15132 // Block Throughput: 1.00 Cycles
15133 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
15135 // | | Ports pressure in cycles | |
15136 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
15137 // ---------------------------------------------------------
15138 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
15139 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
15140 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
15141 // Total Num Of Uops: 4
15146 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15148 // If this is a 256-bit vector result, first extract the 128-bit vector and
15149 // then extract the element from the 128-bit vector.
15150 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
15151 // Get the 128-bit vector.
15152 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
15153 MVT EltVT = VecVT.getVectorElementType();
15155 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
15156 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
15158 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
15159 // this can be done with a mask.
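// E.g. extracting element 6 of a v8f32 becomes extracting element 2 of the
// upper 128-bit half.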
15160 IdxVal &= ElemsPerChunk - 1;
15161 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
15162 DAG.getConstant(IdxVal, dl, MVT::i32));
15165 assert(VecVT.is128BitVector() && "Unexpected vector length");
15167 MVT VT = Op.getSimpleValueType();
15169 if (VT.getSizeInBits() == 16) {
15170 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
15171 // we're going to zero extend the register or fold the store (SSE41 only).
15172 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
15173 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
15174 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
15175 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15176 DAG.getBitcast(MVT::v4i32, Vec), Idx));
15178 // Transform it so it matches pextrw which produces a 32-bit result.
15179 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
15180 Op.getOperand(0), Op.getOperand(1));
15181 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
15184 if (Subtarget.hasSSE41())
15185 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
15188 // TODO: We only extract a single element from v16i8, we can probably afford
15189 // to be more aggressive here before using the default approach of spilling to the stack.
15191 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
15192 // Extract either the lowest i32 or any i16, and extract the sub-byte.
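// E.g. byte index 5 falls in word 2 and needs a right shift by 8 bits before
// truncating back to i8.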
15193 int DWordIdx = IdxVal / 4;
15194 if (DWordIdx == 0) {
15195 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
15196 DAG.getBitcast(MVT::v4i32, Vec),
15197 DAG.getIntPtrConstant(DWordIdx, dl));
15198 int ShiftVal = (IdxVal % 4) * 8;
15200 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
15201 DAG.getConstant(ShiftVal, dl, MVT::i32));
15202 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15205 int WordIdx = IdxVal / 2;
15206 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
15207 DAG.getBitcast(MVT::v8i16, Vec),
15208 DAG.getIntPtrConstant(WordIdx, dl));
15209 int ShiftVal = (IdxVal % 2) * 8;
15211 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
15212 DAG.getConstant(ShiftVal, dl, MVT::i16));
15213 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
15216 if (VT.getSizeInBits() == 32) {
15220 // SHUFPS the element to the lowest double word, then movss.
15221 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
15222 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15223 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15224 DAG.getIntPtrConstant(0, dl));
15227 if (VT.getSizeInBits() == 64) {
15228 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
15229 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
15230 // to match extract_elt for f64.
15234 // UNPCKHPD the element to the lowest double word, then movsd.
15235 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
15236 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
15237 int Mask[2] = { 1, -1 };
15238 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
15239 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
15240 DAG.getIntPtrConstant(0, dl));
15246 /// Insert one bit to mask vector, like v16i1 or v8i1.
15247 /// AVX-512 feature.
15248 static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
15249 const X86Subtarget &Subtarget) {
15251 SDValue Vec = Op.getOperand(0);
15252 SDValue Elt = Op.getOperand(1);
15253 SDValue Idx = Op.getOperand(2);
15254 MVT VecVT = Vec.getSimpleValueType();
15256 if (!isa<ConstantSDNode>(Idx)) {
15257 // Non-constant index. Extend source and destination,
15258 // insert element and then truncate the result.
15259 unsigned NumElts = VecVT.getVectorNumElements();
15260 MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8;
15261 MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts);
15262 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
15263 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec),
15264 DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx);
15265 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
15268 // Copy into a k-register, extract to v1i1 and insert_subvector.
15269 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
15271 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
15275 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
15276 SelectionDAG &DAG) const {
15277 MVT VT = Op.getSimpleValueType();
15278 MVT EltVT = VT.getVectorElementType();
15279 unsigned NumElts = VT.getVectorNumElements();
15281 if (EltVT == MVT::i1)
15282 return InsertBitToMaskVector(Op, DAG, Subtarget);
15285 SDValue N0 = Op.getOperand(0);
15286 SDValue N1 = Op.getOperand(1);
15287 SDValue N2 = Op.getOperand(2);
15288 if (!isa<ConstantSDNode>(N2))
15290 auto *N2C = cast<ConstantSDNode>(N2);
15291 unsigned IdxVal = N2C->getZExtValue();
15293 bool IsZeroElt = X86::isZeroNode(N1);
15294 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
15296 // If we are inserting an element, see if we can do this more efficiently with
15297 // a blend shuffle with a rematerializable vector than with a costly integer insertion.
15299 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
15300 16 <= EltVT.getSizeInBits()) {
15301 SmallVector<int, 8> BlendMask;
15302 for (unsigned i = 0; i != NumElts; ++i)
15303 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
15304 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
15305 : getOnesVector(VT, DAG, dl);
15306 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
15309 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
15310 // into that, and then insert the subvector back into the result.
15311 if (VT.is256BitVector() || VT.is512BitVector()) {
15312 // With a 256-bit vector, we can insert into the zero element efficiently
15313 // using a blend if we have AVX or AVX2 and the right data type.
15314 if (VT.is256BitVector() && IdxVal == 0) {
15315 // TODO: It is worthwhile to cast integer to floating point and back
15316 // and incur a domain crossing penalty if that's what we'll end up
15317 // doing anyway after extracting to a 128-bit vector.
15318 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
15319 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
15320 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
15321 N2 = DAG.getIntPtrConstant(1, dl);
15322 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
15326 // Get the desired 128-bit vector chunk.
15327 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
15329 // Insert the element into the desired chunk.
15330 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
15331 assert(isPowerOf2_32(NumEltsIn128));
15332 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
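// E.g. inserting element 9 of a v16i16 targets element 1 of the second
// 128-bit chunk.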
15333 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
15335 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
15336 DAG.getConstant(IdxIn128, dl, MVT::i32));
15338 // Insert the changed part back into the bigger vector
15339 return insert128BitVector(N0, V, IdxVal, DAG, dl);
15341 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
15343 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
15344 // argument. SSE41 required for pinsrb.
15345 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
15346 unsigned Opc;
15347 if (VT == MVT::v8i16) {
15348 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
15349 Opc = X86ISD::PINSRW;
15350 } else {
15351 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
15352 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
15353 Opc = X86ISD::PINSRB;
15354 }
15356 if (N1.getValueType() != MVT::i32)
15357 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
15358 if (N2.getValueType() != MVT::i32)
15359 N2 = DAG.getIntPtrConstant(IdxVal, dl);
15360 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
15363 if (Subtarget.hasSSE41()) {
15364 if (EltVT == MVT::f32) {
15365 // Bits [7:6] of the constant are the source select. This will always be
15366 // zero here. The DAG Combiner may combine an extract_elt index into
15367 // these bits. For example (insert (extract, 3), 2) could be matched by
15368 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
15369 // Bits [5:4] of the constant are the destination select. This is the
15370 // value of the incoming immediate.
15371 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
15372 // combine either bitwise AND or insert of float 0.0 to set these bits.
15374 bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
15375 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
15376 // If this is an insertion of 32-bits into the low 32-bits of
15377 // a vector, we prefer to generate a blend with immediate rather
15378 // than an insertps. Blends are simpler operations in hardware and so
15379 // will always have equal or better performance than insertps.
15380 // But if optimizing for size and there's a load folding opportunity,
15381 // generate insertps because blendps does not have a 32-bit memory
15382 // operand form.
15383 N2 = DAG.getIntPtrConstant(1, dl);
15384 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15385 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
15387 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
15388 // Create this as a scalar to vector..
15389 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
15390 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
15393 // PINSR* works with constant index.
15394 if (EltVT == MVT::i32 || EltVT == MVT::i64)
15395 return Op;
15397 return SDValue();
15398 }
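// Illustrative sketch only (not authoritative codegen): for an IR insert into
// lane 0 of a v4f32, e.g.
//   %r = insertelement <4 x float> %v, float %s, i32 0
// the SSE4.1 path above prefers a blend,
//   blendps $0x1, %xmm1, %xmm0
// and only forms insertps when optimizing for size with a foldable load,
//   insertps $0x00, (%mem), %xmm0      ; (%mem) is a placeholder operand
// The final instruction still depends on later DAG combines and isel.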
15401 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
15402 SelectionDAG &DAG) {
15403 SDLoc dl(Op);
15404 MVT OpVT = Op.getSimpleValueType();
15406 // It's always cheaper to replace a xor+movd with xorps, and it simplifies
15407 // further transformations.
15408 if (X86::isZeroNode(Op.getOperand(0)))
15409 return getZeroVector(OpVT, Subtarget, DAG, dl);
15411 // If this is a 256-bit vector result, first insert into a 128-bit
15412 // vector and then insert into the 256-bit vector.
15413 if (!OpVT.is128BitVector()) {
15414 // Insert into a 128-bit vector.
15415 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
15416 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
15417 OpVT.getVectorNumElements() / SizeFactor);
15419 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
15421 // Insert the 128-bit vector.
15422 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
15424 assert(OpVT.is128BitVector() && "Expected an SSE type!");
15426 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
15427 if (OpVT == MVT::v4i32)
15428 return Op;
15430 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
15431 return DAG.getBitcast(
15432 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
15435 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
15436 // simple superregister reference or explicit instructions to insert
15437 // the upper bits of a vector.
15438 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15439 SelectionDAG &DAG) {
15440 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
15442 return insert1BitVector(Op, DAG, Subtarget);
15445 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
15446 SelectionDAG &DAG) {
15447 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
15448 "Only vXi1 extract_subvectors need custom lowering");
15451 SDValue Vec = Op.getOperand(0);
15452 SDValue Idx = Op.getOperand(1);
15454 if (!isa<ConstantSDNode>(Idx))
15455 return SDValue();
15457 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
15458 if (IdxVal == 0) // the operation is legal
15459 return Op;
15461 MVT VecVT = Vec.getSimpleValueType();
15462 unsigned NumElems = VecVT.getVectorNumElements();
15464 // Extend to natively supported kshift.
15465 MVT WideVecVT = VecVT;
15466 if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
15467 WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
15468 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
15469 DAG.getUNDEF(WideVecVT), Vec,
15470 DAG.getIntPtrConstant(0, dl));
15473 // Shift to the LSB.
15474 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
15475 DAG.getConstant(IdxVal, dl, MVT::i8));
15477 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
15478 DAG.getIntPtrConstant(0, dl));
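// Rough illustration of the kshift approach (assumed shapes, AVX512DQ so the
// v8i1 mask stays in a byte-granular k-register): extracting elements [4..7]
// of a v8i1 mask becomes
//   kshiftrb $4, %k0, %k0    ; move the wanted bits down to the LSBs
// after which the EXTRACT_SUBVECTOR at index 0 built above is legal.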
15481 // Returns the appropriate wrapper opcode for a global reference.
15482 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
15483 // References to absolute symbols are never PC-relative.
15484 if (GV && GV->isAbsoluteSymbolRef())
15485 return X86ISD::Wrapper;
15487 CodeModel::Model M = getTargetMachine().getCodeModel();
15488 if (Subtarget.isPICStyleRIPRel() &&
15489 (M == CodeModel::Small || M == CodeModel::Kernel))
15490 return X86ISD::WrapperRIP;
15492 return X86ISD::Wrapper;
15495 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
15496 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
15497 // one of the above mentioned nodes. It has to be wrapped because otherwise
15498 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
15499 // be used to form addressing modes. These wrapped nodes will be selected
15500 // into MOV32ri.
15501 SDValue
15502 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
15503 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
15505 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15506 // global base reg.
15507 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15509 auto PtrVT = getPointerTy(DAG.getDataLayout());
15510 SDValue Result = DAG.getTargetConstantPool(
15511 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
15512 SDLoc DL(CP);
15513 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15514 // With PIC, the address is actually $g + Offset.
15515 if (OpFlag) {
15516 Result =
15517 DAG.getNode(ISD::ADD, DL, PtrVT,
15518 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15519 }
15521 return Result;
15522 }
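// Sketch of the node shape built here (exposition only, 32-bit PIC case):
//   t0: X86ISD::GlobalBaseReg                  ; $g
//   t1: X86ISD::Wrapper TargetConstantPool<C>  ; Offset
//   t2: add t0, t1                             ; $g + Offset
// In RIP-relative or non-PIC configurations only the wrapped node remains.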
15524 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
15525 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
15527 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15528 // global base reg.
15529 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
15531 auto PtrVT = getPointerTy(DAG.getDataLayout());
15532 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
15533 SDLoc DL(JT);
15534 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15536 // With PIC, the address is actually $g + Offset.
15537 if (OpFlag) {
15538 Result =
15539 DAG.getNode(ISD::ADD, DL, PtrVT,
15540 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15541 }
15543 return Result;
15544 }
15545 SDValue
15546 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
15547 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
15549 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15550 // global base reg.
15551 const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
15552 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
15554 auto PtrVT = getPointerTy(DAG.getDataLayout());
15555 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
15557 SDLoc DL(Op);
15558 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
15560 // With PIC, the address is actually $g + Offset.
15561 if (isPositionIndependent() && !Subtarget.is64Bit()) {
15562 Result =
15563 DAG.getNode(ISD::ADD, DL, PtrVT,
15564 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15565 }
15567 // For symbols that require a load from a stub to get the address, emit the
15568 // load.
15569 if (isGlobalStubReference(OpFlag))
15570 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15571 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15573 return Result;
15574 }
15576 SDValue
15577 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15578 // Create the TargetBlockAddressAddress node.
15579 unsigned char OpFlags =
15580 Subtarget.classifyBlockAddressReference();
15581 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15582 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15584 auto PtrVT = getPointerTy(DAG.getDataLayout());
15585 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15586 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15588 // With PIC, the address is actually $g + Offset.
15589 if (isGlobalRelativeToPICBase(OpFlags)) {
15590 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15591 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15597 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15598 const SDLoc &dl, int64_t Offset,
15599 SelectionDAG &DAG) const {
15600 // Create the TargetGlobalAddress node, folding in the constant
15601 // offset if it is legal.
15602 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15603 CodeModel::Model M = DAG.getTarget().getCodeModel();
15604 auto PtrVT = getPointerTy(DAG.getDataLayout());
15605 SDValue Result;
15606 if (OpFlags == X86II::MO_NO_FLAG &&
15607 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15608 // A direct static reference to a global.
15609 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15610 Offset = 0;
15611 } else {
15612 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15613 }
15615 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15617 // With PIC, the address is actually $g + Offset.
15618 if (isGlobalRelativeToPICBase(OpFlags)) {
15619 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15620 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15623 // For globals that require a load from a stub to get the address, emit the
15625 if (isGlobalStubReference(OpFlags))
15626 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15627 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15629 // If there was a non-zero offset that we didn't fold, create an explicit
15630 // addition for it.
15631 if (Offset != 0)
15632 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15633 DAG.getConstant(Offset, dl, PtrVT));
15635 return Result;
15636 }
15638 SDValue
15639 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15640 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15641 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15642 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15643 }
15645 static SDValue
15646 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15647 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15648 unsigned char OperandFlags, bool LocalDynamic = false) {
15649 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15650 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15651 SDLoc dl(GA);
15652 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15653 GA->getValueType(0),
15654 GA->getOffset(),
15655 OperandFlags);
15657 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15658 : X86ISD::TLSADDR;
15660 if (InFlag) {
15661 SDValue Ops[] = { Chain, TGA, *InFlag };
15662 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15663 } else {
15664 SDValue Ops[] = { Chain, TGA };
15665 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15666 }
15668 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15669 MFI.setAdjustsStack(true);
15670 MFI.setHasCalls(true);
15672 SDValue Flag = Chain.getValue(1);
15673 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15676 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15678 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15681 SDLoc dl(GA); // ? function entry point might be better
15682 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15683 DAG.getNode(X86ISD::GlobalBaseReg,
15684 SDLoc(), PtrVT), InFlag);
15685 InFlag = Chain.getValue(1);
15687 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
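// For reference, a hedged sketch of the 32-bit general-dynamic sequence this
// is aiming at (register allocation and scheduling may differ):
//   leal x@TLSGD(,%ebx,1), %eax
//   call ___tls_get_addr@PLT
// The runtime returns the address in %eax, which is why EAX is the return
// register passed to GetTLSADDR above.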
15690 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15692 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15694 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15695 X86::RAX, X86II::MO_TLSGD);
15696 }
15698 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15699 SelectionDAG &DAG,
15700 const EVT PtrVT,
15701 bool is64Bit) {
15702 SDLoc dl(GA);
15704 // Get the start address of the TLS block for this module.
15705 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15706 .getInfo<X86MachineFunctionInfo>();
15707 MFI->incNumLocalDynamicTLSAccesses();
15709 SDValue Base;
15710 if (is64Bit) {
15711 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15712 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15713 } else {
15714 SDValue InFlag;
15715 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15716 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15717 InFlag = Chain.getValue(1);
15718 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15719 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15720 }
15722 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15723 // of Base.
15725 // Build x@dtpoff.
15726 unsigned char OperandFlags = X86II::MO_DTPOFF;
15727 unsigned WrapperKind = X86ISD::Wrapper;
15728 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15729 GA->getValueType(0),
15730 GA->getOffset(), OperandFlags);
15731 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15733 // Add x@dtpoff with the base.
15734 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15737 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15738 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15739 const EVT PtrVT, TLSModel::Model model,
15740 bool is64Bit, bool isPIC) {
15743 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15744 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15745 is64Bit ? 257 : 256));
15747 SDValue ThreadPointer =
15748 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15749 MachinePointerInfo(Ptr));
15751 unsigned char OperandFlags = 0;
15752 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15754 unsigned WrapperKind = X86ISD::Wrapper;
15755 if (model == TLSModel::LocalExec) {
15756 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15757 } else if (model == TLSModel::InitialExec) {
15758 if (is64Bit) {
15759 OperandFlags = X86II::MO_GOTTPOFF;
15760 WrapperKind = X86ISD::WrapperRIP;
15761 } else {
15762 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15763 }
15764 } else {
15765 llvm_unreachable("Unexpected model");
15766 }
15768 // emit "addl x@ntpoff,%eax" (local exec)
15769 // or "addl x@indntpoff,%eax" (initial exec)
15770 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15771 SDValue TGA =
15772 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15773 GA->getOffset(), OperandFlags);
15774 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15776 if (model == TLSModel::InitialExec) {
15777 if (isPIC && !is64Bit) {
15778 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15779 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15780 Offset);
15781 }
15783 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15784 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15787 // The address of the thread local variable is the add of the thread
15788 // pointer with the offset of the variable.
15789 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15793 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15795 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15797 if (DAG.getTarget().useEmulatedTLS())
15798 return LowerToTLSEmulatedModel(GA, DAG);
15800 const GlobalValue *GV = GA->getGlobal();
15801 auto PtrVT = getPointerTy(DAG.getDataLayout());
15802 bool PositionIndependent = isPositionIndependent();
15804 if (Subtarget.isTargetELF()) {
15805 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15806 switch (model) {
15807 case TLSModel::GeneralDynamic:
15808 if (Subtarget.is64Bit())
15809 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15810 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15811 case TLSModel::LocalDynamic:
15812 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15813 Subtarget.is64Bit());
15814 case TLSModel::InitialExec:
15815 case TLSModel::LocalExec:
15816 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15817 PositionIndependent);
15819 llvm_unreachable("Unknown TLS model.");
15822 if (Subtarget.isTargetDarwin()) {
15823 // Darwin only has one model of TLS. Lower to that.
15824 unsigned char OpFlag = 0;
15825 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15826 X86ISD::WrapperRIP : X86ISD::Wrapper;
15828 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15829 // global base reg.
15830 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15831 if (PIC32)
15832 OpFlag = X86II::MO_TLVP_PIC_BASE;
15833 else
15834 OpFlag = X86II::MO_TLVP;
15835 SDLoc DL(Op);
15836 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15837 GA->getValueType(0),
15838 GA->getOffset(), OpFlag);
15839 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15841 // With PIC32, the address is actually $g + Offset.
15842 if (PIC32)
15843 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15844 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15845 Offset);
15847 // Lowering the machine isd will make sure everything is in the right
15848 // location.
15849 SDValue Chain = DAG.getEntryNode();
15850 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15851 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15852 SDValue Args[] = { Chain, Offset };
15853 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15854 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15855 DAG.getIntPtrConstant(0, DL, true),
15856 Chain.getValue(1), DL);
15858 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
15859 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15860 MFI.setAdjustsStack(true);
15862 // And our return value (tls address) is in the standard call return value
15863 // location.
15864 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15865 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15868 if (Subtarget.isTargetKnownWindowsMSVC() ||
15869 Subtarget.isTargetWindowsItanium() ||
15870 Subtarget.isTargetWindowsGNU()) {
15871 // Just use the implicit TLS architecture
15872 // Need to generate something similar to:
15873 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15875 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15876 // mov rcx, qword [rdx+rcx*8]
15877 // mov eax, .tls$:tlsvar
15878 // [rax+rcx] contains the address
15879 // Windows 64bit: gs:0x58
15880 // Windows 32bit: fs:__tls_array
15883 SDValue Chain = DAG.getEntryNode();
15885 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15886 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15887 // use its literal value of 0x2C.
15888 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15889 ? Type::getInt8PtrTy(*DAG.getContext(),
15891 : Type::getInt32PtrTy(*DAG.getContext(),
15894 SDValue TlsArray = Subtarget.is64Bit()
15895 ? DAG.getIntPtrConstant(0x58, dl)
15896 : (Subtarget.isTargetWindowsGNU()
15897 ? DAG.getIntPtrConstant(0x2C, dl)
15898 : DAG.getExternalSymbol("_tls_array", PtrVT));
15900 SDValue ThreadPointer =
15901 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15903 SDValue res;
15904 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15905 res = ThreadPointer;
15906 } else {
15907 // Load the _tls_index variable
15908 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15909 if (Subtarget.is64Bit())
15910 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15911 MachinePointerInfo(), MVT::i32);
15912 else
15913 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15915 auto &DL = DAG.getDataLayout();
15917 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15918 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15920 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15923 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15925 // Get the offset of start of .tls section
15926 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15927 GA->getValueType(0),
15928 GA->getOffset(), X86II::MO_SECREL);
15929 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15931 // The address of the thread local variable is the add of the thread
15932 // pointer with the offset of the variable.
15933 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15936 llvm_unreachable("TLS not implemented for this target.");
15939 /// Lower SRA_PARTS and friends, which return two i32 values
15940 /// and take a 2 x i32 value to shift plus a shift amount.
15941 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15942 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15943 MVT VT = Op.getSimpleValueType();
15944 unsigned VTBits = VT.getSizeInBits();
15946 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15947 SDValue ShOpLo = Op.getOperand(0);
15948 SDValue ShOpHi = Op.getOperand(1);
15949 SDValue ShAmt = Op.getOperand(2);
15950 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15951 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15953 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15954 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15955 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15956 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15957 : DAG.getConstant(0, dl, VT);
15959 SDValue Tmp2, Tmp3;
15960 if (Op.getOpcode() == ISD::SHL_PARTS) {
15961 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15962 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15964 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15965 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15968 // If the shift amount is larger or equal than the width of a part we can't
15969 // rely on the results of shld/shrd. Insert a test and select the appropriate
15970 // values for large shift amounts.
15971 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15972 DAG.getConstant(VTBits, dl, MVT::i8));
15973 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15974 AndNode, DAG.getConstant(0, dl, MVT::i8));
15977 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15978 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15979 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15981 if (Op.getOpcode() == ISD::SHL_PARTS) {
15982 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15983 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15984 } else {
15985 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15986 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15989 SDValue Ops[2] = { Lo, Hi };
15990 return DAG.getMergeValues(Ops, dl);
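// Worked example (informal, not exact isel output): an i64 SHL_PARTS by %cl
// on a 32-bit target corresponds to
//   Tmp2 = shld(hi, lo, cl)          ; candidate high word
//   Tmp3 = lo << (cl & 31)           ; candidate low word
//   test $32, cl                     ; was the amount >= 32?
//   Hi = (cl & 32) ? Tmp3 : Tmp2     ; CMOV built from Ops0
//   Lo = (cl & 32) ? 0    : Tmp3     ; CMOV built from Ops1 (Tmp1 is 0)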
15993 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15994 SelectionDAG &DAG) const {
15995 SDValue Src = Op.getOperand(0);
15996 MVT SrcVT = Src.getSimpleValueType();
15997 MVT VT = Op.getSimpleValueType();
16000 if (SrcVT.isVector()) {
16001 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
16002 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
16003 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
16004 DAG.getUNDEF(SrcVT)));
16005 }
16006 return SDValue();
16007 }
16009 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
16010 "Unknown SINT_TO_FP to lower!");
16012 // These are really Legal; return the operand so the caller accepts it as
16013 // Legal.
16014 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
16015 return Op;
16016 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
16017 Subtarget.is64Bit()) {
16018 return Op;
16019 }
16021 SDValue ValueToStore = Op.getOperand(0);
16022 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
16023 !Subtarget.is64Bit())
16024 // Bitcasting to f64 here allows us to do a single 64-bit store from
16025 // an SSE register, avoiding the store forwarding penalty that would come
16026 // with two 32-bit stores.
16027 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16029 unsigned Size = SrcVT.getSizeInBits()/8;
16030 MachineFunction &MF = DAG.getMachineFunction();
16031 auto PtrVT = getPointerTy(MF.getDataLayout());
16032 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
16033 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16034 SDValue Chain = DAG.getStore(
16035 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16036 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16037 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
16040 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
16041 SDValue StackSlot,
16042 SelectionDAG &DAG) const {
16043 SDLoc DL(Op);
16045 SDVTList Tys;
16046 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
16047 if (useSSE)
16048 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
16049 else
16050 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
16052 unsigned ByteSize = SrcVT.getSizeInBits()/8;
16054 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
16055 MachineMemOperand *MMO;
16057 int SSFI = FI->getIndex();
16058 MMO = DAG.getMachineFunction().getMachineMemOperand(
16059 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16060 MachineMemOperand::MOLoad, ByteSize, ByteSize);
16062 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
16063 StackSlot = StackSlot.getOperand(1);
16065 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
16066 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
16068 Tys, Ops, SrcVT, MMO);
16071 Chain = Result.getValue(1);
16072 SDValue InFlag = Result.getValue(2);
16074 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
16075 // shouldn't be necessary except that RFP cannot be live across
16076 // multiple blocks. When stackifier is fixed, they can be uncoupled.
16077 MachineFunction &MF = DAG.getMachineFunction();
16078 unsigned SSFISize = Op.getValueSizeInBits()/8;
16079 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
16080 auto PtrVT = getPointerTy(MF.getDataLayout());
16081 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16082 Tys = DAG.getVTList(MVT::Other);
16084 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
16086 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16087 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16088 MachineMemOperand::MOStore, SSFISize, SSFISize);
16090 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
16091 Ops, Op.getValueType(), MMO);
16092 Result = DAG.getLoad(
16093 Op.getValueType(), DL, Chain, StackSlot,
16094 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
16100 /// 64-bit unsigned integer to double expansion.
16101 static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
16102 const X86Subtarget &Subtarget) {
16103 // This algorithm is not obvious. Here is what we're trying to output:
16104 /*
16105 movq %rax, %xmm0
16106 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
16107 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
16108 #ifdef __SSE3__
16109 haddpd %xmm0, %xmm0
16110 #else
16111 pshufd $0x4e, %xmm0, %xmm1
16112 addpd %xmm1, %xmm0
16113 #endif
16114 */
16116 SDLoc dl(Op);
16117 LLVMContext *Context = DAG.getContext();
16119 // Build some magic constants.
16120 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
16121 Constant *C0 = ConstantDataVector::get(*Context, CV0);
16122 auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
16123 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
16125 SmallVector<Constant*,2> CV1;
16127 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16128 APInt(64, 0x4330000000000000ULL))));
16130 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
16131 APInt(64, 0x4530000000000000ULL))));
16132 Constant *C1 = ConstantVector::get(CV1);
16133 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
16135 // Load the 64-bit value into an XMM register.
16136 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
16139 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
16140 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16141 /* Alignment = */ 16);
16143 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
16146 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
16147 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
16148 /* Alignment = */ 16);
16149 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
16150 // TODO: Are there any fast-math-flags to propagate here?
16151 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
16154 if (Subtarget.hasSSE3()) {
16155 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
16156 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
16158 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
16159 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
16160 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
16161 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
16164 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
16165 DAG.getIntPtrConstant(0, dl));
16168 /// 32-bit unsigned integer to float expansion.
16169 static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
16170 const X86Subtarget &Subtarget) {
16172 // FP constant to bias correct the final result.
16173 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
16176 // Load the 32-bit value into an XMM register.
16177 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
16180 // Zero out the upper parts of the register.
16181 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
16183 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16184 DAG.getBitcast(MVT::v2f64, Load),
16185 DAG.getIntPtrConstant(0, dl));
16187 // Or the load with the bias.
16188 SDValue Or = DAG.getNode(
16189 ISD::OR, dl, MVT::v2i64,
16190 DAG.getBitcast(MVT::v2i64,
16191 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
16192 DAG.getBitcast(MVT::v2i64,
16193 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
16195 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
16196 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
16198 // Subtract the bias.
16199 // TODO: Are there any fast-math-flags to propagate here?
16200 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
16202 // Handle final rounding.
16203 return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType());
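// Worked explanation (illustrative, not an extra transform): the bias
// 0x4330000000000000 is the double 2^52. OR'ing a 32-bit value into its low
// mantissa bits yields exactly 2^52 + x, so
//   bitcast<double>(0x4330000000000000 | zext(x)) - 0x1.0p52 == (double)x
// and no integer-to-FP conversion instruction is needed at all.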
16206 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
16207 const X86Subtarget &Subtarget,
16209 if (Op.getSimpleValueType() != MVT::v2f64)
16212 SDValue N0 = Op.getOperand(0);
16213 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
16215 // Legalize to v4i32 type.
16216 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
16217 DAG.getUNDEF(MVT::v2i32));
16219 if (Subtarget.hasAVX512())
16220 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
16222 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
16223 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
16224 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
16225 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
16227 // Two to the power of half-word-size.
16228 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
16230 // Clear upper part of LO, lower HI.
16231 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
16232 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
16234 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
16235 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
16236 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
16238 // Add the two halves.
16239 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
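// The split above is just x = hi * 2^16 + lo evaluated in double precision.
// Example (sketch): x = 0x00010005 gives hi = 1, lo = 5, so
//   fHI = 1.0 * 65536.0, fLO = 5.0, result = 65541.0
// Each half fits in 16 bits, so both CVTSI2P conversions are exact.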
16242 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
16243 const X86Subtarget &Subtarget) {
16244 // The algorithm is the following:
16245 // #ifdef __SSE4_1__
16246 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16247 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16248 // (uint4) 0x53000000, 0xaa);
16249 // #else
16250 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16251 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16252 // #endif
16253 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16254 // return (float4) lo + fhi;
16256 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
16257 // reassociate the two FADDs, and if we do that, the algorithm fails
16258 // spectacularly (PR24512).
16259 // FIXME: If we ever have some kind of Machine FMF, this should be marked
16260 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
16261 // there's also the MachineCombiner reassociations happening on Machine IR.
16262 if (DAG.getTarget().Options.UnsafeFPMath)
16266 SDValue V = Op->getOperand(0);
16267 MVT VecIntVT = V.getSimpleValueType();
16268 bool Is128 = VecIntVT == MVT::v4i32;
16269 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
16270 // If we convert to something other than the supported type, e.g., to v4f64,
16271 // abort early.
16272 if (VecFloatVT != Op->getSimpleValueType(0))
16273 return SDValue();
16275 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
16276 "Unsupported custom type");
16278 // In the #ifdef/#else code, we have in common:
16279 // - The vector of constants:
16280 // -- 0x4b000000
16281 // -- 0x53000000
16282 // - A shift:
16283 // -- v >> 16
16285 // Create the splat vector for 0x4b000000.
16286 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
16287 // Create the splat vector for 0x53000000.
16288 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
16290 // Create the right shift.
16291 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
16292 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
16295 if (Subtarget.hasSSE41()) {
16296 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
16297 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
16298 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
16299 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
16300 // Low will be bitcasted right away, so do not bother bitcasting back to its
16301 // original type.
16302 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
16303 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16304 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
16305 // (uint4) 0x53000000, 0xaa);
16306 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
16307 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
16308 // High will be bitcasted right away, so do not bother bitcasting back to
16309 // its original type.
16310 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
16311 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
16313 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
16314 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
16315 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
16316 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
16318 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
16319 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
16322 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
16323 SDValue VecCstFAdd = DAG.getConstantFP(
16324 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
16326 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
16327 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
16328 // TODO: Are there any fast-math-flags to propagate here?
16330 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
16331 // return (float4) lo + fhi;
16332 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
16333 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
16336 static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
16337 const X86Subtarget &Subtarget) {
16338 SDValue N0 = Op.getOperand(0);
16339 MVT SrcVT = N0.getSimpleValueType();
16342 switch (SrcVT.SimpleTy) {
16344 llvm_unreachable("Custom UINT_TO_FP is not supported!");
16346 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
16349 assert(!Subtarget.hasAVX512());
16350 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
16354 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
16355 SelectionDAG &DAG) const {
16356 SDValue N0 = Op.getOperand(0);
16358 auto PtrVT = getPointerTy(DAG.getDataLayout());
16360 if (Op.getSimpleValueType().isVector())
16361 return lowerUINT_TO_FP_vec(Op, DAG, Subtarget);
16363 MVT SrcVT = N0.getSimpleValueType();
16364 MVT DstVT = Op.getSimpleValueType();
16366 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
16367 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
16368 // Conversions from unsigned i32 to f32/f64 are legal,
16369 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
16370 return Op;
16371 }
16373 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
16374 return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
16375 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
16376 return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
16377 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
16380 // Make a 64-bit buffer, and use it to build an FILD.
16381 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
16382 if (SrcVT == MVT::i32) {
16383 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
16384 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
16385 StackSlot, MachinePointerInfo());
16386 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
16387 OffsetSlot, MachinePointerInfo());
16388 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
16389 return Fild;
16390 }
16392 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
16393 SDValue ValueToStore = Op.getOperand(0);
16394 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
16395 // Bitcasting to f64 here allows us to do a single 64-bit store from
16396 // an SSE register, avoiding the store forwarding penalty that would come
16397 // with two 32-bit stores.
16398 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
16399 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
16400 MachinePointerInfo());
16401 // For i64 source, we need to add the appropriate power of 2 if the input
16402 // was negative. This is the same as the optimization in
16403 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
16404 // we must be careful to do the computation in x87 extended precision, not
16405 // in SSE. (The generic code can't know it's OK to do this, or how to.)
16406 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
16407 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
16408 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
16409 MachineMemOperand::MOLoad, 8, 8);
16411 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
16412 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
16413 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
16416 APInt FF(32, 0x5F800000ULL);
16418 // Check whether the sign bit is set.
16419 SDValue SignSet = DAG.getSetCC(
16420 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
16421 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
16423 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
16424 SDValue FudgePtr = DAG.getConstantPool(
16425 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
16427 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
16428 SDValue Zero = DAG.getIntPtrConstant(0, dl);
16429 SDValue Four = DAG.getIntPtrConstant(4, dl);
16430 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
16431 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
16433 // Load the value out, extending it from f32 to f80.
16434 // FIXME: Avoid the extend by constructing the right constant pool?
16435 SDValue Fudge = DAG.getExtLoad(
16436 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
16437 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
16438 /* Alignment = */ 4);
16439 // Extend everything to 80 bits to force it to be done on x87.
16440 // TODO: Are there any fast-math-flags to propagate here?
16441 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
16442 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
16443 DAG.getIntPtrConstant(0, dl));
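// Why FF is 0x5F800000: that bit pattern is 2^64 as an IEEE-754 float. If the
// i64 input had its sign bit set, the signed FILD above produced x - 2^64, so
// adding the selected fudge value (in x87 extended precision) restores the
// unsigned result; for non-negative inputs the selected constant is 0.0.
// (Explanatory sketch of the code above, not an additional transformation.)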
16446 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
16447 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
16448 // just return an <SDValue(), SDValue()> pair.
16449 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
16450 // to i16, i32 or i64, and we lower it to a legal sequence.
16451 // If lowered to the final integer result we return a <result, SDValue()> pair.
16452 // Otherwise we lower it to a sequence ending with a FIST, return a
16453 // <FIST, StackSlot> pair, and the caller is responsible for loading
16454 // the final integer result from StackSlot.
16455 std::pair<SDValue,SDValue>
16456 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
16457 bool IsSigned, bool IsReplace) const {
16460 EVT DstTy = Op.getValueType();
16461 EVT TheVT = Op.getOperand(0).getValueType();
16462 auto PtrVT = getPointerTy(DAG.getDataLayout());
16464 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
16465 // f16 must be promoted before using the lowering in this routine.
16466 // fp128 does not use this lowering.
16467 return std::make_pair(SDValue(), SDValue());
16470 // If using FIST to compute an unsigned i64, we'll need some fixup
16471 // to handle values above the maximum signed i64. A FIST is always
16472 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
16473 bool UnsignedFixup = !IsSigned &&
16474 DstTy == MVT::i64 &&
16475 (!Subtarget.is64Bit() ||
16476 !isScalarFPTypeInSSEReg(TheVT));
16478 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
16479 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
16480 // The low 32 bits of the fist result will have the correct uint32 result.
16481 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
16482 DstTy = MVT::i64;
16483 }
16485 assert(DstTy.getSimpleVT() <= MVT::i64 &&
16486 DstTy.getSimpleVT() >= MVT::i16 &&
16487 "Unknown FP_TO_INT to lower!");
16489 // These are really Legal.
16490 if (DstTy == MVT::i32 &&
16491 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16492 return std::make_pair(SDValue(), SDValue());
16493 if (Subtarget.is64Bit() &&
16494 DstTy == MVT::i64 &&
16495 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
16496 return std::make_pair(SDValue(), SDValue());
16498 // We lower FP->int64 into FISTP64 followed by a load from a temporary
16499 // stack slot.
16500 MachineFunction &MF = DAG.getMachineFunction();
16501 unsigned MemSize = DstTy.getSizeInBits()/8;
16502 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16503 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16506 switch (DstTy.getSimpleVT().SimpleTy) {
16507 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
16508 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
16509 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
16510 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
16513 SDValue Chain = DAG.getEntryNode();
16514 SDValue Value = Op.getOperand(0);
16515 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
16517 if (UnsignedFixup) {
16519 // Conversion to unsigned i64 is implemented with a select,
16520 // depending on whether the source value fits in the range
16521 // of a signed i64. Let Thresh be the FP equivalent of
16522 // 0x8000000000000000ULL.
16524 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
16525 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
16526 // Fist-to-mem64 FistSrc
16527 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
16528 // to XOR'ing the high 32 bits with Adjust.
16530 // Being a power of 2, Thresh is exactly representable in all FP formats.
16531 // For X87 we'd like to use the smallest FP type for this constant, but
16532 // for DAG type consistency we have to match the FP operand type.
16534 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
16535 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
16536 bool LosesInfo = false;
16537 if (TheVT == MVT::f64)
16538 // The rounding mode is irrelevant as the conversion should be exact.
16539 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16541 else if (TheVT == MVT::f80)
16542 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16543 APFloat::rmNearestTiesToEven, &LosesInfo);
16545 assert(Status == APFloat::opOK && !LosesInfo &&
16546 "FP conversion should have been exact");
16548 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16550 SDValue Cmp = DAG.getSetCC(DL,
16551 getSetCCResultType(DAG.getDataLayout(),
16552 *DAG.getContext(), TheVT),
16553 Value, ThreshVal, ISD::SETLT);
16554 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16555 DAG.getConstant(0, DL, MVT::i32),
16556 DAG.getConstant(0x80000000, DL, MVT::i32));
16557 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16558 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16559 *DAG.getContext(), TheVT),
16560 Value, ThreshVal, ISD::SETLT);
16561 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16564 // FIXME This causes a redundant load/store if the SSE-class value is already
16565 // in memory, such as if it is on the callstack.
16566 if (isScalarFPTypeInSSEReg(TheVT)) {
16567 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16568 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16569 MachinePointerInfo::getFixedStack(MF, SSFI));
16570 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16572 Chain, StackSlot, DAG.getValueType(TheVT)
16575 MachineMemOperand *MMO =
16576 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16577 MachineMemOperand::MOLoad, MemSize, MemSize);
16578 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16579 Chain = Value.getValue(1);
16580 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16581 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16584 MachineMemOperand *MMO =
16585 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16586 MachineMemOperand::MOStore, MemSize, MemSize);
16588 if (UnsignedFixup) {
16590 // Insert the FIST, load its result as two i32's,
16591 // and XOR the high i32 with Adjust.
16593 SDValue FistOps[] = { Chain, Value, StackSlot };
16594 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16595 FistOps, DstTy, MMO);
16598 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16599 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16602 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16603 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16605 if (Subtarget.is64Bit()) {
16606 // Join High32 and Low32 into a 64-bit result.
16607 // (High32 << 32) | Low32
16608 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16609 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16610 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16611 DAG.getConstant(32, DL, MVT::i8));
16612 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16613 return std::make_pair(Result, SDValue());
16616 SDValue ResultOps[] = { Low32, High32 };
16618 SDValue pair = IsReplace
16619 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16620 : DAG.getMergeValues(ResultOps, DL);
16621 return std::make_pair(pair, SDValue());
16623 // Build the FP_TO_INT*_IN_MEM
16624 SDValue Ops[] = { Chain, Value, StackSlot };
16625 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16627 return std::make_pair(FIST, StackSlot);
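// Concrete instance of the unsigned fixup above (exposition only): converting
// Value = 2^63 + 5 to u64 gives Value >= Thresh, so the FIST operates on
// Value - 2^63 and stores 0x0000000000000005; XOR'ing the high 32 bits with
// Adjust = 0x80000000 yields 0x8000000000000005, the expected result.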
16631 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16632 const X86Subtarget &Subtarget) {
16633 MVT VT = Op->getSimpleValueType(0);
16634 SDValue In = Op->getOperand(0);
16635 MVT InVT = In.getSimpleValueType();
16638 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
16639 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16640 "Expected same number of elements");
16641 assert((VT.getVectorElementType() == MVT::i16 ||
16642 VT.getVectorElementType() == MVT::i32 ||
16643 VT.getVectorElementType() == MVT::i64) &&
16644 "Unexpected element type");
16645 assert((InVT.getVectorElementType() == MVT::i8 ||
16646 InVT.getVectorElementType() == MVT::i16 ||
16647 InVT.getVectorElementType() == MVT::i32) &&
16648 "Unexpected element type");
16650 if (Subtarget.hasInt256())
16651 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16653 // Optimize vectors in AVX mode:
16654 //
16655 // v8i16 -> v8i32
16656 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16657 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16658 // Concat upper and lower parts.
16659 //
16660 // v4i32 -> v4i64
16661 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16662 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16663 // Concat upper and lower parts.
16664 //
16666 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16667 SDValue Undef = DAG.getUNDEF(InVT);
16668 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16669 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16670 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16672 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16673 VT.getVectorNumElements()/2);
16675 OpLo = DAG.getBitcast(HVT, OpLo);
16676 OpHi = DAG.getBitcast(HVT, OpHi);
16678 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16681 // Helper to split and extend a v16i1 mask to v16i8 or v16i16.
16682 static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In,
16683 const SDLoc &dl, SelectionDAG &DAG) {
16684 assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT.");
16685 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16686 DAG.getIntPtrConstant(0, dl));
16687 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In,
16688 DAG.getIntPtrConstant(8, dl));
16689 Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo);
16690 Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi);
16691 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi);
16692 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
16695 static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
16696 const X86Subtarget &Subtarget,
16697 SelectionDAG &DAG) {
16698 MVT VT = Op->getSimpleValueType(0);
16699 SDValue In = Op->getOperand(0);
16700 MVT InVT = In.getSimpleValueType();
16701 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
16703 unsigned NumElts = VT.getVectorNumElements();
16705 // For all vectors but vXi8 we can just emit a sign_extend and a shift. This
16706 // avoids a constant pool load.
16707 if (VT.getVectorElementType() != MVT::i8) {
16708 SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In);
16709 return DAG.getNode(ISD::SRL, DL, VT, Extend,
16710 DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
16713 // Extend VT if BWI is not supported.
16715 if (!Subtarget.hasBWI()) {
16716 // If v16i32 is to be avoided, we'll need to split and concatenate.
16717 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
16718 return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG);
16720 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
16723 // Widen to 512-bits if VLX is not supported.
16724 MVT WideVT = ExtVT;
16725 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
16726 NumElts *= 512 / ExtVT.getSizeInBits();
16727 InVT = MVT::getVectorVT(MVT::i1, NumElts);
16728 In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
16729 In, DAG.getIntPtrConstant(0, DL));
16730 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
16734 SDValue One = DAG.getConstant(1, DL, WideVT);
16735 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
16737 SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
16739 // Truncate if we had to extend above.
16741 WideVT = MVT::getVectorVT(MVT::i8, NumElts);
16742 SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal);
16745 // Extract back to 128/256-bit if we widened.
16747 SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
16748 DAG.getIntPtrConstant(0, DL));
16750 return SelectedVal;
16753 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16754 SelectionDAG &DAG) {
16755 SDValue In = Op.getOperand(0);
16756 MVT SVT = In.getSimpleValueType();
16758 if (SVT.getVectorElementType() == MVT::i1)
16759 return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
16761 assert(Subtarget.hasAVX() && "Expected AVX support");
16762 return LowerAVXExtend(Op, DAG, Subtarget);
16765 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16766 /// It makes use of the fact that vectors with enough leading sign/zero bits
16767 /// prevent the PACKSS/PACKUS from saturating the results.
16768 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16769 /// within each 128-bit lane.
16770 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16771 const SDLoc &DL, SelectionDAG &DAG,
16772 const X86Subtarget &Subtarget) {
16773 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16774 "Unexpected PACK opcode");
16776 // Requires SSE2 but AVX512 has fast vector truncate.
16777 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512() || !DstVT.isVector())
16778 return SDValue();
16780 EVT SrcVT = In.getValueType();
16782 // No truncation required, we might get here due to recursive calls.
16783 if (SrcVT == DstVT)
16784 return In;
16786 // We only support vector truncation to 64bits or greater from a
16787 // 128bits or greater source.
16788 unsigned DstSizeInBits = DstVT.getSizeInBits();
16789 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16790 if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0)
16793 unsigned NumElems = SrcVT.getVectorNumElements();
16794 if (!isPowerOf2_32(NumElems))
16797 LLVMContext &Ctx = *DAG.getContext();
16798 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16799 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16801 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16803 // Pack to the largest type possible:
16804 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16805 EVT InVT = MVT::i16, OutVT = MVT::i8;
16806 if (SrcVT.getScalarSizeInBits() > 16 &&
16807 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
16812 // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector.
16813 if (SrcVT.is128BitVector()) {
16814 InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
16815 OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
16816 SDValue Res = DAG.getNode(Opcode, DL, OutVT,
16817 DAG.getBitcast(InVT, In), DAG.getUNDEF(InVT));
16818 Res = extractSubVector(Res, 0, DAG, DL, 64);
16819 return DAG.getBitcast(DstVT, Res);
16822 // Extract lower/upper subvectors.
16823 unsigned NumSubElts = NumElems / 2;
16824 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16825 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16827 unsigned SubSizeInBits = SrcSizeInBits / 2;
16828 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
16829 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
16831 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
16832 if (SrcVT.is256BitVector() && DstVT.is128BitVector()) {
16833 Lo = DAG.getBitcast(InVT, Lo);
16834 Hi = DAG.getBitcast(InVT, Hi);
16835 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16836 return DAG.getBitcast(DstVT, Res);
16839 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
16840 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
16841 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
16842 Lo = DAG.getBitcast(InVT, Lo);
16843 Hi = DAG.getBitcast(InVT, Hi);
16844 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16846 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
16847 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
16848 Res = DAG.getBitcast(MVT::v4i64, Res);
16849 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
16851 if (DstVT.is256BitVector())
16852 return DAG.getBitcast(DstVT, Res);
16854 // If 512bit -> 128bit truncate another stage.
16855 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16856 Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
  }
16860 // Recursively pack lower/upper subvectors, concat result and pack again.
16861 assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
16862 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
16863 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
16864 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
16866 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16867 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
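/// Lower a vector TRUNCATE to vXi1 (an AVX-512 mask type) by moving the
/// interesting bit of each element into the sign position and comparing
/// the result against zero.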
16871 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
16875 MVT VT = Op.getSimpleValueType();
16876 SDValue In = Op.getOperand(0);
16877 MVT InVT = In.getSimpleValueType();
16879 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16881 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16882 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16883 if (InVT.getScalarSizeInBits() <= 16) {
16884 if (Subtarget.hasBWI()) {
16885 // legal, will go to VPMOVB2M, VPMOVW2M
16886 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16887 // We need to shift to get the lsb into sign position.
16888 // Shift packed bytes not supported natively, bitcast to word
16889 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16890 In = DAG.getNode(ISD::SHL, DL, ExtVT,
16891 DAG.getBitcast(ExtVT, In),
16892 DAG.getConstant(ShiftInx, DL, ExtVT));
        In = DAG.getBitcast(InVT, In);
      }
16895 return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
                         In, DAG.getConstant(6, DL, MVT::i8));
    }
16898 // Use TESTD/Q, extended vector to packed dword/qword.
16899 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16900 "Unexpected vector type.");
16901 unsigned NumElts = InVT.getVectorNumElements();
16902 assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements");
16903 // We need to change to a wider element type that we have support for.
16904 // For 8 element vectors this is easy, we either extend to v8i32 or v8i64.
16905 // For 16 element vectors we extend to v16i32 unless we are explicitly
16906 // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
16907 // we need to split into two 8 element vectors which we can extend to v8i32,
16908 // truncate and concat the results. There's an additional complication if
16909 // the original type is v16i8. In that case we can't split the v16i8 so
16910 // first we pre-extend it to v16i16 which we can split to v8i16, then extend
16911 // to v8i32, truncate that to v8i1 and concat the two halves.
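    // For example, v16i8 -> v16i1 without 512-bit vectors becomes:
    //   v16i8 -> v16i16 -> 2 x v8i16 -> 2 x v8i32 -> 2 x v8i1 -> v16i1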
16912 if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
16913 if (InVT == MVT::v16i8) {
16914 // First we need to sign extend up to 256-bits so we can split that.
16915 InVT = MVT::v16i16;
        In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
      }
16918 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
16919 SDValue Hi = extract128BitVector(In, 8, DAG, DL);
16920 // We're split now, just emit two truncates and a concat. The two
16921 // truncates will trigger legalization to come back to this function.
16922 Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
16923 Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi);
      return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
    }
16926 // We either have 8 elements or we're allowed to use 512-bit vectors.
16927 // If we have VLX, we want to use the narrowest vector that can get the
16928 // job done so we use vXi32.
16929 MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts);
16930 MVT ExtVT = MVT::getVectorVT(EltVT, NumElts);
16931 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }
16936 if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) {
16937 // We need to shift to get the lsb into sign position.
16938 In = DAG.getNode(ISD::SHL, DL, InVT, In,
                     DAG.getConstant(ShiftInx, DL, InVT));
  }
16941 // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m.
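  // CMPM immediate 6 is the 'nle' predicate, i.e. (0 > In) signed, which is
  // true exactly when the sign bit of In is set. The non-DQI fallback below
  // uses immediate 4 ('neq') against zero instead, which is equivalent here
  // because each element is now either zero or has its sign bit set.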
16942 if (Subtarget.hasDQI())
16943 return DAG.getNode(X86ISD::CMPM, DL, VT, DAG.getConstant(0, DL, InVT),
16944 In, DAG.getConstant(6, DL, MVT::i8));
16945 return DAG.getNode(X86ISD::CMPM, DL, VT, In,
16946 getZeroVector(InVT, Subtarget, DAG, DL),
                     DAG.getConstant(4, DL, MVT::i8));
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
16952 MVT VT = Op.getSimpleValueType();
16953 SDValue In = Op.getOperand(0);
16954 MVT InVT = In.getSimpleValueType();
16955 unsigned InNumEltBits = InVT.getScalarSizeInBits();
16957 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16958 "Invalid TRUNCATE operation");
16960 if (VT.getVectorElementType() == MVT::i1)
16961 return LowerTruncateVecI1(Op, DAG, Subtarget);
16963 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16964 if (Subtarget.hasAVX512()) {
16965 // word to byte only under BWI
16966 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) { // v16i16 -> v16i8
16967 // Make sure we're allowed to promote 512-bits.
16968 if (Subtarget.canExtendTo512DQ())
16969 return DAG.getNode(ISD::TRUNCATE, DL, VT,
                           getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In,
                                          DAG));
    } else {
      return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
    }
  }
16977 // Truncate with PACKSS if we are truncating a vector with sign-bits that
16978 // extend all the way to the packed/truncated value.
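  // For example, a v8i32 input with at least 17 known sign bits per element
  // can be narrowed to v8i16 with PACKSSDW without any risk of saturation.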
16979 unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
  if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
      return V;
16985 // Truncate with PACKUS if we are truncating a vector with leading zero bits
16986 // that extend all the way to the packed/truncated value.
16987 // Pre-SSE41 we can only use PACKUSWB.
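  // For example, with SSE4.1 a v8i32 input whose upper 16 bits are known zero
  // can be packed with PACKUSDW; before SSE4.1 only the byte-sized PACKUSWB
  // exists, so only the low 8 bits of each element may be nonzero.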
  KnownBits Known;
  DAG.computeKnownBits(In, Known);
  NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
  if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
    if (SDValue V =
            truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
      return V;
16996 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16997 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16998 if (Subtarget.hasInt256()) {
16999 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
17000 In = DAG.getBitcast(MVT::v8i32, In);
17001 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
17002 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }
17006 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17007 DAG.getIntPtrConstant(0, DL));
17008 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17009 DAG.getIntPtrConstant(2, DL));
17010 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17011 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17012 static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }
17016 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
17017 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
17018 if (Subtarget.hasInt256()) {
17019 In = DAG.getBitcast(MVT::v32i8, In);
17021 // The PSHUFB mask:
17022 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
17023 -1, -1, -1, -1, -1, -1, -1, -1,
17024 16, 17, 20, 21, 24, 25, 28, 29,
17025 -1, -1, -1, -1, -1, -1, -1, -1 };
17026 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
17027 In = DAG.getBitcast(MVT::v4i64, In);
17029 static const int ShufMask2[] = {0, 2, -1, -1};
17030 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
17031 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
17032 DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }
17036 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17037 DAG.getIntPtrConstant(0, DL));
17039 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
17040 DAG.getIntPtrConstant(4, DL));
17042 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
17043 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
17045 // The PSHUFB mask:
17046 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
17047 -1, -1, -1, -1, -1, -1, -1, -1};
17049 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
17050 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
17052 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
17053 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
17055 // The MOVLHPS Mask:
17056 static const int ShufMask2[] = {0, 1, 4, 5};
17057 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }
17061 // Handle truncation of V256 to V128 using shuffles.
17062 assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!");
17064 assert(Subtarget.hasAVX() && "256-bit vector without AVX!");
17066 unsigned NumElems = VT.getVectorNumElements();
17067 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
17069 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
17070 // Prepare truncation shuffle mask
17071 for (unsigned i = 0; i != NumElems; ++i)
17072 MaskVec[i] = i * 2;
17073 In = DAG.getBitcast(NVT, In);
17074 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
17075 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
17079 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
17080 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
17081 MVT VT = Op.getSimpleValueType();
17083 if (VT.isVector()) {
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
17087 if (VT == MVT::v2i1 && Src.getSimpleValueType() == MVT::v2f64) {
17088 MVT ResVT = MVT::v4i32;
17089 MVT TruncVT = MVT::v4i1;
17090 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
17091 if (!IsSigned && !Subtarget.hasVLX()) {
17092 // Widen to 512-bits.
17093 ResVT = MVT::v8i32;
17094 TruncVT = MVT::v8i1;
17095 Opc = ISD::FP_TO_UINT;
17096 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
17097 DAG.getUNDEF(MVT::v8f64),
                          Src, DAG.getIntPtrConstant(0, dl));
      }
17100 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
17101 Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
17102 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
                         DAG.getIntPtrConstant(0, dl));
    }
17106 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
17107 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
17108 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
17109 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }
    return SDValue();
  }
17116 assert(!VT.isVector());
17118 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
17119 IsSigned, /*IsReplace=*/ false);
17120 SDValue FIST = Vals.first, StackSlot = Vals.second;
17121 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;
17125 if (StackSlot.getNode())
17126 // Load the result.
17127 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
17135 MVT VT = Op.getSimpleValueType();
17136 SDValue In = Op.getOperand(0);
17137 MVT SVT = In.getSimpleValueType();
17139 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
17141 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
17142 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                  In, DAG.getUNDEF(SVT)));
}
17146 /// The only differences between FABS and FNEG are the mask and the logic op.
17147 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
17148 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
17149 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
17150 "Wrong opcode for lowering FABS or FNEG.");
17152 bool IsFABS = (Op.getOpcode() == ISD::FABS);
17154 // If this is a FABS and it has an FNEG user, bail out to fold the combination
17155 // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
17164 bool IsF128 = (VT == MVT::f128);
17166 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
17167 // decide if we should generate a 16-byte constant mask when we only need 4 or
17168 // 8 bytes for the scalar case.
  MVT LogicVT;
  MVT EltVT;
  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = VT;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }
17189 unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
17193 const fltSemantics &Sem =
17194 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
17195 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17196 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
17198 SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
17202 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
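  // FABS clears the sign bit (AND with 0x7f...f), FNEG flips it (XOR with
  // 0x80...0), and the folded FNEG(FABS(x)) case sets it directly
  // (OR with 0x80...0), which is why both the mask and the opcode vary.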
17204 if (VT.isVector() || IsF128)
17205 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17207 // For the scalar case extend to a 128-bit vector, perform the logic op,
17208 // and extract the scalar result back out.
17209 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
17210 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
17211 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
17215 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
17216 SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);
17220 // If the sign operand is smaller, extend it first.
17221 MVT VT = Op.getSimpleValueType();
17222 if (Sign.getSimpleValueType().bitsLT(VT))
17223 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
17225 // And if it is bigger, shrink it first.
17226 if (Sign.getSimpleValueType().bitsGT(VT))
17227 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
17229 // At this point the operands and the result should have the same
17230 // type, and that won't be f80 since that is not custom lowered.
17231 bool IsF128 = (VT == MVT::f128);
17232 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
17233 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
17234 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
17235 "Unexpected type in LowerFCOPYSIGN");
17237 MVT EltVT = VT.getScalarType();
17238 const fltSemantics &Sem =
17239 EltVT == MVT::f64 ? APFloat::IEEEdouble()
17240 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
17242 // Perform all scalar logic operations as 16-byte vectors because there are no
17243 // scalar FP logic instructions in SSE.
17244 // TODO: This isn't necessary. If we used scalar types, we might avoid some
17245 // unnecessary splats, but we might miss load folding opportunities. Should
17246 // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
17252 // The mask constants are automatically splatted for vector types.
17253 unsigned EltSizeInBits = VT.getScalarSizeInBits();
17254 SDValue SignMask = DAG.getConstantFP(
17255 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
17256 SDValue MagMask = DAG.getConstantFP(
17257 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
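  // The result is assembled as (Mag & ~SignMask) | (Sign & SignMask): the
  // magnitude bits come from the first operand and only the sign bit comes
  // from the second.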
17259 // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
17262 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
17264 // Next, clear the sign bit from the first operand (magnitude).
17265 // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }
17279 // OR the magnitude value with the sign bit.
17280 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
17281 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
17285 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
17288 MVT VT = Op.getSimpleValueType();
17290 MVT OpVT = N0.getSimpleValueType();
17291 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
17292 "Unexpected type for FGETSIGN");
17294 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
17295 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
17296 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
17297 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
17298 Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
17303 /// Helper for creating a X86ISD::SETCC node.
17304 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17305 SelectionDAG &DAG) {
17306 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}
17310 // Check whether an OR'd tree is PTEST-able.
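// The pattern arises when a vector is bitcast to a wide integer and compared
// against zero: every element is extracted and OR'd together, so the whole
// tree is zero iff a PTEST of the (re-)OR'd source vectors sets ZF.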
17311 static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
17312 const X86Subtarget &Subtarget,
17313 SelectionDAG &DAG) {
17314 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
17316 if (!Subtarget.hasSSE41())
17319 if (!Op->hasOneUse())
17322 SDNode *N = Op.getNode();
17325 SmallVector<SDValue, 8> Opnds;
17326 DenseMap<SDValue, unsigned> VecInMap;
17327 SmallVector<SDValue, 8> VecIns;
17328 EVT VT = MVT::Other;
  // Recognize a special case where a vector is cast into a wide integer to
  // test all 0s.
17332 Opnds.push_back(N->getOperand(0));
17333 Opnds.push_back(N->getOperand(1));
17335 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
17336 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
17337 // BFS traverse all OR'd operands.
17338 if (I->getOpcode() == ISD::OR) {
17339 Opnds.push_back(I->getOperand(0));
17340 Opnds.push_back(I->getOperand(1));
17341 // Re-evaluate the number of nodes to be traversed.
17342 e += 2; // 2 more nodes (LHS and RHS) are pushed.
    // Quit if this is not an EXTRACT_VECTOR_ELT.
17347 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
    // Quit if the index is not a constant.
17351 SDValue Idx = I->getOperand(1);
17352 if (!isa<ConstantSDNode>(Idx))
17355 SDValue ExtractedFromVec = I->getOperand(0);
17356 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
17357 if (M == VecInMap.end()) {
17358 VT = ExtractedFromVec.getValueType();
17359 // Quit if not 128/256-bit vector.
17360 if (!VT.is128BitVector() && !VT.is256BitVector())
17362 // Quit if not the same type.
17363 if (VecInMap.begin() != VecInMap.end() &&
17364 VT != VecInMap.begin()->first.getValueType())
17366 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
17367 VecIns.push_back(ExtractedFromVec);
17369 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
17372 assert((VT.is128BitVector() || VT.is256BitVector()) &&
17373 "Not extracted from 128-/256-bit vector.");
17375 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
17377 for (DenseMap<SDValue, unsigned>::const_iterator
17378 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
17379 // Quit if not all elements are used.
17380 if (I->second != FullMask)
17384 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
17386 // Cast all vectors into TestVT for PTEST.
17387 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
17388 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
17390 // If more than one full vector is evaluated, OR them first before PTEST.
17391 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
17392 // Each iteration will OR 2 nodes and append the result until there is only
17393 // 1 node left, i.e. the final OR'd value of all vectors.
17394 SDValue LHS = VecIns[Slot];
17395 SDValue RHS = VecIns[Slot + 1];
17396 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
17399 SDValue Res = DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
17400 VecIns.back(), VecIns.back());
17401 return getSETCC(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE, Res, DL, DAG);
17404 /// \brief return true if \c Op has a use that doesn't just read flags.
17405 static bool hasNonFlagsUse(SDValue Op) {
17406 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
17408 SDNode *User = *UI;
17409 unsigned UOpNo = UI.getOperandNo();
17410 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
17412 UOpNo = User->use_begin().getOperandNo();
17413 User = *User->use_begin();
17416 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
17417 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
17425 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
17426 SelectionDAG &DAG) const {
17427 // CF and OF aren't always set the way we want. Determine which
17428 // of these we need.
17429 bool NeedCF = false;
17430 bool NeedOF = false;
17433 case X86::COND_A: case X86::COND_AE:
17434 case X86::COND_B: case X86::COND_BE:
17437 case X86::COND_G: case X86::COND_GE:
17438 case X86::COND_L: case X86::COND_LE:
17439 case X86::COND_O: case X86::COND_NO: {
17440 // Check if we really need to set the
17441 // Overflow flag. If NoSignedWrap is present
17442 // that is not actually needed.
17443 switch (Op->getOpcode()) {
17448 if (Op.getNode()->getFlags().hasNoSignedWrap())
17458 // See if we can use the EFLAGS value from the operand instead of
17459 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
17460 // we prove that the arithmetic won't overflow, we can't use OF or CF.
17461 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
17462 // Emit a CMP with 0, which is the TEST pattern.
17463 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17464 DAG.getConstant(0, dl, Op.getValueType()));
17466 unsigned Opcode = 0;
17467 unsigned NumOperands = 0;
17469 // Truncate operations may prevent the merge of the SETCC instruction
17470 // and the arithmetic instruction before it. Attempt to truncate the operands
17471 // of the arithmetic instruction and use a reduced bit-width instruction.
17472 bool NeedTruncation = false;
17473 SDValue ArithOp = Op;
17474 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
17475 SDValue Arith = Op->getOperand(0);
17476 // Both the trunc and the arithmetic op need to have one user each.
17477 if (Arith->hasOneUse())
17478 switch (Arith.getOpcode()) {
17485 NeedTruncation = true;
17491 // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. The SRL/SHL variant should be preferred for masks longer than
  // this number of bits.
17494 const int ShiftToAndMaxMaskWidth = 32;
17495 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
17497 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
17498 // which may be the result of a CAST. We use the variable 'Op', which is the
17499 // non-casted variable when we check for possible users.
17500 switch (ArithOp.getOpcode()) {
17502 // We only want to rewrite this as a target-specific node with attached
17503 // flags if there is a reasonable chance of either using that to do custom
17504 // instructions selection that can fold some of the memory operands, or if
17505 // only the flags are used. If there are other uses, leave the node alone
17506 // and emit a test instruction.
17507 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17508 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17509 if (UI->getOpcode() != ISD::CopyToReg &&
17510 UI->getOpcode() != ISD::SETCC &&
17511 UI->getOpcode() != ISD::STORE)
17514 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
17515 // An add of one will be selected as an INC.
17517 (!Subtarget.slowIncDec() ||
17518 DAG.getMachineFunction().getFunction().optForSize())) {
17519 Opcode = X86ISD::INC;
17524 // An add of negative one (subtract of one) will be selected as a DEC.
17525 if (C->isAllOnesValue() &&
17526 (!Subtarget.slowIncDec() ||
17527 DAG.getMachineFunction().getFunction().optForSize())) {
17528 Opcode = X86ISD::DEC;
17534 // Otherwise use a regular EFLAGS-setting add.
17535 Opcode = X86ISD::ADD;
17540 // If we have a constant logical shift that's only used in a comparison
17541 // against zero turn it into an equivalent AND. This allows turning it into
17542 // a TEST instruction later.
17543 if (ZeroCheck && Op->hasOneUse() &&
17544 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
17545 EVT VT = Op.getValueType();
17546 unsigned BitWidth = VT.getSizeInBits();
17547 unsigned ShAmt = Op->getConstantOperandVal(1);
17548 if (ShAmt >= BitWidth) // Avoid undefined shifts.
17550 APInt Mask = ArithOp.getOpcode() == ISD::SRL
17551 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
17552 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
17553 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17555 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
17556 DAG.getConstant(Mask, dl, VT));
17561 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
17562 // because a TEST instruction will be better. However, AND should be
17563 // preferred if the instruction can be combined into ANDN.
17564 if (!hasNonFlagsUse(Op)) {
17565 SDValue Op0 = ArithOp->getOperand(0);
17566 SDValue Op1 = ArithOp->getOperand(1);
17567 EVT VT = ArithOp.getValueType();
17568 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
17569 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
17570 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
17572 // If we cannot select an ANDN instruction, check if we can replace
17573 // AND+IMM64 with a shift before giving up. This is possible for masks
17574 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
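      // For example, for an i64 value, (X & 0xFFFF000000000000) == 0 can be
      // tested as (X >> 48) == 0, avoiding a 64-bit AND immediate that a TEST
      // instruction cannot encode.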
17575 if (!isProperAndn) {
17579 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
17580 auto *CN = dyn_cast<ConstantSDNode>(Op1);
17584 const APInt &Mask = CN->getAPIntValue();
17585 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
17586 break; // Prefer TEST instruction.
17588 unsigned BitWidth = Mask.getBitWidth();
17589 unsigned LeadingOnes = Mask.countLeadingOnes();
17590 unsigned TrailingZeros = Mask.countTrailingZeros();
17592 if (LeadingOnes + TrailingZeros == BitWidth) {
17593 assert(TrailingZeros < VT.getSizeInBits() &&
17594 "Shift amount should be less than the type width");
17595 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17596 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
17597 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
17601 unsigned LeadingZeros = Mask.countLeadingZeros();
17602 unsigned TrailingOnes = Mask.countTrailingOnes();
17604 if (LeadingZeros + TrailingOnes == BitWidth) {
17605 assert(LeadingZeros < VT.getSizeInBits() &&
17606 "Shift amount should be less than the type width");
17607 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
17608 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
17609 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
17620 // Similar to ISD::ADD above, check if the uses will preclude useful
17621 // lowering of the target-specific node.
17622 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
17623 UE = Op.getNode()->use_end(); UI != UE; ++UI)
17624 if (UI->getOpcode() != ISD::CopyToReg &&
17625 UI->getOpcode() != ISD::SETCC &&
17626 UI->getOpcode() != ISD::STORE)
17629 // Otherwise use a regular EFLAGS-setting instruction.
17630 switch (ArithOp.getOpcode()) {
17631 default: llvm_unreachable("unexpected operator!");
17632 case ISD::SUB: Opcode = X86ISD::SUB; break;
17633 case ISD::XOR: Opcode = X86ISD::XOR; break;
17634 case ISD::AND: Opcode = X86ISD::AND; break;
17635 case ISD::OR: Opcode = X86ISD::OR; break;
17647 return SDValue(Op.getNode(), 1);
17653 // If we found that truncation is beneficial, perform the truncation and
17655 if (NeedTruncation) {
17656 EVT VT = Op.getValueType();
17657 SDValue WideVal = Op->getOperand(0);
17658 EVT WideVT = WideVal.getValueType();
17659 unsigned ConvertedOp = 0;
17660 // Use a target machine opcode to prevent further DAGCombine
17661 // optimizations that may separate the arithmetic operations
17662 // from the setcc node.
17663 switch (WideVal.getOpcode()) {
17665 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17666 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17667 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17668 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17669 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17673 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17674 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17675 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17676 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17677 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17678 Op = DAG.getNode(ConvertedOp, dl, VTs, V0, V1);
17684 // Emit a CMP with 0, which is the TEST pattern.
17685 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17686 DAG.getConstant(0, dl, Op.getValueType()));
17688 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17689 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17691 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17692 DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New);
17693 return SDValue(New.getNode(), 1);
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
17698 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17699 const SDLoc &dl, SelectionDAG &DAG) const {
17700 if (isNullConstant(Op1))
17701 return EmitTest(Op0, X86CC, dl, DAG);
17703 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17704 "Unexpected comparison operation for MVT::i1 operands");
17706 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17707 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17708 // Only promote the compare up to I32 if it is a 16 bit operation
17709 // with an immediate. 16 bit immediates are to be avoided.
17710 if ((Op0.getValueType() == MVT::i16 &&
17711 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17712 !DAG.getMachineFunction().getFunction().optForMinSize() &&
17713 !Subtarget.isAtom()) {
17714 unsigned ExtendOp =
17715 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17716 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17717 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17719 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17720 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17721 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17722 return SDValue(Sub.getNode(), 1);
17724 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17727 /// Convert a comparison if required by the subtarget.
17728 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17729 SelectionDAG &DAG) const {
17730 // If the subtarget does not support the FUCOMI instruction, floating-point
17731 // comparisons have to be converted.
17732 if (Subtarget.hasCMov() ||
17733 Cmp.getOpcode() != X86ISD::CMP ||
17734 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17735 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17738 // The instruction selector will select an FUCOM instruction instead of
17739 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17740 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17741 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17743 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17744 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17745 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17746 DAG.getConstant(8, dl, MVT::i8));
17747 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
17749 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17750 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17751 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17754 /// Check if replacement of SQRT with RSQRT should be disabled.
17755 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17756 EVT VT = Op.getValueType();
17758 // We never want to use both SQRT and RSQRT instructions for the same input.
17759 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17763 return Subtarget.hasFastVectorFSQRT();
17764 return Subtarget.hasFastScalarFSQRT();
17767 /// The minimum architected relative accuracy is 2^-12. We need one
17768 /// Newton-Raphson step to have a good float result (24 bits of precision).
17769 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17770 SelectionDAG &DAG, int Enabled,
17771 int &RefinementSteps,
17772 bool &UseOneConstNR,
17773 bool Reciprocal) const {
17774 EVT VT = Op.getValueType();
17776 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17777 // TODO: Add support for AVX512 (v16f32).
17778 // It is likely not profitable to do this for f64 because a double-precision
17779 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17780 // instructions: convert to single, rsqrtss, convert back to double, refine
17781 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17782 // along with FMA, this could be a throughput win.
17783 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17784 // after legalize types.
17785 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17786 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17787 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17788 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17789 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17790 RefinementSteps = 1;
17792 UseOneConstNR = false;
17793 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
17798 /// The minimum architected relative accuracy is 2^-12. We need one
17799 /// Newton-Raphson step to have a good float result (24 bits of precision).
17800 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17802 int &RefinementSteps) const {
17803 EVT VT = Op.getValueType();
17805 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17806 // TODO: Add support for AVX512 (v16f32).
17807 // It is likely not profitable to do this for f64 because a double-precision
17808 // reciprocal estimate with refinement on x86 prior to FMA requires
17809 // 15 instructions: convert to single, rcpss, convert back to double, refine
17810 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17811 // along with FMA, this could be a throughput win.
17813 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17814 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17815 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17816 // Enable estimate codegen with 1 refinement step for vector division.
17817 // Scalar division estimates are disabled because they break too much
17818 // real-world code. These defaults are intended to match GCC behavior.
17819 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
17822 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17823 RefinementSteps = 1;
17825 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
17830 /// If we have at least two divisions that use the same divisor, convert to
17831 /// multiplication by a reciprocal. This may need to be adjusted for a given
17832 /// CPU if a division's cost is not at least twice the cost of a multiplication.
17833 /// This is because we still need one division to calculate the reciprocal and
17834 /// then we need two multiplies by that reciprocal as replacements for the
17835 /// original divisions.
17836 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
17840 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17841 /// according to equal/not-equal condition code \p CC.
17842 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17843 const SDLoc &dl, SelectionDAG &DAG) {
17844 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17845 // instruction. Since the shift amount is in-range-or-undefined, we know
17846 // that doing a bittest on the i32 value is ok. We extend to i32 because
17847 // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
17849 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17850 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17852 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17853 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17854 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17855 // known to be zero.
17856 if (Src.getValueType() == MVT::i64 &&
17857 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17858 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17860 // If the operand types disagree, extend the shift amount to match. Since
17861 // BT ignores high bits (like shifts) we can use anyextend.
17862 if (Src.getValueType() != BitNo.getValueType())
17863 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17865 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17866 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17867 return getSETCC(Cond, BT, dl , DAG);
17870 /// Result of 'and' is compared against zero. Change to a BT node if possible.
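/// For example, both ((X >> N) & 1) != 0 and (X & (1 << N)) != 0 become
/// BT X, N, with the result read out of the carry flag.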
17871 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17872 const SDLoc &dl, SelectionDAG &DAG) {
17873 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17874 SDValue Op0 = And.getOperand(0);
17875 SDValue Op1 = And.getOperand(1);
17876 if (Op0.getOpcode() == ISD::TRUNCATE)
17877 Op0 = Op0.getOperand(0);
17878 if (Op1.getOpcode() == ISD::TRUNCATE)
17879 Op1 = Op1.getOperand(0);
17882 if (Op1.getOpcode() == ISD::SHL)
17883 std::swap(Op0, Op1);
17884 if (Op0.getOpcode() == ISD::SHL) {
17885 if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
17888 unsigned BitWidth = Op0.getValueSizeInBits();
17889 unsigned AndBitWidth = And.getValueSizeInBits();
17890 if (BitWidth > AndBitWidth) {
17892 DAG.computeKnownBits(Op0, Known);
17893 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17897 RHS = Op0.getOperand(1);
17899 } else if (Op1.getOpcode() == ISD::Constant) {
17900 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17901 uint64_t AndRHSVal = AndRHS->getZExtValue();
17902 SDValue AndLHS = Op0;
17904 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17905 LHS = AndLHS.getOperand(0);
17906 RHS = AndLHS.getOperand(1);
17908 // Use BT if the immediate can't be encoded in a TEST instruction or we
    // are optimizing for size and the immediate won't fit in a byte.
17910 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
17911 if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
17912 isPowerOf2_64(AndRHSVal)) {
17914 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17920 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CC.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                                   SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;
  // SSE Condition code mapping:
  //   0-EQ, 1-LT, 2-LE, 3-UNORD, 4-NEQ, 5-NLT, 6-NLE, 7-ORD
17941 switch (SetCCOpcode) {
17942 default: llvm_unreachable("Unexpected SETCC condition");
17944 case ISD::SETEQ: SSECC = 0; break;
17946 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17948 case ISD::SETOLT: SSECC = 1; break;
17950 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17952 case ISD::SETOLE: SSECC = 2; break;
17953 case ISD::SETUO: SSECC = 3; break;
17955 case ISD::SETNE: SSECC = 4; break;
17956 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17957 case ISD::SETUGE: SSECC = 5; break;
17958 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17959 case ISD::SETUGT: SSECC = 6; break;
17960 case ISD::SETO: SSECC = 7; break;
17961 case ISD::SETUEQ: SSECC = 8; break;
17962 case ISD::SETONE: SSECC = 12; break;
17965 std::swap(Op0, Op1);
/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17971 /// concatenate the result back.
17972 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17973 MVT VT = Op.getSimpleValueType();
17975 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17976 "Unsupported value type for operation");
17978 unsigned NumElems = VT.getVectorNumElements();
17980 SDValue CC = Op.getOperand(2);
17982 // Extract the LHS vectors
17983 SDValue LHS = Op.getOperand(0);
17984 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17985 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17987 // Extract the RHS vectors
17988 SDValue RHS = Op.getOperand(1);
17989 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17990 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17992 // Issue the operation on the smaller types and concatenate the result back
17993 MVT EltVT = VT.getVectorElementType();
17994 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17995 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17996 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17997 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
18000 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
18002 SDValue Op0 = Op.getOperand(0);
18003 SDValue Op1 = Op.getOperand(1);
18004 SDValue CC = Op.getOperand(2);
18005 MVT VT = Op.getSimpleValueType();
18008 assert(VT.getVectorElementType() == MVT::i1 &&
18009 "Cannot set masked compare for this operation");
18011 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
18013 // If this is a seteq make sure any build vectors of all zeros are on the RHS.
18014 // This helps with vptestm matching.
18015 // TODO: Should we just canonicalize the setcc during DAG combine?
18016 if ((SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE) &&
18017 ISD::isBuildVectorAllZeros(Op0.getNode()))
18018 std::swap(Op0, Op1);
18022 switch (SetCCOpcode) {
18023 default: llvm_unreachable("Unexpected SETCC condition");
18024 case ISD::SETNE: SSECC = 4; break;
18025 case ISD::SETEQ: SSECC = 0; break;
18026 case ISD::SETULT: SSECC = 1; break;
18027 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
18029 case ISD::SETGT: SSECC = 6; break;
18031 case ISD::SETGE: SSECC = 5; break;
18033 case ISD::SETLE: SSECC = 2; break;
18036 std::swap(Op0, Op1);
18038 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode) ? X86ISD::CMPMU
18040 return DAG.getNode(Opc, dl, VT, Op0, Op1,
18041 DAG.getConstant(SSECC, dl, MVT::i8));
18044 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
18045 /// operand \p Op1. If non-trivial (for example because it's not constant)
18046 /// return an empty value.
18047 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
18048 SelectionDAG &DAG) {
18049 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
18053 MVT VT = Op1.getSimpleValueType();
18054 MVT EVT = VT.getVectorElementType();
18055 unsigned n = VT.getVectorNumElements();
18056 SmallVector<SDValue, 8> ULTOp1;
18058 for (unsigned i = 0; i < n; ++i) {
18059 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
18060 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
18063 // Avoid underflow.
18064 APInt Val = Elt->getAPIntValue();
18068 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
18071 return DAG.getBuildVector(VT, dl, ULTOp1);
18074 /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for
///   Op0 u<= Op1:
///   t = psubus Op0, Op1
18077 /// pcmpeq t, <0..0>
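/// The unsigned saturating subtract is zero exactly when Op0 u<= Op1
/// element-wise, so comparing its result against zero yields the ULE mask
/// directly, without having to bias the sign bits of the inputs.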
18078 static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
18079 ISD::CondCode Cond, const SDLoc &dl,
18080 const X86Subtarget &Subtarget,
18081 SelectionDAG &DAG) {
18082 if (!Subtarget.hasSSE2())
18085 MVT VET = VT.getVectorElementType();
18086 if (VET != MVT::i8 && VET != MVT::i16)
18092 case ISD::SETULT: {
18093 // If the comparison is against a constant we can turn this into a
18094 // setule. With psubus, setule does not require a swap. This is
18095 // beneficial because the constant in the register is no longer
    // clobbered as the destination operand, so it can be hoisted out of a loop.
18097 // Only do this pre-AVX since vpcmp* is no longer destructive.
18098 if (Subtarget.hasAVX())
18100 SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
18106 // Psubus is better than flip-sign because it requires no inversion.
18108 std::swap(Op0, Op1);
18114 SDValue Result = DAG.getNode(X86ISD::SUBUS, dl, VT, Op0, Op1);
18115 return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
18116 getZeroVector(VT, Subtarget, DAG, dl));
18119 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
18120 SelectionDAG &DAG) {
18121 SDValue Op0 = Op.getOperand(0);
18122 SDValue Op1 = Op.getOperand(1);
18123 SDValue CC = Op.getOperand(2);
18124 MVT VT = Op.getSimpleValueType();
18125 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
18126 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
18131 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
18132 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
18136 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
18137 assert(VT.getVectorNumElements() <= 16);
18138 Opc = X86ISD::CMPM;
18140 Opc = X86ISD::CMPP;
18141 // The SSE/AVX packed FP comparison nodes are defined with a
18142 // floating-point vector result that matches the operand type. This allows
18143 // them to work with an SSE1 target (integer vector types are not legal).
18144 VT = Op0.getSimpleValueType();
18147 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
18148 // emit two comparisons and a logic op to tie them together.
18150 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
18151 if (SSECC >= 8 && !Subtarget.hasAVX()) {
18152 // LLVM predicate is SETUEQ or SETONE.
18154 unsigned CombineOpc;
18155 if (Cond == ISD::SETUEQ) {
18158 CombineOpc = X86ISD::FOR;
18160 assert(Cond == ISD::SETONE);
18163 CombineOpc = X86ISD::FAND;
18166 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18167 DAG.getConstant(CC0, dl, MVT::i8));
18168 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
18169 DAG.getConstant(CC1, dl, MVT::i8));
18170 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
18172 // Handle all other FP comparisons here.
18173 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
18174 DAG.getConstant(SSECC, dl, MVT::i8));
18177 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
18178 // result type of SETCC. The bitcast is expected to be optimized away
18179 // during combining/isel.
18180 if (Opc == X86ISD::CMPP)
18181 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
18186 MVT VTOp0 = Op0.getSimpleValueType();
18187 assert(VTOp0 == Op1.getSimpleValueType() &&
18188 "Expected operands with same type!");
18189 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
18190 "Invalid number of packed elements for source and destination!");
18192 // This is being called by type legalization because v2i32 is marked custom
18193 // for result type legalization for v2f32.
18194 if (VTOp0 == MVT::v2i32)
18197 // The non-AVX512 code below works under the assumption that source and
18198 // destination types are the same.
18199 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
18200 "Value types for source and destination must be the same!");
18202 // Break 256-bit integer vector compare into smaller ones.
18203 if (VT.is256BitVector() && !Subtarget.hasInt256())
18204 return Lower256IntVSETCC(Op, DAG);
18206 // The result is boolean, but operands are int/float
18207 if (VT.getVectorElementType() == MVT::i1) {
18208 // In AVX-512 architecture setcc returns mask with i1 elements,
18209 // But there is no compare instruction for i8 and i16 elements in KNL.
18210 assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) &&
18211 "Unexpected operand type");
18212 return LowerIntVSETCC_AVX512(Op, DAG);
18215 // Lower using XOP integer comparisons.
18216 if (VT.is128BitVector() && Subtarget.hasXOP()) {
18217 // Translate compare code to XOP PCOM compare mode.
18218 unsigned CmpMode = 0;
18220 default: llvm_unreachable("Unexpected SETCC condition");
18222 case ISD::SETLT: CmpMode = 0x00; break;
18224 case ISD::SETLE: CmpMode = 0x01; break;
18226 case ISD::SETGT: CmpMode = 0x02; break;
18228 case ISD::SETGE: CmpMode = 0x03; break;
18229 case ISD::SETEQ: CmpMode = 0x04; break;
18230 case ISD::SETNE: CmpMode = 0x05; break;
18233 // Are we comparing unsigned or signed integers?
18235 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
18237 return DAG.getNode(Opc, dl, VT, Op0, Op1,
18238 DAG.getConstant(CmpMode, dl, MVT::i8));
18241 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
18242 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
18243 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
18244 SDValue BC0 = peekThroughBitcasts(Op0);
18245 if (BC0.getOpcode() == ISD::AND) {
18247 SmallVector<APInt, 64> EltBits;
18248 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
18249 VT.getScalarSizeInBits(), UndefElts,
18250 EltBits, false, false)) {
18251 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
18253 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
18259 // If this is a SETNE against the signed minimum value, change it to SETGT.
18260 // If this is a SETNE against the signed maximum value, change it to SETLT
18261 // which will be swapped to SETGT.
18262 // Otherwise we use PCMPEQ+invert.
18264 if (Cond == ISD::SETNE &&
18265 ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) {
18266 if (ConstValue.isMinSignedValue())
18268 else if (ConstValue.isMaxSignedValue())
18272 // If both operands are known non-negative, then an unsigned compare is the
18273 // same as a signed compare and there's no need to flip signbits.
18274 // TODO: We could check for more general simplifications here since we're
18275 // computing known bits.
18276 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
18277 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
18279 // Special case: Use min/max operations for unsigned compares. We only want
18280 // to do this for unsigned compares if we need to flip signs or if it allows
  // us to avoid an invert.
18282 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18283 if (ISD::isUnsignedIntSetCC(Cond) &&
18284 (FlipSigns || ISD::isTrueWhenEqual(Cond)) &&
18285 TLI.isOperationLegal(ISD::UMIN, VT)) {
18286 bool Invert = false;
18289 default: llvm_unreachable("Unexpected condition code");
18290 case ISD::SETUGT: Invert = true; LLVM_FALLTHROUGH;
18291 case ISD::SETULE: Opc = ISD::UMIN; break;
18292 case ISD::SETULT: Invert = true; LLVM_FALLTHROUGH;
18293 case ISD::SETUGE: Opc = ISD::UMAX; break;
18296 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18297 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
18299 // If the logical-not of the result is required, perform that now.
18301 Result = DAG.getNOT(dl, Result, VT);
18306 // Try to use SUBUS and PCMPEQ.
18307 if (SDValue V = LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG))
18310 // We are handling one of the integer comparisons here. Since SSE only has
18311 // GT and EQ comparisons for integer, swapping operands and multiple
18312 // operations may be required for some comparisons.
18313 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
18315 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
18316 Cond == ISD::SETGE || Cond == ISD::SETUGE;
18317 bool Invert = Cond == ISD::SETNE ||
18318 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
18321 std::swap(Op0, Op1);
18323 // Check that the operation in question is available (most are plain SSE2,
18324 // but PCMPGTQ and PCMPEQQ have different requirements).
18325 if (VT == MVT::v2i64) {
18326 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
18327 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
18329 // First cast everything to the right type.
18330 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18331 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18333 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18334 // bits of the inputs before performing those operations. The lower
18335 // compare is always unsigned.
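      // For a signed 64-bit compare the high halves keep their original sign
      // bits, while the low halves are biased by 0x80000000 so that the signed
      // PCMPGTD below effectively compares them as unsigned. For an unsigned
      // 64-bit compare (FlipSigns) both halves are biased.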
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
18344 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
18345 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
18347 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
18348 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
18349 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
18351 // Create masks for only the low parts/high parts of the 64 bit integers.
18352 static const int MaskHi[] = { 1, 1, 3, 3 };
18353 static const int MaskLo[] = { 0, 0, 2, 2 };
18354 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
18355 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
18356 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
18358 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
18359 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
18362 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18364 return DAG.getBitcast(VT, Result);
18367 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
18368 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
18369 // pcmpeqd + pshufd + pand.
18370 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
18372 // First cast everything to the right type.
18373 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
18374 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
18377 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
18379 // Make sure the lower and upper halves are both all-ones.
18380 static const int Mask[] = { 1, 0, 3, 2 };
18381 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
18382 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
18385 Result = DAG.getNOT(dl, Result, MVT::v4i32);
18387 return DAG.getBitcast(VT, Result);
18391 // Since SSE has no unsigned integer comparisons, we need to flip the sign
18392 // bits of the inputs before performing those operations.
18394 MVT EltVT = VT.getVectorElementType();
18395 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
18397 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
18398 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
18401 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
18403 // If the logical-not of the result is required, perform that now.
18405 Result = DAG.getNOT(dl, Result, VT);
18410 // Try to select this as a KTEST+SETCC if possible.
18411 static SDValue EmitKTEST(SDValue Op0, SDValue Op1, ISD::CondCode CC,
18412 const SDLoc &dl, SelectionDAG &DAG,
18413 const X86Subtarget &Subtarget) {
18414 // Only support equality comparisons.
18415 if (CC != ISD::SETEQ && CC != ISD::SETNE)
18418 // Must be a bitcast from vXi1.
18419 if (Op0.getOpcode() != ISD::BITCAST)
18422 Op0 = Op0.getOperand(0);
18423 MVT VT = Op0.getSimpleValueType();
18424 if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) &&
18425 !(Subtarget.hasDQI() && VT == MVT::v8i1) &&
18426 !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)))
18429 X86::CondCode X86CC;
18430 if (isNullConstant(Op1)) {
18431 X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
18432 } else if (isAllOnesConstant(Op1)) {
18433 // The C flag is set when the mask is all ones.
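// KORTEST ORs the two mask operands and sets ZF when the result is zero and
// CF when it is all ones, so an equality test against an all-ones mask reads
// the carry flag via COND_B / COND_AE.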
18434 X86CC = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE;
18438 // If the input is an OR, we can combine its operands into the KORTEST.
18441 if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) {
18442 LHS = Op0.getOperand(0);
18443 RHS = Op0.getOperand(1);
18446 SDValue KORTEST = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
18447 return getSETCC(X86CC, KORTEST, dl, DAG);
18450 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
18452 MVT VT = Op.getSimpleValueType();
18454 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
18456 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
18457 SDValue Op0 = Op.getOperand(0);
18458 SDValue Op1 = Op.getOperand(1);
18460 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
18462 // Optimize to BT if possible.
18463 // Lower (X & (1 << N)) == 0 to BT(X, N).
18464 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
18465 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
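// For example, (X & 32) == 0 can be tested as BT X, 5 followed by a check
// that the carry flag is clear, since BT copies the tested bit into CF.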
18466 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
18467 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18468 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
18472 // Try to use PTEST for a tree of ORs equality-compared with 0.
18473 // TODO: We could handle an AND tree with all ones as well by using the C flag.
18474 if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
18475 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18476 if (SDValue NewSetCC = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG))
18480 // Try to lower using KTEST.
18481 if (SDValue NewSetCC = EmitKTEST(Op0, Op1, CC, dl, DAG, Subtarget))
18484 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
18486 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
18487 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
18489 // If the input is a setcc, then reuse the input setcc or use a new one with
18490 // the inverted condition.
18491 if (Op0.getOpcode() == X86ISD::SETCC) {
18492 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
18493 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
18497 CCode = X86::GetOppositeBranchCondition(CCode);
18498 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
18502 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
18503 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
18504 if (X86CC == X86::COND_INVALID)
18507 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
18508 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
18509 return getSETCC(X86CC, EFLAGS, dl, DAG);
18512 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
18513 SDValue LHS = Op.getOperand(0);
18514 SDValue RHS = Op.getOperand(1);
18515 SDValue Carry = Op.getOperand(2);
18516 SDValue Cond = Op.getOperand(3);
18519 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
18520 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
18522 // Recreate the carry if needed.
18523 EVT CarryVT = Carry.getValueType();
18524 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
18525 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
18526 Carry, DAG.getConstant(NegOne, DL, CarryVT));
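// Adding all-ones to the incoming carry value sets the hardware carry flag
// exactly when that value is non-zero, so the SBB below consumes the original
// carry again.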
18528 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18529 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
18530 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
18533 /// Return true if the opcode is an X86 logical comparison.
18534 static bool isX86LogicalCmp(SDValue Op) {
18535 unsigned Opc = Op.getOpcode();
18536 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
18537 Opc == X86ISD::SAHF)
18539 if (Op.getResNo() == 1 &&
18540 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
18541 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
18542 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
18543 Opc == X86ISD::XOR || Opc == X86ISD::AND))
18546 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
18552 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
18553 if (V.getOpcode() != ISD::TRUNCATE)
18556 SDValue VOp0 = V.getOperand(0);
18557 unsigned InBits = VOp0.getValueSizeInBits();
18558 unsigned Bits = V.getValueSizeInBits();
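// For example, a truncate from i32 to i8 can be looked through when the upper
// 24 bits of the source are known zero, since the truncation then discards
// nothing that matters for a compare against zero.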
18559 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
18562 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
18563 bool AddTest = true;
18564 SDValue Cond = Op.getOperand(0);
18565 SDValue Op1 = Op.getOperand(1);
18566 SDValue Op2 = Op.getOperand(2);
18568 MVT VT = Op1.getSimpleValueType();
18571 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
18572 // are available, or into VBLENDV if AVX is available.
18573 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
18574 if (Cond.getOpcode() == ISD::SETCC &&
18575 ((Subtarget.hasSSE2() && VT == MVT::f64) ||
18576 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
18577 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
18578 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
18579 unsigned SSECC = translateX86FSETCC(
18580 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
18582 if (Subtarget.hasAVX512()) {
18583 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
18584 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
18585 assert(!VT.isVector() && "Not a scalar type?");
18586 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18589 if (SSECC < 8 || Subtarget.hasAVX()) {
18590 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
18591 DAG.getConstant(SSECC, DL, MVT::i8));
18593 // If we have AVX, we can use a variable vector select (VBLENDV) instead
18594 // of 3 logic instructions for size savings and potentially speed.
18595 // Unfortunately, there is no scalar form of VBLENDV.
18597 // If either operand is a constant, don't try this. We can expect to
18598 // optimize away at least one of the logic instructions later in that
18599 // case, so that sequence would be faster than a variable blend.
18601 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
18602 // uses XMM0 as the selection register. That may need just as many
18603 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
18606 if (Subtarget.hasAVX() &&
18607 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
18609 // Convert to vectors, do a VSELECT, and convert back to scalar.
18610 // All of the conversions should be optimized away.
18612 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
18613 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
18614 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
18615 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
18617 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18618 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18620 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18622 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18623 VSel, DAG.getIntPtrConstant(0, DL));
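// Without the VBLENDV path this builds the classic bitwise blend:
//   result = (cmp & Op1) | (~cmp & Op2)
// where cmp is all-ones when the FSETCC predicate holds and all-zeros otherwise.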
18625 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18626 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18627 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18631 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18632 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18633 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18634 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18637 // For v64i1 without 64-bit support we need to split and rejoin.
18638 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
18639 assert(Subtarget.hasBWI() && "Expected BWI to be legal");
18640 SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
18641 SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
18642 SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
18643 SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
18644 SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
18645 SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
18646 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
18649 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18651 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18652 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18653 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18654 Op1Scalar = Op1.getOperand(0);
18656 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18657 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18658 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18659 Op2Scalar = Op2.getOperand(0);
18660 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18661 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18662 Op1Scalar, Op2Scalar);
18663 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18664 return DAG.getBitcast(VT, newSelect);
18665 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18666 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18667 DAG.getIntPtrConstant(0, DL));
18671 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18672 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18673 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18674 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18675 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18676 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18677 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18678 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18681 if (Cond.getOpcode() == ISD::SETCC) {
18682 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18684 // If the condition was updated, it's possible that the operands of the
18685 // select were also updated (for example, EmitTest has a RAUW). Refresh
18686 // the local references to the select operands in case they got stale.
18687 Op1 = Op.getOperand(1);
18688 Op2 = Op.getOperand(2);
18692 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18693 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18694 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18695 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18696 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18697 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
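// In the first four patterns the -1 value is materialized without a branch:
// CMP(x, 1) borrows exactly when x == 0 (x <u 1), and SETCC_CARRY(COND_B)
// broadcasts that carry into every bit before the optional NOT and OR below.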
18698 if (Cond.getOpcode() == X86ISD::SETCC &&
18699 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18700 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18701 SDValue Cmp = Cond.getOperand(1);
18702 unsigned CondCode =
18703 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18705 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18706 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18707 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18708 SDValue CmpOp0 = Cmp.getOperand(0);
18710 // Apply further optimizations for special cases
18711 // (select (x != 0), -1, 0) -> neg & sbb
18712 // (select (x == 0), 0, -1) -> neg & sbb
18713 if (isNullConstant(Y) &&
18714 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18715 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18716 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18717 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18718 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18719 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18720 SDValue(Neg.getNode(), 1));
18724 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18725 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18726 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18728 SDValue Res = // Res = 0 or -1.
18729 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18730 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18732 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18733 Res = DAG.getNOT(DL, Res, Res.getValueType());
18735 if (!isNullConstant(Op2))
18736 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18738 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18739 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18740 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18741 SDValue CmpOp0 = Cmp.getOperand(0);
18742 SDValue Src1, Src2;
18743 // True if Op2 is an XOR or OR operator and one of its operands matches Op1:
18745 // (a, a op b) || (b, a op b)
18746 auto isOrXorPattern = [&]() {
18747 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18748 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18750 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18757 if (isOrXorPattern()) {
18759 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18760 // We need a mask of all zeros or all ones with the same size as the other operands.
18762 if (CmpSz > VT.getSizeInBits())
18763 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18764 else if (CmpSz < VT.getSizeInBits())
18765 Neg = DAG.getNode(ISD::AND, DL, VT,
18766 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18767 DAG.getConstant(1, DL, VT));
18770 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18771 Neg); // -(and (x, 0x1))
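// Mask is all-zeros when the tested bit is clear and all-ones when it is set,
// so (Mask & z) op y evaluates to y or (z op y) respectively, matching the
// original select.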
18772 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18773 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18778 // Look past (and (setcc_carry (cmp ...)), 1).
18779 if (Cond.getOpcode() == ISD::AND &&
18780 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18781 isOneConstant(Cond.getOperand(1)))
18782 Cond = Cond.getOperand(0);
18784 // If the condition flag is set by an X86ISD::CMP, then use it as the
18785 // condition-setting operand in place of the X86ISD::SETCC.
18786 unsigned CondOpcode = Cond.getOpcode();
18787 if (CondOpcode == X86ISD::SETCC ||
18788 CondOpcode == X86ISD::SETCC_CARRY) {
18789 CC = Cond.getOperand(0);
18791 SDValue Cmp = Cond.getOperand(1);
18792 unsigned Opc = Cmp.getOpcode();
18793 MVT VT = Op.getSimpleValueType();
18795 bool IllegalFPCMov = false;
18796 if (VT.isFloatingPoint() && !VT.isVector() &&
18797 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18798 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18800 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18801 Opc == X86ISD::BT) { // FIXME
18805 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18806 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18807 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18808 Cond.getOperand(0).getValueType() != MVT::i8)) {
18809 SDValue LHS = Cond.getOperand(0);
18810 SDValue RHS = Cond.getOperand(1);
18811 unsigned X86Opcode;
18814 switch (CondOpcode) {
18815 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18816 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18817 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18818 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18819 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18820 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18821 default: llvm_unreachable("unexpected overflowing operator");
18823 if (CondOpcode == ISD::UMULO)
18824 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18827 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18829 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
18831 if (CondOpcode == ISD::UMULO)
18832 Cond = X86Op.getValue(2);
18834 Cond = X86Op.getValue(1);
18836 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
18841 // Look past the truncate if the high bits are known zero.
18842 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18843 Cond = Cond.getOperand(0);
18845 // We know the result of AND is compared against zero. Try to match it to BT.
18847 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18848 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
18849 CC = NewSetCC.getOperand(0);
18850 Cond = NewSetCC.getOperand(1);
18857 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
18858 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
18861 // a < b ? -1 : 0 -> RES = ~setcc_carry
18862 // a < b ? 0 : -1 -> RES = setcc_carry
18863 // a >= b ? -1 : 0 -> RES = setcc_carry
18864 // a >= b ? 0 : -1 -> RES = ~setcc_carry
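// SETCC_CARRY broadcasts the carry flag into every bit of the result, and an
// unsigned compare sets CF exactly when a <u b, so the 0/-1 value falls out
// without a branch; a NOT handles the inverted cases.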
18865 if (Cond.getOpcode() == X86ISD::SUB) {
18866 Cond = ConvertCmpIfNecessary(Cond, DAG);
18867 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
18869 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
18870 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18871 (isNullConstant(Op1) || isNullConstant(Op2))) {
18872 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18873 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18875 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
18876 return DAG.getNOT(DL, Res, Res.getValueType());
18881 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
18882 // widen the cmov and push the truncate through. This avoids introducing a new
18883 // branch during isel and doesn't add any extensions.
18884 if (Op.getValueType() == MVT::i8 &&
18885 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
18886 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
18887 if (T1.getValueType() == T2.getValueType() &&
18888 // Blacklist CopyFromReg to avoid partial register stalls.
18889 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
18890 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
18892 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18896 // Promote i16 cmovs if it won't prevent folding a load.
18897 if (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) && !MayFoldLoad(Op2)) {
18898 Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
18899 Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
18900 SDValue Ops[] = { Op2, Op1, CC, Cond };
18901 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops);
18902 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18905 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
18906 // the condition is true.
18907 SDValue Ops[] = { Op2, Op1, CC, Cond };
18908 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
18911 static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
18912 const X86Subtarget &Subtarget,
18913 SelectionDAG &DAG) {
18914 MVT VT = Op->getSimpleValueType(0);
18915 SDValue In = Op->getOperand(0);
18916 MVT InVT = In.getSimpleValueType();
18917 assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
18918 MVT VTElt = VT.getVectorElementType();
18921 unsigned NumElts = VT.getVectorNumElements();
18923 // Extend VT if the scalar type is i8/i16 and BWI is not supported.
18925 if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) {
18926 // If v16i32 is to be avoided, we'll need to split and concatenate.
18927 if (NumElts == 16 && !Subtarget.canExtendTo512DQ())
18928 return SplitAndExtendv16i1(ISD::SIGN_EXTEND, VT, In, dl, DAG);
18930 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18933 // Widen to 512-bits if VLX is not supported.
18934 MVT WideVT = ExtVT;
18935 if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
18936 NumElts *= 512 / ExtVT.getSizeInBits();
18937 InVT = MVT::getVectorVT(MVT::i1, NumElts);
18938 In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
18939 In, DAG.getIntPtrConstant(0, dl));
18940 WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
18944 MVT WideEltVT = WideVT.getVectorElementType();
18945 if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
18946 (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
18947 V = DAG.getNode(ISD::SIGN_EXTEND, dl, WideVT, In);
18949 SDValue NegOne = getOnesVector(WideVT, DAG, dl);
18950 SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
18951 V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
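// Selecting between an all-ones and an all-zeros vector under the i1 mask is
// equivalent to sign-extending each mask bit.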
18954 // Truncate if we had to extend i16/i8 above.
18956 WideVT = MVT::getVectorVT(VTElt, NumElts);
18957 V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V);
18960 // Extract back to 128/256-bit if we widened.
18962 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
18963 DAG.getIntPtrConstant(0, dl));
18968 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18969 SelectionDAG &DAG) {
18970 SDValue In = Op->getOperand(0);
18971 MVT InVT = In.getSimpleValueType();
18973 if (InVT.getVectorElementType() == MVT::i1)
18974 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
18976 assert(Subtarget.hasAVX() && "Expected AVX support");
18977 return LowerAVXExtend(Op, DAG, Subtarget);
18980 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18981 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18982 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18983 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18984 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18985 const X86Subtarget &Subtarget,
18986 SelectionDAG &DAG) {
18987 SDValue In = Op->getOperand(0);
18988 MVT VT = Op->getSimpleValueType(0);
18989 MVT InVT = In.getSimpleValueType();
18990 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18992 MVT SVT = VT.getVectorElementType();
18993 MVT InSVT = InVT.getVectorElementType();
18994 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18996 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18998 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
19000 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
19001 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
19002 !(VT.is512BitVector() && Subtarget.hasAVX512()))
19007 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
19008 // For 512-bit vectors, we need 128 bits or 256 bits.
19009 if (VT.getSizeInBits() > 128) {
19010 // The input needs to have at least the same number of elements as the output,
19011 // and be at least 128 bits.
19012 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
19013 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
19016 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
19017 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
19019 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
19020 // so they are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions
19021 // still need to be handled here for 256/512-bit results.
19022 if (Subtarget.hasInt256()) {
19023 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
19024 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
19025 X86ISD::VSEXT : X86ISD::VZEXT;
19026 return DAG.getNode(ExtOpc, dl, VT, In);
19029 // We should only get here for sign extend.
19030 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
19031 "Unexpected opcode!");
19033 // Pre-SSE41 targets unpack the lower lanes and then sign-extend using SRAI.
19037 // As SRAI is only available on i16/i32 types, we expand only up to i32
19038 // and handle i64 separately.
19039 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
19040 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
19041 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
19042 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
19043 Curr = DAG.getBitcast(CurrVT, Curr);
19046 SDValue SignExt = Curr;
19047 if (CurrVT != InVT) {
19048 unsigned SignExtShift =
19049 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
19050 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19051 DAG.getConstant(SignExtShift, dl, MVT::i8));
19057 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
19058 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
19059 DAG.getConstant(31, dl, MVT::i8));
19060 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
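// The {0, 4, 1, 5} shuffle interleaves each sign-extended low dword with a
// dword of copied sign bits, forming the low and high halves of each i64
// result in little-endian element order.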
19061 return DAG.getBitcast(VT, Ext);
19067 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
19068 SelectionDAG &DAG) {
19069 MVT VT = Op->getSimpleValueType(0);
19070 SDValue In = Op->getOperand(0);
19071 MVT InVT = In.getSimpleValueType();
19074 if (InVT.getVectorElementType() == MVT::i1)
19075 return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
19077 assert(VT.isVector() && InVT.isVector() && "Expected vector type");
19078 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
19079 "Expected same number of elements");
19080 assert((VT.getVectorElementType() == MVT::i16 ||
19081 VT.getVectorElementType() == MVT::i32 ||
19082 VT.getVectorElementType() == MVT::i64) &&
19083 "Unexpected element type");
19084 assert((InVT.getVectorElementType() == MVT::i8 ||
19085 InVT.getVectorElementType() == MVT::i16 ||
19086 InVT.getVectorElementType() == MVT::i32) &&
19087 "Unexpected element type");
19089 if (Subtarget.hasInt256())
19090 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
19092 // Optimize vectors in AVX mode
19093 // Sign extend v8i16 to v8i32 and v4i32 to v4i64.
19096 // Divide input vector into two parts
19097 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
19098 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
19099 // concat the vectors to original VT
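// For example, v8i16 -> v8i32 on AVX1: the first shuffle keeps elements 0-3,
// the second moves elements 4-7 into the low half, each half is sign-extended
// in-reg to v4i32, and the two halves are concatenated into the 256-bit result.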
19101 unsigned NumElems = InVT.getVectorNumElements();
19102 SDValue Undef = DAG.getUNDEF(InVT);
19104 SmallVector<int,8> ShufMask1(NumElems, -1);
19105 for (unsigned i = 0; i != NumElems/2; ++i) ShufMask1[i] = i;
19108 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
19110 SmallVector<int,8> ShufMask2(NumElems, -1);
19111 for (unsigned i = 0; i != NumElems/2; ++i)
19112 ShufMask2[i] = i + NumElems/2;
19114 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
19116 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
19117 VT.getVectorNumElements() / 2);
19119 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
19120 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
19122 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
19125 static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
19126 SelectionDAG &DAG) {
19127 StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
19129 SDValue StoredVal = St->getValue();
19131 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
19132 assert(StoredVal.getValueType().isVector() &&
19133 StoredVal.getValueType().getVectorElementType() == MVT::i1 &&
19134 StoredVal.getValueType().getVectorNumElements() <= 8 && "Unexpected VT");
19136 assert(!St->isTruncatingStore() && "Expected non-truncating store");
19137 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19138 "Expected AVX512F without AVX512DQI");
19140 StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
19141 DAG.getUNDEF(MVT::v8i1), StoredVal,
19142 DAG.getIntPtrConstant(0, dl));
19143 StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
19145 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
19146 St->getPointerInfo(), St->getAlignment(),
19147 St->getMemOperand()->getFlags());
19150 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
19151 // may emit an illegal shuffle but the expansion is still better than scalar
19152 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
19153 // we'll emit a shuffle and an arithmetic shift.
19154 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
19155 // TODO: It is possible to support ZExt by zeroing the undef values during
19156 // the shuffle phase or after the shuffle.
19157 static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
19158 SelectionDAG &DAG) {
19159 MVT RegVT = Op.getSimpleValueType();
19160 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
19161 assert(RegVT.isInteger() &&
19162 "We only custom lower integer vector sext loads.");
19164 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
19166 EVT MemVT = Ld->getMemoryVT();
19168 // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
19169 if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
19170 assert(EVT(RegVT) == MemVT && "Expected non-extending load");
19171 assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
19172 assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
19173 "Expected AVX512F without AVX512DQI");
19175 SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
19176 Ld->getPointerInfo(), Ld->getAlignment(),
19177 Ld->getMemOperand()->getFlags());
19179 // Replace chain users with the new chain.
19180 assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
19181 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
19183 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
19184 DAG.getBitcast(MVT::v8i1, NewLd),
19185 DAG.getIntPtrConstant(0, dl));
19186 return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
19189 // Nothing useful we can do without SSE2 shuffles.
19190 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
19192 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19193 unsigned RegSz = RegVT.getSizeInBits();
19195 ISD::LoadExtType Ext = Ld->getExtensionType();
19197 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
19198 && "Only anyext and sext are currently implemented.");
19199 assert(MemVT != RegVT && "Cannot extend to the same type");
19200 assert(MemVT.isVector() && "Must load a vector from memory");
19202 unsigned NumElems = RegVT.getVectorNumElements();
19203 unsigned MemSz = MemVT.getSizeInBits();
19204 assert(RegSz > MemSz && "Register size must be greater than the mem size");
19206 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
19207 // The only way in which we have a legal 256-bit vector result but not the
19208 // integer 256-bit operations needed to directly lower a sextload is if we
19209 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
19210 // a 128-bit vector and a normal sign_extend to 256-bits that should get
19211 // correctly legalized. We do this late to allow the canonical form of
19212 // sextload to persist throughout the rest of the DAG combiner -- it wants
19213 // to fold together any extensions it can, and so will fuse a sign_extend
19214 // of an sextload into a sextload targeting a wider value.
19216 if (MemSz == 128) {
19217 // Just switch this to a normal load.
19218 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
19219 "it must be a legal 128-bit vector "
19221 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
19222 Ld->getPointerInfo(), Ld->getAlignment(),
19223 Ld->getMemOperand()->getFlags());
19225 assert(MemSz < 128 &&
19226 "Can't extend a type wider than 128 bits to a 256 bit vector!");
19227 // Do an sext load to a 128-bit vector type. We want to use the same
19228 // number of elements, but elements half as wide. This will end up being
19229 // recursively lowered by this routine, but will succeed as we definitely
19230 // have all the necessary features if we're using AVX1.
19232 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
19233 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
19235 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
19236 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
19237 Ld->getMemOperand()->getFlags());
19240 // Replace chain users with the new chain.
19241 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
19242 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
19244 // Finally, do a normal sign-extend to the desired register.
19245 return DAG.getSExtOrTrunc(Load, dl, RegVT);
19248 // All sizes must be a power of two.
19249 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
19250 "Non-power-of-two elements are not custom lowered!");
19252 // Attempt to load the original value using scalar loads.
19253 // Find the largest scalar type that divides the total loaded size.
19254 MVT SclrLoadTy = MVT::i8;
19255 for (MVT Tp : MVT::integer_valuetypes()) {
19256 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
19261 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
19262 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
19264 SclrLoadTy = MVT::f64;
19266 // Calculate the number of scalar loads that we need to perform
19267 // in order to load our vector from memory.
19268 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
19270 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
19271 "Can only lower sext loads with a single scalar load!");
19273 unsigned loadRegZize = RegSz;
19274 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
19277 // If we don't have BWI we won't be able to create the shuffle needed for a v8i8->v8i64 extload.
19279 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19280 MemVT == MVT::v8i8)
19283 // Represent our vector as a sequence of elements which are the
19284 // largest scalar that we can load.
19285 EVT LoadUnitVecVT = EVT::getVectorVT(
19286 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
19288 // Represent the data using the same element type that is stored in
19289 // memory. In practice, we 'widen' MemVT.
19291 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
19292 loadRegZize / MemVT.getScalarSizeInBits());
19294 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
19295 "Invalid vector type");
19297 // We can't shuffle using an illegal type.
19298 assert(TLI.isTypeLegal(WideVecVT) &&
19299 "We only lower types that form legal widened vector types");
19301 SmallVector<SDValue, 8> Chains;
19302 SDValue Ptr = Ld->getBasePtr();
19303 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
19304 TLI.getPointerTy(DAG.getDataLayout()));
19305 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
19307 for (unsigned i = 0; i < NumLoads; ++i) {
19308 // Perform a single load.
19309 SDValue ScalarLoad =
19310 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
19311 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
19312 Chains.push_back(ScalarLoad.getValue(1));
19313 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
19314 // another round of DAGCombining.
19316 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
19318 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
19319 ScalarLoad, DAG.getIntPtrConstant(i, dl));
19321 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
19324 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
19326 // Bitcast the loaded value to a vector of the original element type, in
19327 // the size of the target vector type.
19328 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
19329 unsigned SizeRatio = RegSz / MemSz;
19331 if (Ext == ISD::SEXTLOAD) {
19332 // If we have SSE4.1, we can directly emit a VSEXT node.
19333 if (Subtarget.hasSSE41()) {
19334 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
19335 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19339 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
19341 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
19342 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
19344 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
19345 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19349 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
19350 MemVT == MVT::v8i8) {
19351 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
19352 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19356 // Redistribute the loaded elements into the different locations.
19357 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
19358 for (unsigned i = 0; i != NumElems; ++i)
19359 ShuffleVec[i * SizeRatio] = i;
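// For example, any-extending a v4i8 load to v4i32: SizeRatio is 4, WideVecVT
// is v16i8, and loaded byte i lands in byte 4*i of the result; the remaining
// bytes stay undef, which is fine for an any-extending load.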
19361 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
19362 DAG.getUNDEF(WideVecVT), ShuffleVec);
19364 // Bitcast to the requested type.
19365 Shuff = DAG.getBitcast(RegVT, Shuff);
19366 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
19370 /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
19371 /// each of which has no other use apart from the AND / OR.
19372 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
19373 Opc = Op.getOpcode();
19374 if (Opc != ISD::OR && Opc != ISD::AND)
19376 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19377 Op.getOperand(0).hasOneUse() &&
19378 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
19379 Op.getOperand(1).hasOneUse());
19382 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and that
19383 /// the SETCC node has a single use.
19384 static bool isXor1OfSetCC(SDValue Op) {
19385 if (Op.getOpcode() != ISD::XOR)
19387 if (isOneConstant(Op.getOperand(1)))
19388 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
19389 Op.getOperand(0).hasOneUse();
19393 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
19394 bool addTest = true;
19395 SDValue Chain = Op.getOperand(0);
19396 SDValue Cond = Op.getOperand(1);
19397 SDValue Dest = Op.getOperand(2);
19400 bool Inverted = false;
19402 if (Cond.getOpcode() == ISD::SETCC) {
19403 // Check for setcc([su]{add,sub,mul}o == 0).
19404 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
19405 isNullConstant(Cond.getOperand(1)) &&
19406 Cond.getOperand(0).getResNo() == 1 &&
19407 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
19408 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
19409 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
19410 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
19411 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
19412 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
19414 Cond = Cond.getOperand(0);
19416 if (SDValue NewCond = LowerSETCC(Cond, DAG))
19421 // FIXME: LowerXALUO doesn't handle these!!
19422 else if (Cond.getOpcode() == X86ISD::ADD ||
19423 Cond.getOpcode() == X86ISD::SUB ||
19424 Cond.getOpcode() == X86ISD::SMUL ||
19425 Cond.getOpcode() == X86ISD::UMUL)
19426 Cond = LowerXALUO(Cond, DAG);
19429 // Look past (and (setcc_carry (cmp ...)), 1).
19430 if (Cond.getOpcode() == ISD::AND &&
19431 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
19432 isOneConstant(Cond.getOperand(1)))
19433 Cond = Cond.getOperand(0);
19435 // If the condition flag is set by an X86ISD::CMP, then use it as the
19436 // condition-setting operand in place of the X86ISD::SETCC.
19437 unsigned CondOpcode = Cond.getOpcode();
19438 if (CondOpcode == X86ISD::SETCC ||
19439 CondOpcode == X86ISD::SETCC_CARRY) {
19440 CC = Cond.getOperand(0);
19442 SDValue Cmp = Cond.getOperand(1);
19443 unsigned Opc = Cmp.getOpcode();
19444 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
19445 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
19449 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
19453 // These can only come from an arithmetic instruction with overflow,
19454 // e.g. SADDO, UADDO.
19455 Cond = Cond.getOperand(1);
19461 CondOpcode = Cond.getOpcode();
19462 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
19463 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
19464 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
19465 Cond.getOperand(0).getValueType() != MVT::i8)) {
19466 SDValue LHS = Cond.getOperand(0);
19467 SDValue RHS = Cond.getOperand(1);
19468 unsigned X86Opcode;
19471 // Keep this in sync with LowerXALUO, otherwise we might create redundant
19472 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
19474 switch (CondOpcode) {
19475 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
19477 if (isOneConstant(RHS)) {
19478 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
19481 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
19482 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
19484 if (isOneConstant(RHS)) {
19485 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
19488 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
19489 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
19490 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
19491 default: llvm_unreachable("unexpected overflowing operator");
19494 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
19495 if (CondOpcode == ISD::UMULO)
19496 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
19499 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
19501 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
19503 if (CondOpcode == ISD::UMULO)
19504 Cond = X86Op.getValue(2);
19506 Cond = X86Op.getValue(1);
19508 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19512 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
19513 SDValue Cmp = Cond.getOperand(0).getOperand(1);
19514 if (CondOpc == ISD::OR) {
19515 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
19516 // two branches instead of an explicit OR instruction with a separate test.
19518 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19519 isX86LogicalCmp(Cmp)) {
19520 CC = Cond.getOperand(0).getOperand(0);
19521 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19522 Chain, Dest, CC, Cmp);
19523 CC = Cond.getOperand(1).getOperand(0);
19527 } else { // ISD::AND
19528 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
19529 // two branches instead of an explicit AND instruction with a
19530 // separate test. However, we only do this if this block doesn't
19531 // have a fall-through edge, because this requires an explicit
19532 // jmp when the condition is false.
19533 if (Cmp == Cond.getOperand(1).getOperand(1) &&
19534 isX86LogicalCmp(Cmp) &&
19535 Op.getNode()->hasOneUse()) {
19536 X86::CondCode CCode =
19537 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19538 CCode = X86::GetOppositeBranchCondition(CCode);
19539 CC = DAG.getConstant(CCode, dl, MVT::i8);
19540 SDNode *User = *Op.getNode()->use_begin();
19541 // Look for an unconditional branch following this conditional branch.
19542 // We need this because we need to reverse the successors in order
19543 // to implement FCMP_OEQ.
19544 if (User->getOpcode() == ISD::BR) {
19545 SDValue FalseBB = User->getOperand(1);
19547 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19548 assert(NewBR == User);
19552 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19553 Chain, Dest, CC, Cmp);
19554 X86::CondCode CCode =
19555 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19556 CCode = X86::GetOppositeBranchCondition(CCode);
19557 CC = DAG.getConstant(CCode, dl, MVT::i8);
19563 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19564 // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
19565 // It should be transformed during DAG combining except when the condition
19566 // is set by an arithmetic-with-overflow node.
19567 X86::CondCode CCode =
19568 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19569 CCode = X86::GetOppositeBranchCondition(CCode);
19570 CC = DAG.getConstant(CCode, dl, MVT::i8);
19571 Cond = Cond.getOperand(0).getOperand(1);
19573 } else if (Cond.getOpcode() == ISD::SETCC &&
19574 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19575 // For FCMP_OEQ, we can emit
19576 // two branches instead of an explicit AND instruction with a
19577 // separate test. However, we only do this if this block doesn't
19578 // have a fall-through edge, because this requires an explicit
19579 // jmp when the condition is false.
19580 if (Op.getNode()->hasOneUse()) {
19581 SDNode *User = *Op.getNode()->use_begin();
19582 // Look for an unconditional branch following this conditional branch.
19583 // We need this because we need to reverse the successors in order
19584 // to implement FCMP_OEQ.
19585 if (User->getOpcode() == ISD::BR) {
19586 SDValue FalseBB = User->getOperand(1);
19588 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19589 assert(NewBR == User);
19593 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19594 Cond.getOperand(0), Cond.getOperand(1));
19595 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19596 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19597 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19598 Chain, Dest, CC, Cmp);
19599 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19604 } else if (Cond.getOpcode() == ISD::SETCC &&
19605 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19606 // For FCMP_UNE, we can emit
19607 // two branches instead of an explicit AND instruction with a
19608 // separate test. However, we only do this if this block doesn't
19609 // have a fall-through edge, because this requires an explicit
19610 // jmp when the condition is false.
19611 if (Op.getNode()->hasOneUse()) {
19612 SDNode *User = *Op.getNode()->use_begin();
19613 // Look for an unconditional branch following this conditional branch.
19614 // We need this because we need to reverse the successors in order
19615 // to implement FCMP_UNE.
19616 if (User->getOpcode() == ISD::BR) {
19617 SDValue FalseBB = User->getOperand(1);
19619 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19620 assert(NewBR == User);
19623 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19624 Cond.getOperand(0), Cond.getOperand(1));
19625 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19626 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19627 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19628 Chain, Dest, CC, Cmp);
19629 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19639 // Look past the truncate if the high bits are known zero.
19640 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19641 Cond = Cond.getOperand(0);
19643 // We know the result of AND is compared against zero. Try to match it to BT.
19645 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19646 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19647 CC = NewSetCC.getOperand(0);
19648 Cond = NewSetCC.getOperand(1);
19655 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19656 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19657 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19659 Cond = ConvertCmpIfNecessary(Cond, DAG);
19660 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19661 Chain, Dest, CC, Cond);
19664 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19665 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19666 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19667 // that the guard pages used by the OS virtual memory manager are allocated in
19668 // correct sequence.
19670 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19671 SelectionDAG &DAG) const {
19672 MachineFunction &MF = DAG.getMachineFunction();
19673 bool SplitStack = MF.shouldSplitStack();
19674 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19675 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19676 SplitStack || EmitStackProbe;
19680 SDNode *Node = Op.getNode();
19681 SDValue Chain = Op.getOperand(0);
19682 SDValue Size = Op.getOperand(1);
19683 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19684 EVT VT = Node->getValueType(0);
19686 // Chain the dynamic stack allocation so that it doesn't modify the stack
19687 // pointer when other instructions are using the stack.
19688 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19690 bool Is64Bit = Subtarget.is64Bit();
19691 MVT SPTy = getPointerTy(DAG.getDataLayout());
19695 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19696 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19697 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19698 " not tell us which reg is the stack pointer!");
19700 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19701 Chain = SP.getValue(1);
19702 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19703 unsigned StackAlign = TFI.getStackAlignment();
19704 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19705 if (Align > StackAlign)
19706 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19707 DAG.getConstant(-(uint64_t)Align, dl, VT));
19708 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19709 } else if (SplitStack) {
19710 MachineRegisterInfo &MRI = MF.getRegInfo();
19713 // The 64-bit implementation of segmented stacks needs to clobber both r10
19714 // and r11. This makes it impossible to use it along with nested parameters.
19715 const Function &F = MF.getFunction();
19716 for (const auto &A : F.args()) {
19717 if (A.hasNestAttr())
19718 report_fatal_error("Cannot use segmented stacks with functions that "
19719 "have nested arguments.");
19723 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19724 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19725 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19726 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19727 DAG.getRegister(Vreg, SPTy));
19729 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19730 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19731 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19733 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19734 unsigned SPReg = RegInfo->getStackRegister();
19735 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19736 Chain = SP.getValue(1);
19739 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19740 DAG.getConstant(-(uint64_t)Align, dl, VT));
19741 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19747 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19748 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19750 SDValue Ops[2] = {Result, Chain};
19751 return DAG.getMergeValues(Ops, dl);
19754 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19755 MachineFunction &MF = DAG.getMachineFunction();
19756 auto PtrVT = getPointerTy(MF.getDataLayout());
19757 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19759 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19762 if (!Subtarget.is64Bit() ||
19763 Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
19764 // vastart just stores the address of the VarArgsFrameIndex slot into the
19765 // memory location argument.
19766 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19767 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19768 MachinePointerInfo(SV));
19772 // gp_offset (0 - 6 * 8)
19773 // fp_offset (48 - 48 + 8 * 16)
19774 // overflow_arg_area (point to parameters coming in memory).
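// In C terms this is the System V AMD64 __va_list_tag layout:
//   struct __va_list_tag {
//     unsigned int gp_offset;
//     unsigned int fp_offset;
//     void *overflow_arg_area;
//     void *reg_save_area;
//   };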
19776 SmallVector<SDValue, 8> MemOps;
19777 SDValue FIN = Op.getOperand(1);
19779 SDValue Store = DAG.getStore(
19780 Op.getOperand(0), DL,
19781 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19782 MachinePointerInfo(SV));
19783 MemOps.push_back(Store);
19786 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19787 Store = DAG.getStore(
19788 Op.getOperand(0), DL,
19789 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19790 MachinePointerInfo(SV, 4));
19791 MemOps.push_back(Store);
19793 // Store ptr to overflow_arg_area
19794 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19795 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19797 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19798 MemOps.push_back(Store);
19800 // Store ptr to reg_save_area.
19801 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19802 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19803 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19804 Store = DAG.getStore(
19805 Op.getOperand(0), DL, RSFIN, FIN,
19806 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19807 MemOps.push_back(Store);
19808 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19811 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19812 assert(Subtarget.is64Bit() &&
19813 "LowerVAARG only handles 64-bit va_arg!");
19814 assert(Op.getNumOperands() == 4);
19816 MachineFunction &MF = DAG.getMachineFunction();
19817 if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
19818 // The Win64 ABI uses char* instead of a structure.
19819 return DAG.expandVAArg(Op.getNode());
19821 SDValue Chain = Op.getOperand(0);
19822 SDValue SrcPtr = Op.getOperand(1);
19823 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19824 unsigned Align = Op.getConstantOperandVal(3);
19827 EVT ArgVT = Op.getNode()->getValueType(0);
19828 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19829 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19832 // Decide which area this value should be read from.
19833 // TODO: Implement the AMD64 ABI in its entirety. This simple
19834 // selection mechanism works only for the basic types.
19835 if (ArgVT == MVT::f80) {
19836 llvm_unreachable("va_arg for f80 not yet implemented");
19837 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19838 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19839 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19840 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19842 llvm_unreachable("Unhandled argument type in LowerVAARG");
19845 if (ArgMode == 2) {
19846 // Sanity Check: Make sure using fp_offset makes sense.
19847 assert(!Subtarget.useSoftFloat() &&
19848 !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
19849 Subtarget.hasSSE1());
19852 // Insert VAARG_64 node into the DAG
19853 // VAARG_64 returns two values: Variable Argument Address, Chain
19854 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19855 DAG.getConstant(ArgMode, dl, MVT::i8),
19856 DAG.getConstant(Align, dl, MVT::i32)};
19857 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19858 SDValue VAARG = DAG.getMemIntrinsicNode(
19859 X86ISD::VAARG_64, dl,
19860 VTs, InstOps, MVT::i64,
19861 MachinePointerInfo(SV),
19863 MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
19864 Chain = VAARG.getValue(1);
19866 // Load the next argument and return it
19867 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19870 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19871 SelectionDAG &DAG) {
19872 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19873 // where a va_list is still an i8*.
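// That struct is 24 bytes on LP64 (two i32 offsets plus two 8-byte pointers),
// which is exactly the amount copied below.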
19874 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19875 if (Subtarget.isCallingConvWin64(
19876 DAG.getMachineFunction().getFunction().getCallingConv()))
19877 // Probably a Win64 va_copy.
19878 return DAG.expandVACopy(Op.getNode());
19880 SDValue Chain = Op.getOperand(0);
19881 SDValue DstPtr = Op.getOperand(1);
19882 SDValue SrcPtr = Op.getOperand(2);
19883 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19884 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19887 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19888 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19890 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19893 /// Handle vector element shifts where the shift amount is a constant.
19894 /// Takes immediate version of shift as input.
19895 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19896 SDValue SrcOp, uint64_t ShiftAmt,
19897 SelectionDAG &DAG) {
19898 MVT ElementType = VT.getVectorElementType();
19900 // Bitcast the source vector to the output type; this is mainly necessary for
19901 // vXi8/vXi64 shifts.
19902 if (VT != SrcOp.getSimpleValueType())
19903 SrcOp = DAG.getBitcast(VT, SrcOp);
  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }
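  // (The clamp above is correct because an arithmetic shift by the element
  // width or more just replicates the sign bit, whereas logical shifts by
  // that amount produce zero.)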
19917 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19918 && "Unknown target vector shift-by-constant node");
19920 // Fold this packed vector shift into a build vector if SrcOp is a
19921 // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    SmallVector<SDValue, 8> Elts;
    unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch (Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i = 0; i != NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }
19970 return DAG.getNode(Opc, dl, VT, SrcOp,
19971 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19974 /// Handle vector element shifts where the shift amount may or may not be a
19975 /// constant. Takes immediate version of shift as input.
19976 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19977 SDValue SrcOp, SDValue ShAmt,
19978 const X86Subtarget &Subtarget,
19979 SelectionDAG &DAG) {
19980 MVT SVT = ShAmt.getSimpleValueType();
19981 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19983 // Catch shift-by-constant.
19984 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19985 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19986 CShAmt->getZExtValue(), DAG);
19988 // Change opcode to non-immediate version
19990 default: llvm_unreachable("Unknown target vector shift node");
19991 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19992 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19993 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19996 // Need to build a vector containing shift amount.
19997 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19998 // +=================+============+=======================================+
19999 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
20000 // +=================+============+=======================================+
20001 // | i64 | Yes, No | Use ShAmt as lowest elt |
20002 // | i32 | Yes | zero-extend in-reg |
20003 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
20004 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
20005 // +=================+============+=======================================+
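  // For example, without SSE4.1 an i32 shift amount is materialized as
  // (v4i32 build_vector(ShAmt, 0, undef, undef)) and then bitcast to the
  // 128-bit shift-count operand at the end of this function.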
20007 if (SVT == MVT::i64)
20008 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
20009 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
20010 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
20011 ShAmt = ShAmt.getOperand(0);
20012 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
20013 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
20014 } else if (Subtarget.hasSSE41() &&
20015 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
20016 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
20017 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
                        DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }
20024 // The return type has to be a 128-bit type with the same element
20025 // type as the input type.
20026 MVT EltVT = VT.getVectorElementType();
20027 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
20029 ShAmt = DAG.getBitcast(ShVT, ShAmt);
20030 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
20033 /// \brief Return Mask with the necessary casting or extending
20034 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
20035 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {
20039 if (isAllOnesConstant(Mask))
20040 return DAG.getConstant(1, dl, MaskVT);
20041 if (X86::isZeroNode(Mask))
20042 return DAG.getConstant(0, dl, MaskVT);
20044 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
20045 // Mask should be extended
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }
20050 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
20051 assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!");
20052 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
    // In 32-bit mode a bitcast of i64 is illegal; split the mask into two
    // i32 halves instead.
    SDValue Lo, Hi;
    Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                     DAG.getConstant(0, dl, MVT::i32));
    Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                     DAG.getConstant(1, dl, MVT::i32));

    Lo = DAG.getBitcast(MVT::v32i1, Lo);
    Hi = DAG.getBitcast(MVT::v32i1, Hi);

    return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
  }
20065 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20066 Mask.getSimpleValueType().getSizeInBits());
  // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
  // extracted by the EXTRACT_SUBVECTOR below.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
20075 /// \brief Return (and \p Op, \p Mask) for compare instructions or
20076 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
20077 /// necessary casting or extending for \p Mask when lowering masking intrinsics
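/// For example, a masked add intrinsic is lowered roughly as
/// (vselect (v8i1 Mask), (add Src1, Src2), PreservedSrc), assuming an
/// 8-element vector type.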
20078 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
20079 SDValue PreservedSrc,
20080 const X86Subtarget &Subtarget,
20081 SelectionDAG &DAG) {
20082 MVT VT = Op.getSimpleValueType();
20083 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
20095 case X86ISD::CMPM_RND:
20096 case X86ISD::CMPMU:
20097 case X86ISD::VPSHUFBITQMB:
20098 case X86ISD::VFPCLASS:
20099 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
20100 case ISD::TRUNCATE:
20101 case X86ISD::VTRUNC:
20102 case X86ISD::VTRUNCS:
20103 case X86ISD::VTRUNCUS:
20104 case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example, vpmovqb requires only AVX512,
    // while a vselect on byte elements requires AVX512BW.
    OpcodeSelect = X86ISD::SELECT;
20108 OpcodeSelect = X86ISD::SELECT;
20111 if (PreservedSrc.isUndef())
20112 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20113 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
20116 /// \brief Creates an SDNode for a predicated scalar operation.
20117 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask comes in as an MVT::i8 value and is transformed to MVT::v1i1
/// while lowering masking intrinsics.
20120 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
20121 /// "X86select" instead of "vselect". We just can't create the "vselect" node
20122 /// for a scalar instruction.
20123 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
20124 SDValue PreservedSrc,
20125 const X86Subtarget &Subtarget,
20126 SelectionDAG &DAG) {
20128 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Mask.getValueType() == MVT::i8 && "Unexpected type");
20136 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
20137 if (Op.getOpcode() == X86ISD::FSETCCM ||
20138 Op.getOpcode() == X86ISD::FSETCCM_RND ||
20139 Op.getOpcode() == X86ISD::VFPCLASSS)
20140 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
20142 if (PreservedSrc.isUndef())
20143 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
20144 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
20147 static int getSEHRegistrationNodeSize(const Function *Fn) {
20148 if (!Fn->hasPersonalityFn())
20149 report_fatal_error(
20150 "querying registration node size for function without personality");
20151 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
20152 // WinEHStatePass for the full struct definition.
20153 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
20154 case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
20162 /// When the MSVC runtime transfers control to us, either to an outlined
20163 /// function or when returning to a parent frame after catching an exception, we
20164 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
20165 /// Here's the math:
20166 /// RegNodeBase = EntryEBP - RegNodeSize
20167 /// ParentFP = RegNodeBase - ParentFrameOffset
20168 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
20169 /// subtracting the offset (negative on x86) takes us back to the parent FP.
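/// For illustration: with the MSVC C++ personality (RegNodeSize = 16) and a
/// ParentFrameOffset of -64, ParentFP = (EntryEBP - 16) - (-64) = EntryEBP + 48.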
20170 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
20171 SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20176 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20178 // It's possible that the parent function no longer has a personality function
20179 // if the exceptional code was optimized away, in which case we just return
20180 // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;
20184 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
20185 // registration, or the .set_setframe offset.
20186 MCSymbol *OffsetSym =
20187 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
20188 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20189 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
20190 SDValue ParentFrameOffset =
20191 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
20193 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
20194 // prologue to RBP in the parent function.
20195 const X86Subtarget &Subtarget =
20196 static_cast<const X86Subtarget &>(DAG.getSubtarget());
20197 if (Subtarget.is64Bit())
20198 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
20200 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
20201 // RegNodeBase = EntryEBP - RegNodeSize
20202 // ParentFP = RegNodeBase - ParentFrameOffset
20203 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
20204 DAG.getConstant(RegNodeSize, dl, PtrVT));
20205 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
20208 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
20209 SelectionDAG &DAG) const {
20210 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
20211 auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
20220 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20221 MVT VT = Op.getSimpleValueType();
20222 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
20224 switch(IntrData->Type) {
20225 case INTR_TYPE_1OP:
20226 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
20227 case INTR_TYPE_2OP:
20228 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20230 case INTR_TYPE_3OP:
20231 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20232 Op.getOperand(2), Op.getOperand(3));
20233 case INTR_TYPE_4OP:
20234 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
20235 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
20236 case INTR_TYPE_1OP_MASK_RM: {
20237 SDValue Src = Op.getOperand(1);
20238 SDValue PassThru = Op.getOperand(2);
20239 SDValue Mask = Op.getOperand(3);
20240 SDValue RoundingMode;
20241 // We always add rounding mode to the Node.
20242 // If the rounding mode is not specified, we add the
20243 // "current direction" mode.
20244 if (Op.getNumOperands() == 4)
      RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
    else
      RoundingMode = Op.getOperand(4);
20249 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
20250 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20252 Mask, PassThru, Subtarget, DAG);
20254 case INTR_TYPE_1OP_MASK: {
20255 SDValue Src = Op.getOperand(1);
20256 SDValue PassThru = Op.getOperand(2);
20257 SDValue Mask = Op.getOperand(3);
20258 // We add rounding mode to the Node when
20259 // - RM Opcode is specified and
20260 // - RM is not "current direction".
20261 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20262 if (IntrWithRoundingModeOpcode != 0) {
20263 SDValue Rnd = Op.getOperand(4);
20264 if (!isRoundModeCurDirection(Rnd)) {
20265 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20266 dl, Op.getValueType(),
20268 Mask, PassThru, Subtarget, DAG);
20271 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
20272 Mask, PassThru, Subtarget, DAG);
20274 case INTR_TYPE_SCALAR_MASK: {
20275 SDValue Src1 = Op.getOperand(1);
20276 SDValue Src2 = Op.getOperand(2);
20277 SDValue passThru = Op.getOperand(3);
20278 SDValue Mask = Op.getOperand(4);
20279 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20280 // There are 2 kinds of intrinsics in this group:
20281 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20282 // (2) With rounding mode and sae - 7 operands.
20283 bool HasRounding = IntrWithRoundingModeOpcode != 0;
20284 if (Op.getNumOperands() == (5U + HasRounding)) {
20286 SDValue Rnd = Op.getOperand(5);
20287 if (!isRoundModeCurDirection(Rnd))
20288 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20289 dl, VT, Src1, Src2, Rnd),
20290 Mask, passThru, Subtarget, DAG);
20292 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20294 Mask, passThru, Subtarget, DAG);
20297 assert(Op.getNumOperands() == (6U + HasRounding) &&
20298 "Unexpected intrinsic form");
20299 SDValue RoundingMode = Op.getOperand(5);
20301 SDValue Sae = Op.getOperand(6);
20302 if (!isRoundModeCurDirection(Sae))
20303 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20304 dl, VT, Src1, Src2,
20305 RoundingMode, Sae),
20306 Mask, passThru, Subtarget, DAG);
20308 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20309 Src2, RoundingMode),
20310 Mask, passThru, Subtarget, DAG);
20312 case INTR_TYPE_SCALAR_MASK_RM: {
20313 SDValue Src1 = Op.getOperand(1);
20314 SDValue Src2 = Op.getOperand(2);
20315 SDValue Src0 = Op.getOperand(3);
20316 SDValue Mask = Op.getOperand(4);
20317 // There are 2 kinds of intrinsics in this group:
20318 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
20319 // (2) With rounding mode and sae - 7 operands.
20320 if (Op.getNumOperands() == 6) {
20321 SDValue Sae = Op.getOperand(5);
20322 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20324 Mask, Src0, Subtarget, DAG);
20326 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
20327 SDValue RoundingMode = Op.getOperand(5);
20328 SDValue Sae = Op.getOperand(6);
20329 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
20330 RoundingMode, Sae),
20331 Mask, Src0, Subtarget, DAG);
20333 case INTR_TYPE_2OP_MASK:
20334 case INTR_TYPE_2OP_IMM8_MASK: {
20335 SDValue Src1 = Op.getOperand(1);
20336 SDValue Src2 = Op.getOperand(2);
20337 SDValue PassThru = Op.getOperand(3);
20338 SDValue Mask = Op.getOperand(4);
20340 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
20341 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
20343 // We specify 2 possible opcodes for intrinsics with rounding modes.
20344 // First, we check if the intrinsic may have non-default rounding mode,
20345 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20346 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20347 if (IntrWithRoundingModeOpcode != 0) {
20348 SDValue Rnd = Op.getOperand(5);
20349 if (!isRoundModeCurDirection(Rnd)) {
20350 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20351 dl, Op.getValueType(),
20353 Mask, PassThru, Subtarget, DAG);
20356 // TODO: Intrinsics should have fast-math-flags to propagate.
20357 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
20358 Mask, PassThru, Subtarget, DAG);
20360 case INTR_TYPE_2OP_MASK_RM: {
20361 SDValue Src1 = Op.getOperand(1);
20362 SDValue Src2 = Op.getOperand(2);
20363 SDValue PassThru = Op.getOperand(3);
20364 SDValue Mask = Op.getOperand(4);
    // We specify 2 possible modes for intrinsics, with/without rounding
    // mode. First, we check if the intrinsic has a rounding mode (6 operands);
    // if not, we set the rounding mode to "current".
    SDValue Rnd;
    if (Op.getNumOperands() == 6)
      Rnd = Op.getOperand(5);
    else
      Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20374 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20376 Mask, PassThru, Subtarget, DAG);
20378 case INTR_TYPE_3OP_SCALAR_MASK: {
20379 SDValue Src1 = Op.getOperand(1);
20380 SDValue Src2 = Op.getOperand(2);
20381 SDValue Src3 = Op.getOperand(3);
20382 SDValue PassThru = Op.getOperand(4);
20383 SDValue Mask = Op.getOperand(5);
20385 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20386 if (IntrWithRoundingModeOpcode != 0) {
20387 SDValue Rnd = Op.getOperand(6);
20388 if (!isRoundModeCurDirection(Rnd))
20389 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20390 dl, VT, Src1, Src2, Src3, Rnd),
20391 Mask, PassThru, Subtarget, DAG);
20393 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
20395 Mask, PassThru, Subtarget, DAG);
20397 case INTR_TYPE_3OP_IMM8_MASK:
20398 case INTR_TYPE_3OP_MASK: {
20399 SDValue Src1 = Op.getOperand(1);
20400 SDValue Src2 = Op.getOperand(2);
20401 SDValue Src3 = Op.getOperand(3);
20402 SDValue PassThru = Op.getOperand(4);
20403 SDValue Mask = Op.getOperand(5);
20405 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
20406 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
20408 // We specify 2 possible opcodes for intrinsics with rounding modes.
20409 // First, we check if the intrinsic may have non-default rounding mode,
20410 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20411 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20412 if (IntrWithRoundingModeOpcode != 0) {
20413 SDValue Rnd = Op.getOperand(6);
20414 if (!isRoundModeCurDirection(Rnd)) {
20415 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20416 dl, Op.getValueType(),
20417 Src1, Src2, Src3, Rnd),
20418 Mask, PassThru, Subtarget, DAG);
20421 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20423 Mask, PassThru, Subtarget, DAG);
20425 case VPERM_2OP_MASK : {
20426 SDValue Src1 = Op.getOperand(1);
20427 SDValue Src2 = Op.getOperand(2);
20428 SDValue PassThru = Op.getOperand(3);
20429 SDValue Mask = Op.getOperand(4);
20431 // Swap Src1 and Src2 in the node creation
20432 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
20433 Mask, PassThru, Subtarget, DAG);
20435 case VPERM_3OP_MASKZ:
20436 case VPERM_3OP_MASK:{
20437 MVT VT = Op.getSimpleValueType();
20438 // Src2 is the PassThru
20439 SDValue Src1 = Op.getOperand(1);
20440 // PassThru needs to be the same type as the destination in order
20441 // to pattern match correctly.
20442 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
20443 SDValue Src3 = Op.getOperand(3);
20444 SDValue Mask = Op.getOperand(4);
20445 SDValue PassThru = SDValue();
20447 // set PassThru element
20448 if (IntrData->Type == VPERM_3OP_MASKZ)
20449 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20453 // Swap Src1 and Src2 in the node creation
20454 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20455 dl, Op.getValueType(),
20457 Mask, PassThru, Subtarget, DAG);
20461 case FMA_OP_MASK: {
20462 SDValue Src1 = Op.getOperand(1);
20463 SDValue Src2 = Op.getOperand(2);
20464 SDValue Src3 = Op.getOperand(3);
20465 SDValue Mask = Op.getOperand(4);
20466 MVT VT = Op.getSimpleValueType();
20467 SDValue PassThru = SDValue();
20469 // set PassThru element
20470 if (IntrData->Type == FMA_OP_MASKZ)
20471 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20472 else if (IntrData->Type == FMA_OP_MASK3)
20477 // We specify 2 possible opcodes for intrinsics with rounding modes.
20478 // First, we check if the intrinsic may have non-default rounding mode,
20479 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20480 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20481 if (IntrWithRoundingModeOpcode != 0) {
20482 SDValue Rnd = Op.getOperand(5);
20483 if (!isRoundModeCurDirection(Rnd))
20484 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20485 dl, Op.getValueType(),
20486 Src1, Src2, Src3, Rnd),
20487 Mask, PassThru, Subtarget, DAG);
20489 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20490 dl, Op.getValueType(),
20492 Mask, PassThru, Subtarget, DAG);
20494 case FMA_OP_SCALAR_MASK:
20495 case FMA_OP_SCALAR_MASK3:
20496 case FMA_OP_SCALAR_MASKZ: {
20497 SDValue Src1 = Op.getOperand(1);
20498 SDValue Src2 = Op.getOperand(2);
20499 SDValue Src3 = Op.getOperand(3);
20500 SDValue Mask = Op.getOperand(4);
20501 MVT VT = Op.getSimpleValueType();
20502 SDValue PassThru = SDValue();
20504 // set PassThru element
20505 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
20506 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20507 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
20512 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20513 if (IntrWithRoundingModeOpcode != 0) {
20514 SDValue Rnd = Op.getOperand(5);
20515 if (!isRoundModeCurDirection(Rnd))
20516 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
20517 Op.getValueType(), Src1, Src2,
20519 Mask, PassThru, Subtarget, DAG);
20522 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
20523 Op.getValueType(), Src1, Src2,
20525 Mask, PassThru, Subtarget, DAG);
20527 case IFMA_OP_MASKZ:
20528 case IFMA_OP_MASK: {
20529 SDValue Src1 = Op.getOperand(1);
20530 SDValue Src2 = Op.getOperand(2);
20531 SDValue Src3 = Op.getOperand(3);
20532 SDValue Mask = Op.getOperand(4);
20533 MVT VT = Op.getSimpleValueType();
20534 SDValue PassThru = Src1;
20536 // set PassThru element
20537 if (IntrData->Type == IFMA_OP_MASKZ)
20538 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
    // Note: we need to swizzle the operands to pass the multiply operands
    // first.
20542 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
20543 dl, Op.getValueType(),
20545 Mask, PassThru, Subtarget, DAG);
20547 case TERLOG_OP_MASK:
20548 case TERLOG_OP_MASKZ: {
20549 SDValue Src1 = Op.getOperand(1);
20550 SDValue Src2 = Op.getOperand(2);
20551 SDValue Src3 = Op.getOperand(3);
20552 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
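    // The truncated imm8 is a 3-input truth table: bit i of the immediate is
    // the result bit produced when the corresponding bits of Src1/Src2/Src3
    // form the binary index i.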
20553 SDValue Mask = Op.getOperand(5);
20554 MVT VT = Op.getSimpleValueType();
20555 SDValue PassThru = Src1;
20556 // Set PassThru element.
20557 if (IntrData->Type == TERLOG_OP_MASKZ)
20558 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
20560 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20561 Src1, Src2, Src3, Src4),
20562 Mask, PassThru, Subtarget, DAG);
20565 // ISD::FP_ROUND has a second argument that indicates if the truncation
20566 // does not change the value. Set it to 0 since it can change.
20567 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
20568 DAG.getIntPtrConstant(0, dl));
20569 case CVTPD2PS_MASK: {
20570 SDValue Src = Op.getOperand(1);
20571 SDValue PassThru = Op.getOperand(2);
20572 SDValue Mask = Op.getOperand(3);
20573 // We add rounding mode to the Node when
20574 // - RM Opcode is specified and
20575 // - RM is not "current direction".
20576 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
20577 if (IntrWithRoundingModeOpcode != 0) {
20578 SDValue Rnd = Op.getOperand(4);
20579 if (!isRoundModeCurDirection(Rnd)) {
20580 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
20581 dl, Op.getValueType(),
20583 Mask, PassThru, Subtarget, DAG);
20586 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
20587 // ISD::FP_ROUND has a second argument that indicates if the truncation
20588 // does not change the value. Set it to 0 since it can change.
20589 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
20590 DAG.getIntPtrConstant(0, dl)),
20591 Mask, PassThru, Subtarget, DAG);
20594 // FPclass intrinsics with mask
20595 SDValue Src1 = Op.getOperand(1);
20596 MVT VT = Src1.getSimpleValueType();
20597 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20598 SDValue Imm = Op.getOperand(2);
20599 SDValue Mask = Op.getOperand(3);
20600 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20601 Mask.getSimpleValueType().getSizeInBits());
20602 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
20603 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
20605 // Need to fill with zeros to ensure the bitcast will produce zeroes
20606 // for the upper bits in the v2i1/v4i1 case.
20607 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20608 DAG.getConstant(0, dl, BitcastVT),
20609 FPclassMask, DAG.getIntPtrConstant(0, dl));
20610 return DAG.getBitcast(Op.getValueType(), Res);
20613 SDValue Src1 = Op.getOperand(1);
20614 SDValue Imm = Op.getOperand(2);
20615 SDValue Mask = Op.getOperand(3);
20616 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
20617 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
20619 // Need to fill with zeros to ensure the bitcast will produce zeroes
20620 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20621 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20622 DAG.getConstant(0, dl, MVT::v8i1),
20623 FPclassMask, DAG.getIntPtrConstant(0, dl));
20624 return DAG.getBitcast(MVT::i8, Ins);
20627 // Comparison intrinsics with masks.
20628 // Example of transformation:
20629 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
20630 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
20632 // (v8i1 (insert_subvector zero,
20633 // (v2i1 (and (PCMPEQM %a, %b),
20634 // (extract_subvector
20635 // (v8i1 (bitcast %mask)), 0))), 0))))
20636 MVT VT = Op.getOperand(1).getSimpleValueType();
20637 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20638 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
20639 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
20640 Mask.getSimpleValueType().getSizeInBits());
20641 SDValue Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20643 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
20645 // Need to fill with zeros to ensure the bitcast will produce zeroes
20646 // for the upper bits in the v2i1/v4i1 case.
20647 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
20648 DAG.getConstant(0, dl, BitcastVT),
20649 CmpMask, DAG.getIntPtrConstant(0, dl));
20650 return DAG.getBitcast(Op.getValueType(), Res);
20653 case CMP_MASK_CC: {
20654 MVT VT = Op.getOperand(1).getSimpleValueType();
20655 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20657 SDValue CC = Op.getOperand(3);
20658 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
20659 // We specify 2 possible opcodes for intrinsics with rounding modes.
20660 // First, we check if the intrinsic may have non-default rounding mode,
20661 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
20662 if (IntrData->Opc1 != 0) {
20663 SDValue Rnd = Op.getOperand(4);
20664 if (!isRoundModeCurDirection(Rnd))
20665 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
20666 Op.getOperand(2), CC, Rnd);
    }
    // Default rounding mode.
20669 if (!Cmp.getNode())
20670 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
20671 Op.getOperand(2), CC);
20675 case CMP_MASK_SCALAR_CC: {
20676 SDValue Src1 = Op.getOperand(1);
20677 SDValue Src2 = Op.getOperand(2);
20678 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
20679 SDValue Mask = Op.getOperand(4);
20682 if (IntrData->Opc1 != 0) {
20683 SDValue Rnd = Op.getOperand(5);
20684 if (!isRoundModeCurDirection(Rnd))
20685 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
    }
    // Default rounding mode.
    if (!Cmp.getNode())
      Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
20691 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
20693 // Need to fill with zeros to ensure the bitcast will produce zeroes
20694 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20695 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
20696 DAG.getConstant(0, dl, MVT::v8i1),
20697 CmpMask, DAG.getIntPtrConstant(0, dl));
20698 return DAG.getBitcast(MVT::i8, Ins);
20700 case COMI: { // Comparison intrinsics
20701 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
20702 SDValue LHS = Op.getOperand(1);
20703 SDValue RHS = Op.getOperand(2);
20704 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
20705 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
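    // COMIS/UCOMIS set ZF/PF/CF as follows: equal -> 1/0/0, less -> 0/0/1,
    // greater -> 0/0/0, unordered -> 1/1/1. PF therefore distinguishes the
    // unordered result, and the unsigned-style conditions below read CF/ZF.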
    SDValue SetCC;
    switch (CC) {
    case ISD::SETEQ: { // Ordered equal: ZF = 1 and PF = 0.
      SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
      SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
      SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
      break;
    }
    case ISD::SETNE: { // Not equal or unordered: ZF = 0 or PF = 1.
      SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
      SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
      SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
      break;
    }
    case ISD::SETGT: // (CF = 0 and ZF = 0)
      SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
      break;
    case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
      SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
      break;
    }
    case ISD::SETGE: // CF = 0
      SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
      break;
    case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
      SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
      break;
    default:
      llvm_unreachable("Unexpected illegal condition!");
    }
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20738 case COMI_RM: { // Comparison intrinsics with Sae
20739 SDValue LHS = Op.getOperand(1);
20740 SDValue RHS = Op.getOperand(2);
20741 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
20742 SDValue Sae = Op.getOperand(4);
20745 if (isRoundModeCurDirection(Sae))
20746 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
20747 DAG.getConstant(CondVal, dl, MVT::i8));
20749 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
20750 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
20751 // Need to fill with zeros to ensure the bitcast will produce zeroes
20752 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
20753 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
20754 DAG.getConstant(0, dl, MVT::v16i1),
20755 FCmp, DAG.getIntPtrConstant(0, dl));
20756 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32,
20757 DAG.getBitcast(MVT::i16, Ins));
20760 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
20761 Op.getOperand(1), Op.getOperand(2), Subtarget,
20763 case COMPRESS_EXPAND_IN_REG: {
20764 SDValue Mask = Op.getOperand(3);
20765 SDValue DataToCompress = Op.getOperand(1);
20766 SDValue PassThru = Op.getOperand(2);
20767 if (isAllOnesConstant(Mask)) // return data as is
20768 return Op.getOperand(1);
20770 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20772 Mask, PassThru, Subtarget, DAG);
20775 case FIXUPIMMS_MASKZ:
20777 case FIXUPIMM_MASKZ:{
20778 SDValue Src1 = Op.getOperand(1);
20779 SDValue Src2 = Op.getOperand(2);
20780 SDValue Src3 = Op.getOperand(3);
20781 SDValue Imm = Op.getOperand(4);
20782 SDValue Mask = Op.getOperand(5);
20783 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
20784 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
    // We specify 2 possible modes for intrinsics, with/without rounding
    // mode. First, we check if the intrinsic has a rounding mode (7 operands);
    // if not, we set the rounding mode to "current".
    SDValue Rnd;
    if (Op.getNumOperands() == 7)
      Rnd = Op.getOperand(6);
    else
      Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
20794 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
20795 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20796 Src1, Src2, Src3, Imm, Rnd),
20797 Mask, Passthru, Subtarget, DAG);
20798 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
20799 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
20800 Src1, Src2, Src3, Imm, Rnd),
20801 Mask, Passthru, Subtarget, DAG);
20804 assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
20805 // Clear the upper bits of the rounding immediate so that the legacy
20806 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20807 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20809 DAG.getConstant(0xf, dl, MVT::i32));
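    // (In the VRNDSCALE immediate the low 4 bits hold the rounding control
    // and the upper 4 bits hold the scale; the legacy ROUND* intrinsics only
    // define the low 4 bits, so the upper bits are cleared here.)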
20810 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20811 Op.getOperand(1), RoundingMode);
20814 assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
20815 // Clear the upper bits of the rounding immediate so that the legacy
20816 // intrinsic can't trigger the scaling behavior of VRNDSCALE.
20817 SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
20819 DAG.getConstant(0xf, dl, MVT::i32));
20820 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
20821 Op.getOperand(1), Op.getOperand(2), RoundingMode);
20829 default: return SDValue(); // Don't custom lower most intrinsics.
20831 case Intrinsic::x86_sse41_pmuldq:
20832 case Intrinsic::x86_avx2_pmul_dq:
20833 case Intrinsic::x86_avx512_pmul_dq_512: {
20834 MVT OpVT = Op.getSimpleValueType();
20835 return DAG.getNode(X86ISD::PMULDQ, dl, OpVT,
20836 DAG.getBitcast(OpVT, Op.getOperand(1)),
20837 DAG.getBitcast(OpVT, Op.getOperand(2)));
20840 case Intrinsic::x86_sse2_pmulu_dq:
20841 case Intrinsic::x86_avx2_pmulu_dq:
20842 case Intrinsic::x86_avx512_pmulu_dq_512: {
20843 MVT OpVT = Op.getSimpleValueType();
20844 return DAG.getNode(X86ISD::PMULUDQ, dl, OpVT,
20845 DAG.getBitcast(OpVT, Op.getOperand(1)),
20846 DAG.getBitcast(OpVT, Op.getOperand(2)));
20849 case Intrinsic::x86_avx2_permd:
20850 case Intrinsic::x86_avx2_permps:
20851 // Operands intentionally swapped. Mask is last operand to intrinsic,
20852 // but second operand for node/instruction.
20853 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20854 Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // PTEST or TESTP pattern and a setcc for the result.
20859 case Intrinsic::x86_sse41_ptestz:
20860 case Intrinsic::x86_sse41_ptestc:
20861 case Intrinsic::x86_sse41_ptestnzc:
20862 case Intrinsic::x86_avx_ptestz_256:
20863 case Intrinsic::x86_avx_ptestc_256:
20864 case Intrinsic::x86_avx_ptestnzc_256:
20865 case Intrinsic::x86_avx_vtestz_ps:
20866 case Intrinsic::x86_avx_vtestc_ps:
20867 case Intrinsic::x86_avx_vtestnzc_ps:
20868 case Intrinsic::x86_avx_vtestz_pd:
20869 case Intrinsic::x86_avx_vtestc_pd:
20870 case Intrinsic::x86_avx_vtestnzc_pd:
20871 case Intrinsic::x86_avx_vtestz_ps_256:
20872 case Intrinsic::x86_avx_vtestc_ps_256:
20873 case Intrinsic::x86_avx_vtestnzc_ps_256:
20874 case Intrinsic::x86_avx_vtestz_pd_256:
20875 case Intrinsic::x86_avx_vtestc_pd_256:
20876 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20877 bool IsTestPacked = false;
20878 X86::CondCode X86CC;
20880 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20881 case Intrinsic::x86_avx_vtestz_ps:
20882 case Intrinsic::x86_avx_vtestz_pd:
20883 case Intrinsic::x86_avx_vtestz_ps_256:
20884 case Intrinsic::x86_avx_vtestz_pd_256:
20885 IsTestPacked = true;
20887 case Intrinsic::x86_sse41_ptestz:
20888 case Intrinsic::x86_avx_ptestz_256:
20890 X86CC = X86::COND_E;
20892 case Intrinsic::x86_avx_vtestc_ps:
20893 case Intrinsic::x86_avx_vtestc_pd:
20894 case Intrinsic::x86_avx_vtestc_ps_256:
20895 case Intrinsic::x86_avx_vtestc_pd_256:
20896 IsTestPacked = true;
20898 case Intrinsic::x86_sse41_ptestc:
20899 case Intrinsic::x86_avx_ptestc_256:
20901 X86CC = X86::COND_B;
20903 case Intrinsic::x86_avx_vtestnzc_ps:
20904 case Intrinsic::x86_avx_vtestnzc_pd:
20905 case Intrinsic::x86_avx_vtestnzc_ps_256:
20906 case Intrinsic::x86_avx_vtestnzc_pd_256:
20907 IsTestPacked = true;
20909 case Intrinsic::x86_sse41_ptestnzc:
20910 case Intrinsic::x86_avx_ptestnzc_256:
20912 X86CC = X86::COND_A;
20916 SDValue LHS = Op.getOperand(1);
20917 SDValue RHS = Op.getOperand(2);
20918 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
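    // PTEST/TESTP set ZF if (LHS & RHS) == 0 and CF if (~LHS & RHS) == 0, so
    // the testz/testc/testnzc variants map to COND_E, COND_B and COND_A above.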
20919 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20920 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20921 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20924 case Intrinsic::x86_sse42_pcmpistria128:
20925 case Intrinsic::x86_sse42_pcmpestria128:
20926 case Intrinsic::x86_sse42_pcmpistric128:
20927 case Intrinsic::x86_sse42_pcmpestric128:
20928 case Intrinsic::x86_sse42_pcmpistrio128:
20929 case Intrinsic::x86_sse42_pcmpestrio128:
20930 case Intrinsic::x86_sse42_pcmpistris128:
20931 case Intrinsic::x86_sse42_pcmpestris128:
20932 case Intrinsic::x86_sse42_pcmpistriz128:
20933 case Intrinsic::x86_sse42_pcmpestriz128: {
20935 X86::CondCode X86CC;
20937 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20938 case Intrinsic::x86_sse42_pcmpistria128:
20939 Opcode = X86ISD::PCMPISTRI;
20940 X86CC = X86::COND_A;
20942 case Intrinsic::x86_sse42_pcmpestria128:
20943 Opcode = X86ISD::PCMPESTRI;
20944 X86CC = X86::COND_A;
20946 case Intrinsic::x86_sse42_pcmpistric128:
20947 Opcode = X86ISD::PCMPISTRI;
20948 X86CC = X86::COND_B;
20950 case Intrinsic::x86_sse42_pcmpestric128:
20951 Opcode = X86ISD::PCMPESTRI;
20952 X86CC = X86::COND_B;
20954 case Intrinsic::x86_sse42_pcmpistrio128:
20955 Opcode = X86ISD::PCMPISTRI;
20956 X86CC = X86::COND_O;
20958 case Intrinsic::x86_sse42_pcmpestrio128:
20959 Opcode = X86ISD::PCMPESTRI;
20960 X86CC = X86::COND_O;
20962 case Intrinsic::x86_sse42_pcmpistris128:
20963 Opcode = X86ISD::PCMPISTRI;
20964 X86CC = X86::COND_S;
20966 case Intrinsic::x86_sse42_pcmpestris128:
20967 Opcode = X86ISD::PCMPESTRI;
20968 X86CC = X86::COND_S;
20970 case Intrinsic::x86_sse42_pcmpistriz128:
20971 Opcode = X86ISD::PCMPISTRI;
20972 X86CC = X86::COND_E;
20974 case Intrinsic::x86_sse42_pcmpestriz128:
20975 Opcode = X86ISD::PCMPESTRI;
20976 X86CC = X86::COND_E;
20979 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20980 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20981 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20982 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20983 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20986 case Intrinsic::x86_sse42_pcmpistri128:
20987 case Intrinsic::x86_sse42_pcmpestri128: {
20989 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
20990 Opcode = X86ISD::PCMPISTRI;
20992 Opcode = X86ISD::PCMPESTRI;
20994 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20995 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20996 return DAG.getNode(Opcode, dl, VTs, NewOps);
20999 case Intrinsic::eh_sjlj_lsda: {
21000 MachineFunction &MF = DAG.getMachineFunction();
21001 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
21002 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
21003 auto &Context = MF.getMMI().getContext();
21004 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
21005 Twine(MF.getFunctionNumber()));
21006 return DAG.getNode(getGlobalWrapperKind(), dl, VT,
21007 DAG.getMCSymbol(S, PtrVT));
21010 case Intrinsic::x86_seh_lsda: {
21011 // Compute the symbol for the LSDA. We know it'll get emitted later.
21012 MachineFunction &MF = DAG.getMachineFunction();
21013 SDValue Op1 = Op.getOperand(1);
21014 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
21015 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
21016 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
21018 // Generate a simple absolute symbol reference. This intrinsic is only
21019 // supported on 32-bit Windows, which isn't PIC.
21020 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
21021 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
21024 case Intrinsic::x86_seh_recoverfp: {
21025 SDValue FnOp = Op.getOperand(1);
21026 SDValue IncomingFPOp = Op.getOperand(2);
21027 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
21028 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
21030 report_fatal_error(
21031 "llvm.x86.seh.recoverfp must take a function as the first argument");
21032 return recoverFramePointer(DAG, Fn, IncomingFPOp);
21035 case Intrinsic::localaddress: {
21036 // Returns one of the stack, base, or frame pointer registers, depending on
21037 // which is used to reference local variables.
21038 MachineFunction &MF = DAG.getMachineFunction();
21039 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21041 if (RegInfo->hasBasePointer(MF))
21042 Reg = RegInfo->getBaseRegister();
21043 else // This function handles the SP or FP case.
21044 Reg = RegInfo->getPtrSizedFrameRegister(MF);
21045 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
21050 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21051 SDValue Src, SDValue Mask, SDValue Base,
21052 SDValue Index, SDValue ScaleOp, SDValue Chain,
21053 const X86Subtarget &Subtarget) {
21055 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21056 // Scale must be constant.
21059 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21060 EVT MaskVT = Mask.getValueType();
21061 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21062 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21063 SDValue Segment = DAG.getRegister(0, MVT::i32);
21064 // If source is undef or we know it won't be used, use a zero vector
21065 // to break register dependency.
21066 // TODO: use undef instead and let BreakFalseDeps deal with it?
21067 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
21068 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
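  // The operand order below is the standard X86 memory reference
  // (Base, Scale, Index, Disp, Segment) expected by the gather machine
  // instructions, followed by the mask and the chain.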
21069 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
21070 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21071 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21072 return DAG.getMergeValues(RetOps, dl);
21075 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21076 SDValue Src, SDValue Mask, SDValue Base,
21077 SDValue Index, SDValue ScaleOp, SDValue Chain,
21078 const X86Subtarget &Subtarget) {
21080 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21081 // Scale must be constant.
21084 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21085 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21086 Index.getSimpleValueType().getVectorNumElements());
21088 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21089 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
21090 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21091 SDValue Segment = DAG.getRegister(0, MVT::i32);
21092 // If source is undef or we know it won't be used, use a zero vector
21093 // to break register dependency.
21094 // TODO: use undef instead and let BreakFalseDeps deal with it?
21095 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
21096 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
21097 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
21098 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21099 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
21100 return DAG.getMergeValues(RetOps, dl);
21103 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21104 SDValue Src, SDValue Mask, SDValue Base,
21105 SDValue Index, SDValue ScaleOp, SDValue Chain,
21106 const X86Subtarget &Subtarget) {
21108 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21109 // Scale must be constant.
21112 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21113 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21114 SDValue Segment = DAG.getRegister(0, MVT::i32);
21115 MVT MaskVT = MVT::getVectorVT(MVT::i1,
21116 Index.getSimpleValueType().getVectorNumElements());
21118 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21119 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
21120 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
21121 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
21122 return SDValue(Res, 1);
21125 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
21126 SDValue Mask, SDValue Base, SDValue Index,
21127 SDValue ScaleOp, SDValue Chain,
21128 const X86Subtarget &Subtarget) {
21130 auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
21131 // Scale must be constant.
21134 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
21135 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
21136 SDValue Segment = DAG.getRegister(0, MVT::i32);
21138 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
21139 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21140 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
21141 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
21142 return SDValue(Res, 0);
21145 /// Handles the lowering of builtin intrinsic that return the value
21146 /// of the extended control register.
21147 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
21150 SmallVectorImpl<SDValue> &Results) {
21151 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21152 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
21159 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
21160 Chain = SDValue(N1, 0);
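  // With ECX = 0 this reads XCR0, whose bits report which register state
  // components (x87, SSE, AVX, ...) the OS has enabled via XSETBV.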
21162 // Reads the content of XCR and returns it in registers EDX:EAX.
21163 if (Subtarget.is64Bit()) {
21164 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
21165 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21168 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
21169 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21172 Chain = HI.getValue(1);
21174 if (Subtarget.is64Bit()) {
21175 // Merge the two 32-bit values into a 64-bit one..
21176 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21177 DAG.getConstant(32, DL, MVT::i8));
21178 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21179 Results.push_back(Chain);
21183 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21184 SDValue Ops[] = { LO, HI };
21185 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21186 Results.push_back(Pair);
21187 Results.push_back(Chain);
21190 /// Handles the lowering of builtin intrinsics that read performance monitor
21191 /// counters (x86_rdpmc).
21192 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
21195 SmallVectorImpl<SDValue> &Results) {
21196 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21197 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
21204 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
21206 // Reads the content of a 64-bit performance counter and returns it in the
21207 // registers EDX:EAX.
21208 if (Subtarget.is64Bit()) {
21209 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21210 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21213 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21214 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21217 Chain = HI.getValue(1);
21219 if (Subtarget.is64Bit()) {
21220 // The EAX register is loaded with the low-order 32 bits. The EDX register
21221 // is loaded with the supported high-order bits of the counter.
21222 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21223 DAG.getConstant(32, DL, MVT::i8));
21224 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21225 Results.push_back(Chain);
21229 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21230 SDValue Ops[] = { LO, HI };
21231 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21232 Results.push_back(Pair);
21233 Results.push_back(Chain);
21236 /// Handles the lowering of builtin intrinsics that read the time stamp counter
21237 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
21238 /// READCYCLECOUNTER nodes.
21239 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
21242 SmallVectorImpl<SDValue> &Results) {
21243 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
21244 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
21248 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
21249 // and the EAX register is loaded with the low-order 32 bits.
21250 if (Subtarget.is64Bit()) {
21251 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
21252 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
21255 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
21256 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
21259 SDValue Chain = HI.getValue(1);
21261 if (Opcode == X86ISD::RDTSCP_DAG) {
21262 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
21264 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
21265 // the ECX register. Add 'ecx' explicitly to the chain.
21266 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
21268 // Explicitly store the content of ECX at the location passed in input
21269 // to the 'rdtscp' intrinsic.
21270 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
21271 MachinePointerInfo());
21274 if (Subtarget.is64Bit()) {
21275 // The EDX register is loaded with the high-order 32 bits of the MSR, and
21276 // the EAX register is loaded with the low-order 32 bits.
21277 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
21278 DAG.getConstant(32, DL, MVT::i8));
21279 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
21280 Results.push_back(Chain);
21284 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
21285 SDValue Ops[] = { LO, HI };
21286 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
21287 Results.push_back(Pair);
21288 Results.push_back(Chain);
21291 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
21292 SelectionDAG &DAG) {
21293 SmallVector<SDValue, 2> Results;
21295 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
21297 return DAG.getMergeValues(Results, DL);
21300 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
21301 MachineFunction &MF = DAG.getMachineFunction();
21302 SDValue Chain = Op.getOperand(0);
21303 SDValue RegNode = Op.getOperand(2);
21304 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
21318 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
21319 MachineFunction &MF = DAG.getMachineFunction();
21320 SDValue Chain = Op.getOperand(0);
21321 SDValue EHGuard = Op.getOperand(2);
21322 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
21336 /// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
21339 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
21340 SelectionDAG &DAG) {
21342 SDVTList VTs = DAG.getVTList(MVT::Other);
21343 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
21344 SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
      DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
      DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
21350 /// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
21353 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
21354 MachineMemOperand *MMO, SelectionDAG &DAG) {
21356 SDVTList VTs = DAG.getVTList(MVT::Other);
21357 SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
      DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
      DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
21363 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
21364 SelectionDAG &DAG) {
21365 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
21367 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
21370 case llvm::Intrinsic::x86_seh_ehregnode:
21371 return MarkEHRegistrationNode(Op, DAG);
21372 case llvm::Intrinsic::x86_seh_ehguard:
21373 return MarkEHGuard(Op, DAG);
21374 case llvm::Intrinsic::x86_flags_read_u32:
21375 case llvm::Intrinsic::x86_flags_read_u64:
21376 case llvm::Intrinsic::x86_flags_write_u32:
21377 case llvm::Intrinsic::x86_flags_write_u64: {
21378 // We need a frame pointer because this will get lowered to a PUSH/POP
21380 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21381 MFI.setHasCopyImplyingStackAdjustment(true);
21382 // Don't do anything here, we will expand these intrinsics out later
21383 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
21386 case Intrinsic::x86_lwpins32:
21387 case Intrinsic::x86_lwpins64: {
21389 SDValue Chain = Op->getOperand(0);
21390 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
21392 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
21393 Op->getOperand(3), Op->getOperand(4));
21394 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
21395 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
21396 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
21397 LwpIns.getValue(1));
21404 switch(IntrData->Type) {
21405 default: llvm_unreachable("Unknown Intrinsic Type");
21408 // Emit the node with the right value type.
21409 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
21410 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21412 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
21413 // Otherwise return the value from Rand, which is always 0, casted to i32.
21414 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
21415 DAG.getConstant(1, dl, Op->getValueType(1)),
21416 DAG.getConstant(X86::COND_B, dl, MVT::i8),
21417 SDValue(Result.getNode(), 1) };
21418 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
21420 // Return { result, isValid, chain }.
21421 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
21422 SDValue(Result.getNode(), 2));
21424 case GATHER_AVX2: {
21425 SDValue Chain = Op.getOperand(0);
21426 SDValue Src = Op.getOperand(2);
21427 SDValue Base = Op.getOperand(3);
21428 SDValue Index = Op.getOperand(4);
21429 SDValue Mask = Op.getOperand(5);
21430 SDValue Scale = Op.getOperand(6);
21431 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21432 Scale, Chain, Subtarget);
21435 //gather(v1, mask, index, base, scale);
21436 SDValue Chain = Op.getOperand(0);
21437 SDValue Src = Op.getOperand(2);
21438 SDValue Base = Op.getOperand(3);
21439 SDValue Index = Op.getOperand(4);
21440 SDValue Mask = Op.getOperand(5);
21441 SDValue Scale = Op.getOperand(6);
21442 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
21446 //scatter(base, mask, index, v1, scale);
21447 SDValue Chain = Op.getOperand(0);
21448 SDValue Base = Op.getOperand(2);
21449 SDValue Mask = Op.getOperand(3);
21450 SDValue Index = Op.getOperand(4);
21451 SDValue Src = Op.getOperand(5);
21452 SDValue Scale = Op.getOperand(6);
21453 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
21454 Scale, Chain, Subtarget);
21457 SDValue Hint = Op.getOperand(6);
21458 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
21459 assert((HintVal == 2 || HintVal == 3) &&
21460 "Wrong prefetch hint in intrinsic: should be 2 or 3");
21461 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
21462 SDValue Chain = Op.getOperand(0);
21463 SDValue Mask = Op.getOperand(2);
21464 SDValue Index = Op.getOperand(3);
21465 SDValue Base = Op.getOperand(4);
21466 SDValue Scale = Op.getOperand(5);
21467 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
21470 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
21472 SmallVector<SDValue, 2> Results;
21473 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
21475 return DAG.getMergeValues(Results, dl);
21477 // Read Performance Monitoring Counters.
21479 SmallVector<SDValue, 2> Results;
21480 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
21481 return DAG.getMergeValues(Results, dl);
21483 // Get Extended Control Register.
21485 SmallVector<SDValue, 2> Results;
21486 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
21487 return DAG.getMergeValues(Results, dl);
21489 // XTEST intrinsics.
21491 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
21492 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
21494 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
21495 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
21496 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
21497 Ret, SDValue(InTrans.getNode(), 1));
21501 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
21502 SDVTList VTs = DAG.getVTList(Op.getOperand(3).getValueType(), MVT::i32);
21503 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
21504 DAG.getConstant(-1, dl, MVT::i8));
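// A brief illustration (added note, not from the original source): adding 0xFF
// to the i8 carry-in re-materializes CF, since 0x00 + 0xFF = 0xFF leaves CF
// clear, while any nonzero value, e.g. 0x01 + 0xFF = 0x100, overflows the byte
// and sets CF for the carry-consuming node emitted below.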
21505 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
21506 Op.getOperand(4), GenCF.getValue(1));
21507 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
21508 Op.getOperand(5), MachinePointerInfo());
21509 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
21510 SDValue Results[] = { SetCC, Store };
21511 return DAG.getMergeValues(Results, dl);
21513 case COMPRESS_TO_MEM: {
21514 SDValue Mask = Op.getOperand(4);
21515 SDValue DataToCompress = Op.getOperand(3);
21516 SDValue Addr = Op.getOperand(2);
21517 SDValue Chain = Op.getOperand(0);
21518 MVT VT = DataToCompress.getSimpleValueType();
21520 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21521 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21523 if (isAllOnesConstant(Mask)) // return just a store
21524 return DAG.getStore(Chain, dl, DataToCompress, Addr,
21525 MemIntr->getMemOperand());
21527 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21528 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21530 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
21531 MemIntr->getMemOperand(),
21532 false /* truncating */, true /* compressing */);
21534 case TRUNCATE_TO_MEM_VI8:
21535 case TRUNCATE_TO_MEM_VI16:
21536 case TRUNCATE_TO_MEM_VI32: {
21537 SDValue Mask = Op.getOperand(4);
21538 SDValue DataToTruncate = Op.getOperand(3);
21539 SDValue Addr = Op.getOperand(2);
21540 SDValue Chain = Op.getOperand(0);
21542 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21543 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21545 EVT MemVT = MemIntr->getMemoryVT();
21547 uint16_t TruncationOp = IntrData->Opc0;
21548 switch (TruncationOp) {
21549 case X86ISD::VTRUNC: {
21550 if (isAllOnesConstant(Mask)) // return just a truncate store
21551 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
21552 MemIntr->getMemOperand());
21554 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21555 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21557 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
21558 MemIntr->getMemOperand(), true /* truncating */);
21560 case X86ISD::VTRUNCUS:
21561 case X86ISD::VTRUNCS: {
21562 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
21563 if (isAllOnesConstant(Mask))
21564 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
21565 MemIntr->getMemOperand(), DAG);
21567 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
21568 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21570 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
21571 VMask, MemVT, MemIntr->getMemOperand(), DAG);
21574 llvm_unreachable("Unsupported truncstore intrinsic");
21578 case EXPAND_FROM_MEM: {
21579 SDValue Mask = Op.getOperand(4);
21580 SDValue PassThru = Op.getOperand(3);
21581 SDValue Addr = Op.getOperand(2);
21582 SDValue Chain = Op.getOperand(0);
21583 MVT VT = Op.getSimpleValueType();
21585 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
21586 assert(MemIntr && "Expected MemIntrinsicSDNode!");
21588 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
21589 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
21590 if (X86::isZeroNode(Mask))
21591 return DAG.getUNDEF(VT);
21593 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21594 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
21595 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
21596 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
21597 true /* expanding */);
21602 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
21603 SelectionDAG &DAG) const {
21604 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
21605 MFI.setReturnAddressIsTaken(true);
21607 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
21610 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21612 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21615 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
21616 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21617 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
21618 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
21619 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
21620 MachinePointerInfo());
21623 // Just load the return address.
21624 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
21625 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
21626 MachinePointerInfo());
21629 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
21630 SelectionDAG &DAG) const {
21631 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
21632 return getReturnAddressFrameIndex(DAG);
21635 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
21636 MachineFunction &MF = DAG.getMachineFunction();
21637 MachineFrameInfo &MFI = MF.getFrameInfo();
21638 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
21639 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21640 EVT VT = Op.getValueType();
21642 MFI.setFrameAddressIsTaken(true);
21644 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
21645 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
21646 // is not possible to crawl up the stack without looking at the unwind codes simultaneously.
21648 int FrameAddrIndex = FuncInfo->getFAIndex();
21649 if (!FrameAddrIndex) {
21650 // Set up a frame object for the return address.
21651 unsigned SlotSize = RegInfo->getSlotSize();
21652 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
21653 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
21654 FuncInfo->setFAIndex(FrameAddrIndex);
21656 return DAG.getFrameIndex(FrameAddrIndex, VT);
21659 unsigned FrameReg =
21660 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21661 SDLoc dl(Op); // FIXME probably not meaningful
21662 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
21663 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
21664 (FrameReg == X86::EBP && VT == MVT::i32)) &&
21665 "Invalid Frame Register!");
21666 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
21668 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
21669 MachinePointerInfo());
21673 // FIXME? Maybe this could be a TableGen attribute on some registers and
21674 // this table could be generated automatically from RegInfo.
21675 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
21676 SelectionDAG &DAG) const {
21677 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21678 const MachineFunction &MF = DAG.getMachineFunction();
21680 unsigned Reg = StringSwitch<unsigned>(RegName)
21681 .Case("esp", X86::ESP)
21682 .Case("rsp", X86::RSP)
21683 .Case("ebp", X86::EBP)
21684 .Case("rbp", X86::RBP)
21687 if (Reg == X86::EBP || Reg == X86::RBP) {
21688 if (!TFI.hasFP(MF))
21689 report_fatal_error("register " + StringRef(RegName) +
21690 " is allocatable: function has no frame pointer");
21693 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21694 unsigned FrameReg =
21695 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
21696 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
21697 "Invalid Frame Register!");
21705 report_fatal_error("Invalid register name global variable");
21708 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
21709 SelectionDAG &DAG) const {
21710 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21711 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
21714 unsigned X86TargetLowering::getExceptionPointerRegister(
21715 const Constant *PersonalityFn) const {
21716 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
21717 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21719 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
21722 unsigned X86TargetLowering::getExceptionSelectorRegister(
21723 const Constant *PersonalityFn) const {
21724 // Funclet personalities don't use selectors (the runtime does the selection).
21725 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
21726 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
21729 bool X86TargetLowering::needsFixedCatchObjects() const {
21730 return Subtarget.isTargetWin64();
21733 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
21734 SDValue Chain = Op.getOperand(0);
21735 SDValue Offset = Op.getOperand(1);
21736 SDValue Handler = Op.getOperand(2);
21739 EVT PtrVT = getPointerTy(DAG.getDataLayout());
21740 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
21741 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
21742 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
21743 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
21744 "Invalid Frame Register!");
21745 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
21746 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
21748 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
21749 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
21751 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
21752 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
21753 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
21755 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
21756 DAG.getRegister(StoreAddrReg, PtrVT));
21759 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
21760 SelectionDAG &DAG) const {
21762 // If the subtarget is not 64bit, we may need the global base reg
21763 // after isel expand pseudo, i.e., after CGBR pass ran.
21764 // Therefore, ask for the GlobalBaseReg now, so that the pass
21765 // inserts the code for us in case we need it.
21766 // Otherwise, we will end up in a situation where we will
21767 // reference a virtual register that is not defined!
21768 if (!Subtarget.is64Bit()) {
21769 const X86InstrInfo *TII = Subtarget.getInstrInfo();
21770 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
21772 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
21773 DAG.getVTList(MVT::i32, MVT::Other),
21774 Op.getOperand(0), Op.getOperand(1));
21777 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
21778 SelectionDAG &DAG) const {
21780 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
21781 Op.getOperand(0), Op.getOperand(1));
21784 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
21785 SelectionDAG &DAG) const {
21787 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
21791 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
21792 return Op.getOperand(0);
21795 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
21796 SelectionDAG &DAG) const {
21797 SDValue Root = Op.getOperand(0);
21798 SDValue Trmp = Op.getOperand(1); // trampoline
21799 SDValue FPtr = Op.getOperand(2); // nested function
21800 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
21803 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
21804 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
21806 if (Subtarget.is64Bit()) {
21807 SDValue OutChains[6];
21809 // Large code-model.
21810 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
21811 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
21813 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
21814 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21816 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
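// Rough sketch of the 23 trampoline bytes emitted below (the i16 opcode stores
// are little-endian, and N86R11 == 3, N86R10 == 2):
//   +0:  49 BB <imm64 FPtr>   movabsq $FPtr, %r11
//   +10: 49 BA <imm64 Nest>   movabsq $Nest, %r10
//   +20: 49 FF E3             jmpq    *%r11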
21818 // Load the pointer to the nested function into R11.
21819 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21820 SDValue Addr = Trmp;
21821 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21822 Addr, MachinePointerInfo(TrmpAddr));
21824 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21825 DAG.getConstant(2, dl, MVT::i64));
21827 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21828 /* Alignment = */ 2);
21830 // Load the 'nest' parameter value into R10.
21831 // R10 is specified in X86CallingConv.td
21832 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21833 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21834 DAG.getConstant(10, dl, MVT::i64));
21835 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21836 Addr, MachinePointerInfo(TrmpAddr, 10));
21838 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21839 DAG.getConstant(12, dl, MVT::i64));
21841 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21842 /* Alignment = */ 2);
21844 // Jump to the nested function.
21845 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21846 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21847 DAG.getConstant(20, dl, MVT::i64));
21848 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21849 Addr, MachinePointerInfo(TrmpAddr, 20));
21851 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21852 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21853 DAG.getConstant(22, dl, MVT::i64));
21854 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21855 Addr, MachinePointerInfo(TrmpAddr, 22));
21857 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21859 const Function *Func =
21860 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21861 CallingConv::ID CC = Func->getCallingConv();
21866 llvm_unreachable("Unsupported calling convention");
21867 case CallingConv::C:
21868 case CallingConv::X86_StdCall: {
21869 // Pass 'nest' parameter in ECX.
21870 // Must be kept in sync with X86CallingConv.td
21871 NestReg = X86::ECX;
21873 // Check that ECX wasn't needed by an 'inreg' parameter.
21874 FunctionType *FTy = Func->getFunctionType();
21875 const AttributeList &Attrs = Func->getAttributes();
21877 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21878 unsigned InRegCount = 0;
21881 for (FunctionType::param_iterator I = FTy->param_begin(),
21882 E = FTy->param_end(); I != E; ++I, ++Idx)
21883 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21884 auto &DL = DAG.getDataLayout();
21885 // FIXME: should only count parameters that are lowered to integers.
21886 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21889 if (InRegCount > 2) {
21890 report_fatal_error("Nest register in use - reduce number of inreg"
21896 case CallingConv::X86_FastCall:
21897 case CallingConv::X86_ThisCall:
21898 case CallingConv::Fast:
21899 // Pass 'nest' parameter in EAX.
21900 // Must be kept in sync with X86CallingConv.td
21901 NestReg = X86::EAX;
21905 SDValue OutChains[4];
21906 SDValue Addr, Disp;
21908 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21909 DAG.getConstant(10, dl, MVT::i32));
21910 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21912 // This is storing the opcode for MOV32ri.
21913 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21914 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21916 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21917 Trmp, MachinePointerInfo(TrmpAddr));
21919 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21920 DAG.getConstant(1, dl, MVT::i32));
21922 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21923 /* Alignment = */ 1);
21925 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21926 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21927 DAG.getConstant(5, dl, MVT::i32));
21928 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21929 Addr, MachinePointerInfo(TrmpAddr, 5),
21930 /* Alignment = */ 1);
21932 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21933 DAG.getConstant(6, dl, MVT::i32));
21935 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21936 /* Alignment = */ 1);
21938 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21942 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21943 SelectionDAG &DAG) const {
21945 The rounding mode is in bits 11:10 of FPSR, and has the following
21947 00 Round to nearest
21952 FLT_ROUNDS, on the other hand, expects the following:
21959 To perform the conversion, we do:
21960 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
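  A worked illustration of the formula (added for clarity): RC=00 gives
  ((0 | 0) + 1) & 3 = 1 (nearest), RC=01 gives ((0 | 2) + 1) & 3 = 3 (-inf),
  RC=10 gives ((1 | 0) + 1) & 3 = 2 (+inf), and RC=11 gives
  ((1 | 2) + 1) & 3 = 0 (toward zero).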
21963 MachineFunction &MF = DAG.getMachineFunction();
21964 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21965 unsigned StackAlignment = TFI.getStackAlignment();
21966 MVT VT = Op.getSimpleValueType();
21969 // Save FP Control Word to stack slot
21970 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21971 SDValue StackSlot =
21972 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21974 MachineMemOperand *MMO =
21975 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21976 MachineMemOperand::MOStore, 2, 2);
21978 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21979 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21980 DAG.getVTList(MVT::Other),
21981 Ops, MVT::i16, MMO);
21983 // Load FP Control Word from stack slot
21985 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21987 // Transform as necessary
21989 DAG.getNode(ISD::SRL, DL, MVT::i16,
21990 DAG.getNode(ISD::AND, DL, MVT::i16,
21991 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21992 DAG.getConstant(11, DL, MVT::i8));
21994 DAG.getNode(ISD::SRL, DL, MVT::i16,
21995 DAG.getNode(ISD::AND, DL, MVT::i16,
21996 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21997 DAG.getConstant(9, DL, MVT::i8));
22000 DAG.getNode(ISD::AND, DL, MVT::i16,
22001 DAG.getNode(ISD::ADD, DL, MVT::i16,
22002 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
22003 DAG.getConstant(1, DL, MVT::i16)),
22004 DAG.getConstant(3, DL, MVT::i16));
22006 return DAG.getNode((VT.getSizeInBits() < 16 ?
22007 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
22010 // Split a unary integer op into 2 half-sized ops.
22011 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
22012 MVT VT = Op.getSimpleValueType();
22013 unsigned NumElems = VT.getVectorNumElements();
22014 unsigned SizeInBits = VT.getSizeInBits();
22016 // Extract the Lo/Hi vectors
22018 SDValue Src = Op.getOperand(0);
22019 unsigned SrcNumElems = Src.getSimpleValueType().getVectorNumElements();
22020 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
22021 SDValue Hi = extractSubVector(Src, SrcNumElems / 2, DAG, dl, SizeInBits / 2);
22023 MVT EltVT = VT.getVectorElementType();
22024 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
22025 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22026 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
22027 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
22030 // Decompose 256-bit ops into smaller 128-bit ops.
22031 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
22032 assert(Op.getSimpleValueType().is256BitVector() &&
22033 Op.getSimpleValueType().isInteger() &&
22034 "Only handle AVX 256-bit vector integer operation");
22035 return LowerVectorIntUnary(Op, DAG);
22038 // Decompose 512-bit ops into smaller 256-bit ops.
22039 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
22040 assert(Op.getSimpleValueType().is512BitVector() &&
22041 Op.getSimpleValueType().isInteger() &&
22042 "Only handle AVX 512-bit vector integer operation");
22043 return LowerVectorIntUnary(Op, DAG);
22046 /// \brief Lower a vector CTLZ using native supported vector CTLZ instruction.
22048 // i8/i16 vector implemented using dword LZCNT vector instruction
22049 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
22050 // split the vector, perform the operation on its Lo and Hi parts and
22051 // concatenate the results.
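// For example (illustrative, i8 element): x = 0x10 zero-extends to the i32
// value 0x00000010, vplzcntd reports 27 leading zeros, and subtracting
// Delta = 32 - 8 = 24 yields 3, the correct ctlz of the original i8 value.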
22052 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
22053 const X86Subtarget &Subtarget) {
22054 assert(Op.getOpcode() == ISD::CTLZ);
22056 MVT VT = Op.getSimpleValueType();
22057 MVT EltVT = VT.getVectorElementType();
22058 unsigned NumElems = VT.getVectorNumElements();
22060 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
22061 "Unsupported element type");
22063 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
22064 if (NumElems > 16 ||
22065 (NumElems == 16 && !Subtarget.canExtendTo512DQ()))
22066 return LowerVectorIntUnary(Op, DAG);
22068 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
22069 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
22070 "Unsupported value type for operation");
22072 // Use native supported vector instruction vplzcntd.
22073 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
22074 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
22075 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
22076 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
22078 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
22081 // Lower CTLZ using a PSHUFB lookup table implementation.
22082 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
22083 const X86Subtarget &Subtarget,
22084 SelectionDAG &DAG) {
22085 MVT VT = Op.getSimpleValueType();
22086 int NumElts = VT.getVectorNumElements();
22087 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
22088 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
22090 // Per-nibble leading zero PSHUFB lookup table.
22091 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
22092 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
22093 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
22094 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
22096 SmallVector<SDValue, 64> LUTVec;
22097 for (int i = 0; i < NumBytes; ++i)
22098 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22099 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
22101 // Begin by bitcasting the input to byte vector, then split those bytes
22102 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
22103 // If the hi input nibble is zero then we add both results together, otherwise
22104 // we just take the hi result (by masking the lo result to zero before the add).
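// For example (illustrative): for the byte 0x2C the hi nibble is 0x2, so the
// LUT yields 2 and the lo result is masked away (HiZ is false), giving
// ctlz = 2; for 0x07 the hi nibble is 0, so LUT[0] = 4 is added to the lo
// nibble's LUT[7] = 1, giving ctlz = 5.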
22106 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
22107 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
22109 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
22110 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
22111 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
22112 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
22114 if (CurrVT.is512BitVector()) {
22115 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22116 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
22117 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22119 HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
22122 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
22123 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
22124 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
22125 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
22127 // Merge result back from vXi8 back to VT, working on the lo/hi halves
22128 // of the current vector width in the same way we did for the nibbles.
22129 // If the upper half of the input element is zero then add the halves'
22130 // leading zero counts together, otherwise just use the upper half's.
22131 // Double the width of the result until we are at target width.
22132 while (CurrVT != VT) {
22133 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
22134 int CurrNumElts = CurrVT.getVectorNumElements();
22135 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
22136 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
22137 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
22139 // Check if the upper half of the input element is zero.
22140 if (CurrVT.is512BitVector()) {
22141 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
22142 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
22143 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22144 HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
22146 HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
22147 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
22149 HiZ = DAG.getBitcast(NextVT, HiZ);
22151 // Move the upper/lower halves to the lower bits as we'll be extending to
22152 // NextVT. Mask the lower result to zero if HiZ is true and add the results
22154 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
22155 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
22156 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
22157 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
22158 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
22165 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
22166 const X86Subtarget &Subtarget,
22167 SelectionDAG &DAG) {
22168 MVT VT = Op.getSimpleValueType();
22170 if (Subtarget.hasCDI() &&
22171 // vXi8 vectors need to be promoted to 512-bits for vXi32.
22172 (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8))
22173 return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget);
22175 // Decompose 256-bit ops into smaller 128-bit ops.
22176 if (VT.is256BitVector() && !Subtarget.hasInt256())
22177 return Lower256IntUnary(Op, DAG);
22179 // Decompose 512-bit ops into smaller 256-bit ops.
22180 if (VT.is512BitVector() && !Subtarget.hasBWI())
22181 return Lower512IntUnary(Op, DAG);
22183 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
22184 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
22187 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
22188 SelectionDAG &DAG) {
22189 MVT VT = Op.getSimpleValueType();
22191 unsigned NumBits = VT.getSizeInBits();
22193 unsigned Opc = Op.getOpcode();
22196 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
22198 Op = Op.getOperand(0);
22199 if (VT == MVT::i8) {
22200 // Zero extend to i32 since there is not an i8 bsr.
22202 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
22205 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
22206 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
22207 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
22209 if (Opc == ISD::CTLZ) {
22210 // If src is zero (i.e. bsr sets ZF), returns NumBits.
22213 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
22214 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22217 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
22220 // Finally xor with NumBits-1.
22221 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
22222 DAG.getConstant(NumBits - 1, dl, OpVT));
22225 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
22229 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
22230 MVT VT = Op.getSimpleValueType();
22231 unsigned NumBits = VT.getScalarSizeInBits();
22234 if (VT.isVector()) {
22235 SDValue N0 = Op.getOperand(0);
22236 SDValue Zero = DAG.getConstant(0, dl, VT);
22238 // lsb(x) = (x & -x)
22239 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
22240 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
22242 // cttz_undef(x) = (width - 1) - ctlz(lsb)
22243 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
22244 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
22245 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
22246 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
22249 // cttz(x) = ctpop(lsb - 1)
22250 SDValue One = DAG.getConstant(1, dl, VT);
22251 return DAG.getNode(ISD::CTPOP, dl, VT,
22252 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
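// For example (illustrative): x = 0b01100 gives LSB = 0b00100, so
// ctpop(LSB - 1) = ctpop(0b00011) = 2 = cttz(x); likewise, for 8-bit lanes the
// ZERO_UNDEF form computes (8 - 1) - ctlz(0b00000100) = 7 - 5 = 2.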
22255 assert(Op.getOpcode() == ISD::CTTZ &&
22256 "Only scalar CTTZ requires custom lowering");
22258 // Issue a bsf (scan bits forward) which also sets EFLAGS.
22259 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22260 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
22262 // If src is zero (i.e. bsf sets ZF), returns NumBits.
22265 DAG.getConstant(NumBits, dl, VT),
22266 DAG.getConstant(X86::COND_E, dl, MVT::i8),
22269 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
22272 /// Break a 256-bit integer operation into two new 128-bit ones and then
22273 /// concatenate the result back.
22274 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
22275 MVT VT = Op.getSimpleValueType();
22277 assert(VT.is256BitVector() && VT.isInteger() &&
22278 "Unsupported value type for operation");
22280 unsigned NumElems = VT.getVectorNumElements();
22283 // Extract the LHS vectors
22284 SDValue LHS = Op.getOperand(0);
22285 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
22286 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
22288 // Extract the RHS vectors
22289 SDValue RHS = Op.getOperand(1);
22290 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
22291 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
22293 MVT EltVT = VT.getVectorElementType();
22294 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22296 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22297 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22298 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22301 /// Break a 512-bit integer operation into two new 256-bit ones and then
22302 /// concatenate the result back.
22303 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
22304 MVT VT = Op.getSimpleValueType();
22306 assert(VT.is512BitVector() && VT.isInteger() &&
22307 "Unsupported value type for operation");
22309 unsigned NumElems = VT.getVectorNumElements();
22312 // Extract the LHS vectors
22313 SDValue LHS = Op.getOperand(0);
22314 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
22315 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
22317 // Extract the RHS vectors
22318 SDValue RHS = Op.getOperand(1);
22319 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
22320 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
22322 MVT EltVT = VT.getVectorElementType();
22323 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
22325 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
22326 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
22327 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
22330 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
22331 MVT VT = Op.getSimpleValueType();
22332 if (VT.getScalarType() == MVT::i1)
22333 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
22334 Op.getOperand(0), Op.getOperand(1));
22335 assert(Op.getSimpleValueType().is256BitVector() &&
22336 Op.getSimpleValueType().isInteger() &&
22337 "Only handle AVX 256-bit vector integer operation");
22338 return Lower256IntArith(Op, DAG);
22341 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
22342 MVT VT = Op.getSimpleValueType();
22343 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
22344 // Since X86 does not have CMOV for 8-bit integer, we don't convert
22345 // 8-bit integer abs to NEG and CMOV.
22347 SDValue N0 = Op.getOperand(0);
22348 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
22349 DAG.getConstant(0, DL, VT), N0);
22350 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
22351 SDValue(Neg.getNode(), 1)};
22352 return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
22355 assert(Op.getSimpleValueType().is256BitVector() &&
22356 Op.getSimpleValueType().isInteger() &&
22357 "Only handle AVX 256-bit vector integer operation");
22358 return Lower256IntUnary(Op, DAG);
22361 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
22362 MVT VT = Op.getSimpleValueType();
22364 // For AVX1 cases, split to use legal ops (everything but v4i64).
22365 if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
22366 return Lower256IntArith(Op, DAG);
22369 unsigned Opcode = Op.getOpcode();
22370 SDValue N0 = Op.getOperand(0);
22371 SDValue N1 = Op.getOperand(1);
22373 // For pre-SSE41, we can perform UMIN/UMAX v8i16 by flipping the signbit,
22374 // using the SMIN/SMAX instructions and flipping the signbit back.
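// For example (illustrative, one i16 lane): umin(0xFFFF, 0x0001) becomes
// smin(0x7FFF, 0x8001) = 0x8001 after XORing both inputs with 0x8000, and
// XORing the result with 0x8000 again restores 0x0001, the unsigned minimum.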
22375 if (VT == MVT::v8i16) {
22376 assert((Opcode == ISD::UMIN || Opcode == ISD::UMAX) &&
22377 "Unexpected MIN/MAX opcode");
22378 SDValue Sign = DAG.getConstant(APInt::getSignedMinValue(16), DL, VT);
22379 N0 = DAG.getNode(ISD::XOR, DL, VT, N0, Sign);
22380 N1 = DAG.getNode(ISD::XOR, DL, VT, N1, Sign);
22381 Opcode = (Opcode == ISD::UMIN ? ISD::SMIN : ISD::SMAX);
22382 SDValue Result = DAG.getNode(Opcode, DL, VT, N0, N1);
22383 return DAG.getNode(ISD::XOR, DL, VT, Result, Sign);
22386 // Else, expand to a compare/select.
22389 case ISD::SMIN: CC = ISD::CondCode::SETLT; break;
22390 case ISD::SMAX: CC = ISD::CondCode::SETGT; break;
22391 case ISD::UMIN: CC = ISD::CondCode::SETULT; break;
22392 case ISD::UMAX: CC = ISD::CondCode::SETUGT; break;
22393 default: llvm_unreachable("Unknown MINMAX opcode");
22396 SDValue Cond = DAG.getSetCC(DL, VT, N0, N1, CC);
22397 return DAG.getSelect(DL, VT, Cond, N0, N1);
22400 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
22401 SelectionDAG &DAG) {
22403 MVT VT = Op.getSimpleValueType();
22405 if (VT.getScalarType() == MVT::i1)
22406 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
22408 // Decompose 256-bit ops into smaller 128-bit ops.
22409 if (VT.is256BitVector() && !Subtarget.hasInt256())
22410 return Lower256IntArith(Op, DAG);
22412 SDValue A = Op.getOperand(0);
22413 SDValue B = Op.getOperand(1);
22415 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
22416 // vector pairs, multiply and truncate.
22417 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
22418 if (Subtarget.hasInt256()) {
22419 // For 512-bit vectors, split into 256-bit vectors to allow the
22420 // sign-extension to occur.
22421 if (VT == MVT::v64i8)
22422 return Lower512IntArith(Op, DAG);
22424 // For 256-bit vectors, split into 128-bit vectors to allow the
22425 // sign-extension to occur. We don't need this on AVX512BW as we can
22426 // safely sign-extend to v32i16.
22427 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
22428 return Lower256IntArith(Op, DAG);
22430 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
22431 return DAG.getNode(
22432 ISD::TRUNCATE, dl, VT,
22433 DAG.getNode(ISD::MUL, dl, ExVT,
22434 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
22435 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
22438 assert(VT == MVT::v16i8 &&
22439 "Pre-AVX2 support only supports v16i8 multiplication");
22440 MVT ExVT = MVT::v8i16;
22442 // Extract the lo parts and sign extend to i16
22444 if (Subtarget.hasSSE41()) {
22445 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
22446 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
22448 // We're going to mask off the low byte of each result element of the
22449 // pmullw, so it doesn't matter what's in the high byte of each 16-bit element.
22451 const int ShufMask[] = {0, -1, 1, -1, 2, -1, 3, -1,
22452 4, -1, 5, -1, 6, -1, 7, -1};
22453 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22454 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22455 ALo = DAG.getBitcast(ExVT, ALo);
22456 BLo = DAG.getBitcast(ExVT, BLo);
22459 // Extract the hi parts and sign extend to i16
22461 if (Subtarget.hasSSE41()) {
22462 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22463 -1, -1, -1, -1, -1, -1, -1, -1};
22464 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22465 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22466 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
22467 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
22469 // We're going to mask off the low byte of each result element of the
22470 // pmullw, so it doesn't matter what's in the high byte of each 16-bit element.
22472 const int ShufMask[] = {8, -1, 9, -1, 10, -1, 11, -1,
22473 12, -1, 13, -1, 14, -1, 15, -1};
22474 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22475 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22476 AHi = DAG.getBitcast(ExVT, AHi);
22477 BHi = DAG.getBitcast(ExVT, BHi);
22480 // Multiply, mask the lower 8bits of the lo/hi results and pack
22481 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22482 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22483 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
22484 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
22485 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22488 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
22489 if (VT == MVT::v4i32) {
22490 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
22491 "Should not custom lower when pmulld is available!");
22493 // Extract the odd parts.
22494 static const int UnpackMask[] = { 1, -1, 3, -1 };
22495 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
22496 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
22498 // Multiply the even parts.
22499 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22500 DAG.getBitcast(MVT::v2i64, A),
22501 DAG.getBitcast(MVT::v2i64, B));
22502 // Now multiply odd parts.
22503 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64,
22504 DAG.getBitcast(MVT::v2i64, Aodds),
22505 DAG.getBitcast(MVT::v2i64, Bodds));
22507 Evens = DAG.getBitcast(VT, Evens);
22508 Odds = DAG.getBitcast(VT, Odds);
22510 // Merge the two vectors back together with a shuffle. This expands into 2 shuffles.
22512 static const int ShufMask[] = { 0, 4, 2, 6 };
22513 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
22516 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
22517 "Only know how to lower V2I64/V4I64/V8I64 multiply");
22519 // PMULDQ returns the 64-bit result of the signed multiplication of the low
22520 // 32 bits. We can lower with this if the sign bits stretch that far.
22521 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22522 DAG.ComputeNumSignBits(B) > 32) {
22523 return DAG.getNode(X86ISD::PMULDQ, dl, VT, A, B);
22526 // Ahi = psrlqi(a, 32);
22527 // Bhi = psrlqi(b, 32);
22529 // AloBlo = pmuludq(a, b);
22530 // AloBhi = pmuludq(a, Bhi);
22531 // AhiBlo = pmuludq(Ahi, b);
22533 // Hi = psllqi(AloBhi + AhiBlo, 32);
22534 // return AloBlo + Hi;
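// Numerically (illustration only): writing A = Ahi*2^32 + Alo and
// B = Bhi*2^32 + Blo, the low 64 bits of A*B are
//   Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)
// because the Ahi*Bhi term is shifted entirely out of a 64-bit result.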
22535 KnownBits AKnown, BKnown;
22536 DAG.computeKnownBits(A, AKnown);
22537 DAG.computeKnownBits(B, BKnown);
22539 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22540 bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero);
22541 bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero);
22543 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22544 bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero);
22545 bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero);
22547 // If DQI is supported we can use PMULLQ, but PMULUDQ is still better if
22548 // the high bits are known to be zero.
22549 if (Subtarget.hasDQI() && (!AHiIsZero || !BHiIsZero))
22552 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22554 // Only multiply lo/hi halves that aren't known to be zero.
22555 SDValue AloBlo = Zero;
22556 if (!ALoIsZero && !BLoIsZero)
22557 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B);
22559 SDValue AloBhi = Zero;
22560 if (!ALoIsZero && !BHiIsZero) {
22561 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22562 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi);
22565 SDValue AhiBlo = Zero;
22566 if (!AHiIsZero && !BLoIsZero) {
22567 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22568 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B);
22571 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22572 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
22574 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
22577 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22578 SelectionDAG &DAG) {
22580 MVT VT = Op.getSimpleValueType();
22582 // Decompose 256-bit ops into smaller 128-bit ops.
22583 if (VT.is256BitVector() && !Subtarget.hasInt256())
22584 return Lower256IntArith(Op, DAG);
22586 // Only i8 vectors should need custom lowering after this.
22587 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22588 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22589 "Unsupported vector type");
22591 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22592 // logical shift down the upper half and pack back to i8.
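// For example (illustrative, MULHU on one i8 lane): 200 * 3 extends to the
// i16 product 600 = 0x0258; shifting right by 8 leaves 0x02, the high byte of
// the full 16-bit product, which is then packed back into the i8 result.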
22593 SDValue A = Op.getOperand(0);
22594 SDValue B = Op.getOperand(1);
22596 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22597 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22598 unsigned Opcode = Op.getOpcode();
22599 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22600 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22602 // For 512-bit vectors, split into 256-bit vectors to allow the
22603 // sign-extension to occur.
22604 if (VT == MVT::v64i8)
22605 return Lower512IntArith(Op, DAG);
22607 // AVX2 implementations - extend xmm subvectors to ymm.
22608 if (Subtarget.hasInt256()) {
22609 unsigned NumElems = VT.getVectorNumElements();
22610 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22611 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22613 if (VT == MVT::v32i8) {
22614 if (Subtarget.canExtendTo512BW()) {
22615 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22616 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22617 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22618 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22619 DAG.getConstant(8, dl, MVT::v32i16));
22620 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22622 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22623 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22624 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22625 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22626 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22627 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22628 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22629 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22630 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22631 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22632 DAG.getConstant(8, dl, MVT::v16i16));
22633 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22634 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22635 DAG.getConstant(8, dl, MVT::v16i16));
22636 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22637 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22638 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22639 16, 17, 18, 19, 20, 21, 22, 23};
22640 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22641 24, 25, 26, 27, 28, 29, 30, 31};
22642 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22643 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
22644 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
22647 assert(VT == MVT::v16i8 && "Unexpected VT");
22649 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22650 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22651 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22652 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22653 DAG.getConstant(8, dl, MVT::v16i16));
22654 // If we have BWI we can use truncate instruction.
22655 if (Subtarget.hasBWI())
22656 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22657 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22658 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
22659 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22662 assert(VT == MVT::v16i8 &&
22663 "Pre-AVX2 support only supports v16i8 multiplication");
22664 MVT ExVT = MVT::v8i16;
22665 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
22667 // Extract the lo parts and zero/sign extend to i16.
22669 if (Subtarget.hasSSE41()) {
22670 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
22671 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
22673 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22674 -1, 4, -1, 5, -1, 6, -1, 7};
22675 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22676 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22677 ALo = DAG.getBitcast(ExVT, ALo);
22678 BLo = DAG.getBitcast(ExVT, BLo);
22679 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
22680 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
22683 // Extract the hi parts and zero/sign extend to i16.
22685 if (Subtarget.hasSSE41()) {
22686 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22687 -1, -1, -1, -1, -1, -1, -1, -1};
22688 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22689 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22690 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
22691 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
22693 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22694 -1, 12, -1, 13, -1, 14, -1, 15};
22695 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22696 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22697 AHi = DAG.getBitcast(ExVT, AHi);
22698 BHi = DAG.getBitcast(ExVT, BHi);
22699 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
22700 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
22703 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
22704 // pack back to v16i8.
22705 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22706 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22707 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22708 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
22709 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22712 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22713 assert(Subtarget.isTargetWin64() && "Unexpected target");
22714 EVT VT = Op.getValueType();
22715 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22716 "Unexpected return type for lowering");
22720 switch (Op->getOpcode()) {
22721 default: llvm_unreachable("Unexpected request for libcall!");
22722 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22723 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22724 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22725 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22726 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
22727 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
22731 SDValue InChain = DAG.getEntryNode();
22733 TargetLowering::ArgListTy Args;
22734 TargetLowering::ArgListEntry Entry;
22735 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22736 EVT ArgVT = Op->getOperand(i).getValueType();
22737 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22738 "Unexpected argument type for lowering");
22739 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22740 Entry.Node = StackPtr;
22741 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22742 MachinePointerInfo(), /* Alignment = */ 16);
22743 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22744 Entry.Ty = PointerType::get(ArgTy,0);
22745 Entry.IsSExt = false;
22746 Entry.IsZExt = false;
22747 Args.push_back(Entry);
22750 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22751 getPointerTy(DAG.getDataLayout()));
22753 TargetLowering::CallLoweringInfo CLI(DAG);
22754 CLI.setDebugLoc(dl)
22757 getLibcallCallingConv(LC),
22758 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
22761 .setSExtResult(isSigned)
22762 .setZExtResult(!isSigned);
22764 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
22765 return DAG.getBitcast(VT, CallInfo.first);
22768 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22769 SelectionDAG &DAG) {
22770 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
22771 MVT VT = Op0.getSimpleValueType();
22774 // Decompose 256-bit ops into smaller 128-bit ops.
22775 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22776 unsigned Opcode = Op.getOpcode();
22777 unsigned NumElems = VT.getVectorNumElements();
22778 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22779 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22780 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22781 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22782 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22783 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22784 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
22786 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
22787 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
22789 return DAG.getMergeValues(Ops, dl);
22792 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22793 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22794 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22796 int NumElts = VT.getVectorNumElements();
22798 // PMULxD operations multiply each even value (starting at 0) of LHS with
22799 // the related value of RHS and produce a widened result.
22800 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22801 // => <2 x i64> <ae|cg>
22803 // In other words, to have all the results, we need to perform two PMULxD:
22804 // 1. one with the even values.
22805 // 2. one with the odd values.
22806 // To achieve #2, we need to place the odd values at even positions.
22808 // Place the odd value at an even position (basically, shift all values 1
22809 // step to the left):
22810 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22811 // <a|b|c|d> => <b|undef|d|undef>
22812 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22813 makeArrayRef(&Mask[0], NumElts));
22814 // <e|f|g|h> => <f|undef|h|undef>
22815 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22816 makeArrayRef(&Mask[0], NumElts));
22818 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
22820 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
22821 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
22823 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22824 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22825 // => <2 x i64> <ae|cg>
22826 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22827 DAG.getBitcast(MulVT, Op0),
22828 DAG.getBitcast(MulVT, Op1)));
22829 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22830 // => <2 x i64> <bf|dh>
22831 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT,
22832 DAG.getBitcast(MulVT, Odd0),
22833 DAG.getBitcast(MulVT, Odd1)));
22835 // Shuffle it back into the right order.
22836 SmallVector<int, 16> HighMask(NumElts);
22837 SmallVector<int, 16> LowMask(NumElts);
22838 for (int i = 0; i != NumElts; ++i) {
22839 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
22840 LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
22843 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22844 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
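// For example (illustrative, NumElts = 4): HighMask = {1, 5, 3, 7} and
// LowMask = {0, 4, 2, 6}, which interleave the per-element high and low
// 32-bit halves of Mul1 = <ae|cg> and Mul2 = <bf|dh> into
// <hi(ae)|hi(bf)|hi(cg)|hi(dh)> and <lo(ae)|lo(bf)|lo(cg)|lo(dh)>.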
22846 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
22847 // unsigned multiply.
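// The fixup follows from a small identity (illustrative derivation): a 32-bit
// lane read as signed is a_s = a_u - 2^32 * (a < 0), so the high half of the
// signed product differs from the unsigned one by (a < 0 ? b : 0) +
// (b < 0 ? a : 0) (mod 2^32), exactly what T1 + T2 computes with the
// arithmetic shifts below.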
22848 if (IsSigned && !Subtarget.hasSSE41()) {
22849 SDValue ShAmt = DAG.getConstant(
22851 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22852 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22853 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22854 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22855 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22857 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
22858 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
22861 // The first result of MUL_LOHI is actually the low value, followed by the high value.
22863 SDValue Ops[] = {Lows, Highs};
22864 return DAG.getMergeValues(Ops, dl);
22867 // Return true if the required (according to Opcode) shift-imm form is natively
22868 // supported by the Subtarget
22869 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
22871 if (VT.getScalarSizeInBits() < 16)
22874 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
22875 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
22878 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22879 (VT.is256BitVector() && Subtarget.hasInt256());
22881 bool AShift = LShift && (Subtarget.hasAVX512() ||
22882 (VT != MVT::v2i64 && VT != MVT::v4i64));
22883 return (Opcode == ISD::SRA) ? AShift : LShift;
22886 // The shift amount is a variable, but it is the same for all vector lanes.
22887 // These instructions are defined together with shift-immediate.
22888 static
22889 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
22890 unsigned Opcode) {
22891 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
22892 }
22894 // Return true if the required (according to Opcode) variable-shift form is
22895 // natively supported by the Subtarget
22896 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
22897 unsigned Opcode) {
22899 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
22900 return false;
22902 // vXi16 supported only on AVX-512, BWI
22903 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
22904 return false;
22906 if (Subtarget.hasAVX512())
22907 return true;
22909 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22910 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22911 return (Opcode == ISD::SRA) ? AShift : LShift;
22912 }
22914 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22915 const X86Subtarget &Subtarget) {
22916 MVT VT = Op.getSimpleValueType();
22917 SDLoc dl(Op);
22918 SDValue R = Op.getOperand(0);
22919 SDValue Amt = Op.getOperand(1);
22921 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22922 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
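// Emulate a 64-bit arithmetic shift right: SSE2/AVX2 have no vector SRA for
// i64, so the lambda below works on the i32 halves of each lane and
// recombines the partially shifted results with a shuffle.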
22924 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22925 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22926 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22927 SDValue Ex = DAG.getBitcast(ExVT, R);
22929 // ashr(R, 63) === cmp_slt(R, 0)
22930 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22931 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22932 "Unsupported PCMPGT op");
22933 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22934 getZeroVector(VT, Subtarget, DAG, dl), R);
22937 if (ShiftAmt >= 32) {
22938 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22939 SDValue Upper =
22940 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22941 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22942 ShiftAmt - 32, DAG);
22943 if (VT == MVT::v2i64)
22944 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22945 if (VT == MVT::v4i64)
22946 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22947 {9, 1, 11, 3, 13, 5, 15, 7});
22948 } else {
22949 // SRA upper i32, SHL whole i64 and select lower i32.
22950 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22951 ShiftAmt, DAG);
22952 SDValue Lower =
22953 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22954 Lower = DAG.getBitcast(ExVT, Lower);
22955 if (VT == MVT::v2i64)
22956 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22957 if (VT == MVT::v4i64)
22958 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22959 {8, 1, 10, 3, 12, 5, 14, 7});
22960 }
22961 return DAG.getBitcast(VT, Ex);
22962 };
22964 // Optimize shl/srl/sra with constant shift amount.
22965 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22966 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22967 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22969 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22970 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22972 // i64 SRA needs to be performed as partial shifts.
22973 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22974 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22975 Op.getOpcode() == ISD::SRA)
22976 return ArithmeticShiftRight64(ShiftAmt);
22978 if (VT == MVT::v16i8 ||
22979 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22980 VT == MVT::v64i8) {
22981 unsigned NumElts = VT.getVectorNumElements();
22982 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22984 // Simple i8 add case
22985 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22986 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22988 // ashr(R, 7) === cmp_slt(R, 0)
22989 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22990 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22991 if (VT.is512BitVector()) {
22992 assert(VT == MVT::v64i8 && "Unexpected element type!");
22993 SDValue CMP = DAG.getNode(X86ISD::CMPM, dl, MVT::v64i1, Zeros, R,
22994 DAG.getConstant(6, dl, MVT::i8));
22995 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22996 }
22997 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22998 }
23000 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
23001 if (VT == MVT::v16i8 && Subtarget.hasXOP())
23002 return SDValue();
23004 if (Op.getOpcode() == ISD::SHL) {
23005 // Make a large shift.
23006 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
23007 R, ShiftAmt, DAG);
23008 SHL = DAG.getBitcast(VT, SHL);
23009 // Zero out the rightmost bits.
23010 return DAG.getNode(ISD::AND, dl, VT, SHL,
23011 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
23013 if (Op.getOpcode() == ISD::SRL) {
23014 // Make a large shift.
23015 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
23016 R, ShiftAmt, DAG);
23017 SRL = DAG.getBitcast(VT, SRL);
23018 // Zero out the leftmost bits.
23019 return DAG.getNode(ISD::AND, dl, VT, SRL,
23020 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
23022 if (Op.getOpcode() == ISD::SRA) {
23023 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
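// Mask holds the sign bit at its post-shift position; XOR then SUB with it
// turns the logical shift result into an arithmetic (sign-extended) one.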
23024 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23026 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
23027 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
23028 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
23029 return Res;
23030 }
23031 llvm_unreachable("Unknown shift opcode.");
23032 }
23033 }
23034 }
23036 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23037 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
23038 if (!Subtarget.hasXOP() &&
23039 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
23040 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
23042 // AVX1 targets maybe extracting a 128-bit vector from a 256-bit constant.
23043 unsigned SubVectorScale = 1;
23044 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
23045 SubVectorScale =
23046 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
23047 Amt = Amt.getOperand(0);
23048 }
23050 // Peek through any splat that was introduced for i64 shift vectorization.
23051 int SplatIndex = -1;
23052 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
23053 if (SVN->isSplat()) {
23054 SplatIndex = SVN->getSplatIndex();
23055 Amt = Amt.getOperand(0);
23056 assert(SplatIndex < (int)VT.getVectorNumElements() &&
23057 "Splat shuffle referencing second operand");
23060 if (Amt.getOpcode() != ISD::BITCAST ||
23061 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
23062 return SDValue();
23064 Amt = Amt.getOperand(0);
23065 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23066 (SubVectorScale * VT.getVectorNumElements());
23067 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
23068 uint64_t ShiftAmt = 0;
23069 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
23070 for (unsigned i = 0; i != Ratio; ++i) {
23071 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
23072 if (!C)
23073 return SDValue();
23075 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
23076 }
23078 // Check remaining shift amounts (if not a splat).
23079 if (SplatIndex < 0) {
23080 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23081 uint64_t ShAmt = 0;
23082 for (unsigned j = 0; j != Ratio; ++j) {
23083 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
23084 if (!C)
23085 return SDValue();
23087 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
23088 }
23089 if (ShAmt != ShiftAmt)
23090 return SDValue();
23091 }
23092 }
23094 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
23095 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
23097 if (Op.getOpcode() == ISD::SRA)
23098 return ArithmeticShiftRight64(ShiftAmt);
23099 }
23101 return SDValue();
23102 }
23104 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
23105 const X86Subtarget &Subtarget) {
23106 MVT VT = Op.getSimpleValueType();
23107 SDLoc dl(Op);
23108 SDValue R = Op.getOperand(0);
23109 SDValue Amt = Op.getOperand(1);
23111 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
23112 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
23114 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
23115 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
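// Look for a shift amount that is the same for every lane; if one is found
// it can be fed to the target shift nodes as a single scalar operand.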
23117 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
23118 SDValue BaseShAmt;
23119 MVT EltVT = VT.getVectorElementType();
23121 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
23122 // Check if this build_vector node is doing a splat.
23123 // If so, then set BaseShAmt equal to the splat value.
23124 BaseShAmt = BV->getSplatValue();
23125 if (BaseShAmt && BaseShAmt.isUndef())
23126 BaseShAmt = SDValue();
23127 } else {
23128 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
23129 Amt = Amt.getOperand(0);
23131 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
23132 if (SVN && SVN->isSplat()) {
23133 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
23134 SDValue InVec = Amt.getOperand(0);
23135 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
23136 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
23137 "Unexpected shuffle index found!");
23138 BaseShAmt = InVec.getOperand(SplatIdx);
23139 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
23140 if (ConstantSDNode *C =
23141 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
23142 if (C->getZExtValue() == SplatIdx)
23143 BaseShAmt = InVec.getOperand(1);
23144 }
23145 }
23147 if (!BaseShAmt)
23148 // Avoid introducing an extract element from a shuffle.
23149 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
23150 DAG.getIntPtrConstant(SplatIdx, dl));
23151 }
23152 }
23154 if (BaseShAmt.getNode()) {
23155 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
23156 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
23157 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
23158 else if (EltVT.bitsLT(MVT::i32))
23159 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
23161 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
23162 }
23163 }
23165 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
23166 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
23167 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
23168 Amt = Amt.getOperand(0);
23169 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
23170 VT.getVectorNumElements();
23171 std::vector<SDValue> Vals(Ratio);
23172 for (unsigned i = 0; i != Ratio; ++i)
23173 Vals[i] = Amt.getOperand(i);
23174 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
23175 for (unsigned j = 0; j != Ratio; ++j)
23176 if (Vals[j] != Amt.getOperand(i + j))
23177 return SDValue();
23178 }
23180 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
23181 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
23182 }
23183 return SDValue();
23184 }
23186 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
23187 SelectionDAG &DAG) {
23188 MVT VT = Op.getSimpleValueType();
23189 SDLoc dl(Op);
23190 SDValue R = Op.getOperand(0);
23191 SDValue Amt = Op.getOperand(1);
23192 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23194 assert(VT.isVector() && "Custom lowering only for vector shifts!");
23195 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
23197 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
23198 return V;
23200 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
23201 return V;
23203 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
23204 return Op;
23206 // XOP has 128-bit variable logical/arithmetic shifts.
23207 // +ve/-ve Amt = shift left/right.
23208 if (Subtarget.hasXOP() &&
23209 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
23210 VT == MVT::v8i16 || VT == MVT::v16i8)) {
23211 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
23212 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
23213 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
23215 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
23216 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
23217 if (Op.getOpcode() == ISD::SRA)
23218 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
23221 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
23222 // shifts per-lane and then shuffle the partial results back together.
23223 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
23224 // Splat the shift amounts so the scalar shifts above will catch it.
23225 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
23226 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
23227 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
23228 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
23229 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
23232 // i64 vector arithmetic shift can be emulated with the transform:
23233 // M = lshr(SIGN_MASK, Amt)
23234 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
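// The logical shift clears the sign bits; XORing with M puts them back at
// their shifted position and the SUB propagates them upwards, which is
// exactly the sign extension an arithmetic shift would produce.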
23235 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
23236 Op.getOpcode() == ISD::SRA) {
23237 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
23238 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
23239 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
23240 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
23241 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
23242 return R;
23243 }
23245 // If possible, lower this packed shift into a vector multiply instead of
23246 // expanding it into a sequence of scalar shifts.
23247 // Do this only if the vector shift count is a constant build_vector.
23248 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
23249 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
23250 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
23251 SmallVector<SDValue, 8> Elts;
23252 MVT SVT = VT.getVectorElementType();
23253 unsigned SVTBits = SVT.getSizeInBits();
23254 APInt One(SVTBits, 1);
23255 unsigned NumElems = VT.getVectorNumElements();
23257 for (unsigned i=0; i !=NumElems; ++i) {
23258 SDValue Op = Amt->getOperand(i);
23259 if (Op->isUndef()) {
23260 Elts.push_back(Op);
23261 continue;
23262 }
23264 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
23265 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
23266 uint64_t ShAmt = C.getZExtValue();
23267 if (ShAmt >= SVTBits) {
23268 Elts.push_back(DAG.getUNDEF(SVT));
23269 continue;
23270 }
23271 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
23272 }
23273 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
23274 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
23277 // Lower SHL with variable shift amount.
23278 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
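// Build per-lane powers of two without a variable shift: place the amount in
// the exponent field of an IEEE-754 float (bias 127 == 0x3f800000), convert
// back to integer to get 1 << Amt, and multiply by it.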
23279 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
23281 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
23282 DAG.getConstant(0x3f800000U, dl, VT));
23283 Op = DAG.getBitcast(MVT::v4f32, Op);
23284 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
23285 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
23288 // If possible, lower this shift as a sequence of two shifts by
23289 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
23291 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
23293 // Could be rewritten as:
23294 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
23296 // The advantage is that the two shifts from the example would be
23297 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
23298 // the vector shift into four scalar shifts plus four pairs of vector
23299 // insert/extract.
23300 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
23301 bool UseMOVSD = false;
23302 bool CanBeSimplified;
23303 // The splat value for the first packed shift (the 'X' from the example).
23304 SDValue Amt1 = Amt->getOperand(0);
23305 // The splat value for the second packed shift (the 'Y' from the example).
23306 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
23308 // See if it is possible to replace this node with a sequence of
23309 // two shifts followed by a MOVSS/MOVSD/PBLEND.
23310 if (VT == MVT::v4i32) {
23311 // Check if it is legal to use a MOVSS.
23312 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
23313 Amt2 == Amt->getOperand(3);
23314 if (!CanBeSimplified) {
23315 // Otherwise, check if we can still simplify this node using a MOVSD.
23316 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
23317 Amt->getOperand(2) == Amt->getOperand(3);
23318 UseMOVSD = true;
23319 Amt2 = Amt->getOperand(2);
23320 }
23321 } else {
23322 // Do similar checks for the case where the machine value type
23323 // is MVT::v8i16.
23324 CanBeSimplified = Amt1 == Amt->getOperand(1);
23325 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
23326 CanBeSimplified = Amt2 == Amt->getOperand(i);
23328 if (!CanBeSimplified) {
23329 UseMOVSD = true;
23330 CanBeSimplified = true;
23331 Amt2 = Amt->getOperand(4);
23332 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
23333 CanBeSimplified = Amt1 == Amt->getOperand(i);
23334 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
23335 CanBeSimplified = Amt2 == Amt->getOperand(j);
23336 }
23337 }
23339 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
23340 isa<ConstantSDNode>(Amt2)) {
23341 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
23342 SDValue Splat1 =
23343 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
23344 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
23345 SDValue Splat2 =
23346 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
23347 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
23348 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
23349 SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
23350 if (UseMOVSD)
23351 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23352 BitCast2, {0, 1, 6, 7}));
23353 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
23354 BitCast2, {0, 5, 6, 7}));
23358 // v4i32 Non Uniform Shifts.
23359 // If the shift amount is constant we can shift each lane using the SSE2
23360 // immediate shifts, else we need to zero-extend each lane to the lower i64
23361 // and shift using the SSE2 variable shifts.
23362 // The separate results can then be blended together.
23363 if (VT == MVT::v4i32) {
23364 unsigned Opc = Op.getOpcode();
23365 SDValue Amt0, Amt1, Amt2, Amt3;
23366 if (ConstantAmt) {
23367 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
23368 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
23369 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
23370 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
23371 } else {
23372 // ISD::SHL is handled above but we include it here for completeness.
23373 switch (Opc) {
23374 default:
23375 llvm_unreachable("Unknown target vector shift node");
23376 case ISD::SHL:
23377 Opc = X86ISD::VSHL;
23378 break;
23379 case ISD::SRL:
23380 Opc = X86ISD::VSRL;
23381 break;
23382 case ISD::SRA:
23383 Opc = X86ISD::VSRA;
23384 break;
23385 }
23386 // The SSE2 shifts use the lower i64 as the same shift amount for
23387 // all lanes and the upper i64 is ignored. These shuffle masks
23388 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
23389 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23390 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
23391 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
23392 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
23393 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
23394 }
23396 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
23397 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
23398 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
23399 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
23400 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
23401 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
23402 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
23405 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
23406 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
23407 // make the existing SSE solution better.
23408 // NOTE: We honor preferred vector width before promoting to 512-bits.
23409 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
23410 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) ||
23411 (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) ||
23412 (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) ||
23413 (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) {
23414 assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) &&
23415 "Unexpected vector type");
23416 MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32;
23417 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
23418 unsigned ExtOpc =
23419 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
23420 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
23421 Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt);
23422 return DAG.getNode(ISD::TRUNCATE, dl, VT,
23423 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
23426 if (VT == MVT::v16i8 ||
23427 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
23428 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
23429 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
23430 unsigned ShiftOpcode = Op->getOpcode();
23432 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
23433 if (VT.is512BitVector()) {
23434 // On AVX512BW targets we make use of the fact that VSELECT lowers
23435 // to a masked blend which selects bytes based just on the sign bit
23436 // extracted to a mask.
23437 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
23438 V0 = DAG.getBitcast(VT, V0);
23439 V1 = DAG.getBitcast(VT, V1);
23440 Sel = DAG.getBitcast(VT, Sel);
23441 Sel = DAG.getNode(X86ISD::CMPM, dl, MaskVT,
23442 DAG.getConstant(0, dl, VT), Sel,
23443 DAG.getConstant(6, dl, MVT::i8));
23444 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23445 } else if (Subtarget.hasSSE41()) {
23446 // On SSE41 targets we make use of the fact that VSELECT lowers
23447 // to PBLENDVB which selects bytes based just on the sign bit.
23448 V0 = DAG.getBitcast(VT, V0);
23449 V1 = DAG.getBitcast(VT, V1);
23450 Sel = DAG.getBitcast(VT, Sel);
23451 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
23453 // On pre-SSE41 targets we test for the sign bit by comparing to
23454 // zero - a negative value will set all bits of the lanes to true
23455 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
23456 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
23457 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
23458 return DAG.getSelect(dl, SelVT, C, V0, V1);
23461 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
23462 // We can safely do this using i16 shifts as we're only interested in
23463 // the 3 lower bits of each byte.
23464 Amt = DAG.getBitcast(ExtVT, Amt);
23465 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
23466 Amt = DAG.getBitcast(VT, Amt);
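// Apply the shift in power-of-two steps (4, then 2, then 1). After the
// shift left by 5 the three interesting bits of each byte amount sit at the
// top, so each sign-bit select decides whether the corresponding partial
// shift is kept; adding Amt to itself exposes the next bit.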
23468 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
23469 // r = VSELECT(r, shift(r, 4), a);
23470 SDValue M =
23471 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23472 R = SignBitSelect(VT, Amt, M, R);
23475 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23477 // r = VSELECT(r, shift(r, 2), a);
23478 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23479 R = SignBitSelect(VT, Amt, M, R);
23482 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23484 // return VSELECT(r, shift(r, 1), a);
23485 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23486 R = SignBitSelect(VT, Amt, M, R);
23487 return R;
23488 }
23490 if (Op->getOpcode() == ISD::SRA) {
23491 // For SRA we need to unpack each byte to the higher byte of a i16 vector
23492 // so we can correctly sign extend. We don't care what happens to the
23493 // lower byte.
23494 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
23495 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
23496 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
23497 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
23498 ALo = DAG.getBitcast(ExtVT, ALo);
23499 AHi = DAG.getBitcast(ExtVT, AHi);
23500 RLo = DAG.getBitcast(ExtVT, RLo);
23501 RHi = DAG.getBitcast(ExtVT, RHi);
23503 // r = VSELECT(r, shift(r, 4), a);
23504 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23505 DAG.getConstant(4, dl, ExtVT));
23506 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23507 DAG.getConstant(4, dl, ExtVT));
23508 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23509 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23512 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23513 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23515 // r = VSELECT(r, shift(r, 2), a);
23516 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23517 DAG.getConstant(2, dl, ExtVT));
23518 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23519 DAG.getConstant(2, dl, ExtVT));
23520 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23521 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23524 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
23525 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
23527 // r = VSELECT(r, shift(r, 1), a);
23528 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
23529 DAG.getConstant(1, dl, ExtVT));
23530 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
23531 DAG.getConstant(1, dl, ExtVT));
23532 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
23533 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
23535 // Logical shift the result back to the lower byte, leaving a zero upper
23536 // byte meaning that we can safely pack with PACKUSWB.
23538 RLo =
23539 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
23540 RHi =
23541 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
23542 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
23546 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
23547 MVT ExtVT = MVT::v8i32;
23548 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
23549 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
23550 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
23551 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
23552 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
23553 ALo = DAG.getBitcast(ExtVT, ALo);
23554 AHi = DAG.getBitcast(ExtVT, AHi);
23555 RLo = DAG.getBitcast(ExtVT, RLo);
23556 RHi = DAG.getBitcast(ExtVT, RHi);
23557 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
23558 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
23559 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
23560 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
23561 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
23564 if (VT == MVT::v8i16) {
23565 unsigned ShiftOpcode = Op->getOpcode();
23567 // If we have a constant shift amount, the non-SSE41 path is best as
23568 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
23569 bool UseSSE41 = Subtarget.hasSSE41() &&
23570 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
23572 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
23573 // On SSE41 targets we make use of the fact that VSELECT lowers
23574 // to PBLENDVB which selects bytes based just on the sign bit.
23575 if (UseSSE41) {
23576 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
23577 V0 = DAG.getBitcast(ExtVT, V0);
23578 V1 = DAG.getBitcast(ExtVT, V1);
23579 Sel = DAG.getBitcast(ExtVT, Sel);
23580 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
23581 }
23582 // On pre-SSE41 targets we splat the sign bit - a negative value will
23583 // set all bits of the lanes to true and VSELECT uses that in
23584 // its OR(AND(V0,C),AND(V1,~C)) lowering.
23585 SDValue C =
23586 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
23587 return DAG.getSelect(dl, VT, C, V0, V1);
23590 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
23591 if (UseSSE41) {
23592 // On SSE41 targets we need to replicate the shift mask in both
23593 // bytes for PBLENDVB.
23594 Amt = DAG.getNode(
23595 ISD::OR, dl, VT,
23596 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
23597 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
23598 } else {
23599 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
23600 }
23602 // r = VSELECT(r, shift(r, 8), a);
23603 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
23604 R = SignBitSelect(Amt, M, R);
23607 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23609 // r = VSELECT(r, shift(r, 4), a);
23610 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
23611 R = SignBitSelect(Amt, M, R);
23614 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23616 // r = VSELECT(r, shift(r, 2), a);
23617 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
23618 R = SignBitSelect(Amt, M, R);
23621 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
23623 // return VSELECT(r, shift(r, 1), a);
23624 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
23625 R = SignBitSelect(Amt, M, R);
23626 return R;
23627 }
23629 // Decompose 256-bit shifts into smaller 128-bit shifts.
23630 if (VT.is256BitVector())
23631 return Lower256IntArith(Op, DAG);
23633 return SDValue();
23634 }
23636 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
23637 SelectionDAG &DAG) {
23638 MVT VT = Op.getSimpleValueType();
23639 SDLoc DL(Op);
23640 SDValue R = Op.getOperand(0);
23641 SDValue Amt = Op.getOperand(1);
23642 unsigned Opcode = Op.getOpcode();
23643 unsigned EltSizeInBits = VT.getScalarSizeInBits();
23645 if (Subtarget.hasAVX512()) {
23646 // Attempt to rotate by immediate.
23647 APInt UndefElts;
23648 SmallVector<APInt, 16> EltBits;
23649 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
23650 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
23651 return EltBits[0] == V;
23652 })) {
23653 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
23654 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
23655 return DAG.getNode(Op, DL, VT, R,
23656 DAG.getConstant(RotateAmt, DL, MVT::i8));
23657 }
23658 }
23660 // Else, fall-back on VPROLV/VPRORV.
23661 return Op;
23662 }
23664 assert(VT.isVector() && "Custom lowering only for vector rotates!");
23665 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
23666 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
23668 // XOP has 128-bit vector variable + immediate rotates.
23669 // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
23671 // Split 256-bit integers.
23672 if (VT.is256BitVector())
23673 return Lower256IntArith(Op, DAG);
23675 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
23677 // Attempt to rotate by immediate.
23678 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
23679 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
23680 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
23681 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
23682 return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
23683 DAG.getConstant(RotateAmt, DL, MVT::i8));
23684 }
23685 }
23687 // Use general rotate by variable (per-element).
23688 return Op;
23689 }
23691 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
23692 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
23693 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
23694 // looks for this combo and may remove the "setcc" instruction if the "setcc"
23695 // has only one use.
23696 SDNode *N = Op.getNode();
23697 SDValue LHS = N->getOperand(0);
23698 SDValue RHS = N->getOperand(1);
23699 unsigned BaseOp = 0;
23700 X86::CondCode Cond;
23701 SDLoc DL(Op);
23702 switch (Op.getOpcode()) {
23703 default: llvm_unreachable("Unknown ovf instruction!");
23704 case ISD::SADDO:
23705 // A subtract of one will be selected as a INC. Note that INC doesn't
23706 // set CF, so we can't do this for UADDO.
23707 if (isOneConstant(RHS)) {
23708 BaseOp = X86ISD::INC;
23709 Cond = X86::COND_O;
23710 break;
23711 }
23712 BaseOp = X86ISD::ADD;
23713 Cond = X86::COND_O;
23714 break;
23715 case ISD::UADDO:
23716 BaseOp = X86ISD::ADD;
23717 Cond = X86::COND_B;
23718 break;
23719 case ISD::SSUBO:
23720 // A subtract of one will be selected as a DEC. Note that DEC doesn't
23721 // set CF, so we can't do this for USUBO.
23722 if (isOneConstant(RHS)) {
23723 BaseOp = X86ISD::DEC;
23724 Cond = X86::COND_O;
23725 break;
23726 }
23727 BaseOp = X86ISD::SUB;
23728 Cond = X86::COND_O;
23729 break;
23730 case ISD::USUBO:
23731 BaseOp = X86ISD::SUB;
23732 Cond = X86::COND_B;
23733 break;
23734 case ISD::SMULO:
23735 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
23736 Cond = X86::COND_O;
23737 break;
23738 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
23739 if (N->getValueType(0) == MVT::i8) {
23740 BaseOp = X86ISD::UMUL8;
23741 Cond = X86::COND_O;
23742 break;
23743 }
23744 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
23745 MVT::i32);
23746 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
23748 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
23750 if (N->getValueType(1) == MVT::i1)
23751 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23753 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23757 // Also sets EFLAGS.
23758 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
23759 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
23761 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
23763 if (N->getValueType(1) == MVT::i1)
23764 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23766 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23769 /// Returns true if the operand type is exactly twice the native width, and
23770 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
23771 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
23772 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
23773 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
23774 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
23776 if (OpWidth == 64)
23777 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
23778 else if (OpWidth == 128)
23779 return Subtarget.hasCmpxchg16b();
23781 return false;
23782 }
23784 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
23785 return needsCmpXchgNb(SI->getValueOperand()->getType());
23788 // Note: this turns large loads into lock cmpxchg8b/16b.
23789 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
23790 TargetLowering::AtomicExpansionKind
23791 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
23792 auto PTy = cast<PointerType>(LI->getPointerOperandType());
23793 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
23794 : AtomicExpansionKind::None;
23797 TargetLowering::AtomicExpansionKind
23798 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
23799 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23800 Type *MemType = AI->getType();
23802 // If the operand is too big, we must see if cmpxchg8/16b is available
23803 // and default to library calls otherwise.
23804 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
23805 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
23806 : AtomicExpansionKind::None;
23807 }
23809 AtomicRMWInst::BinOp Op = AI->getOperation();
23810 switch (Op) {
23811 default:
23812 llvm_unreachable("Unknown atomic operation");
23813 case AtomicRMWInst::Xchg:
23814 case AtomicRMWInst::Add:
23815 case AtomicRMWInst::Sub:
23816 // It's better to use xadd, xsub or xchg for these in all cases.
23817 return AtomicExpansionKind::None;
23818 case AtomicRMWInst::Or:
23819 case AtomicRMWInst::And:
23820 case AtomicRMWInst::Xor:
23821 // If the atomicrmw's result isn't actually used, we can just add a "lock"
23822 // prefix to a normal instruction for these operations.
23823 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
23824 : AtomicExpansionKind::None;
23825 case AtomicRMWInst::Nand:
23826 case AtomicRMWInst::Max:
23827 case AtomicRMWInst::Min:
23828 case AtomicRMWInst::UMax:
23829 case AtomicRMWInst::UMin:
23830 // These always require a non-trivial set of data operations on x86. We must
23831 // use a cmpxchg loop.
23832 return AtomicExpansionKind::CmpXChg;
23833 }
23834 }
23836 LoadInst *
23837 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
23838 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
23839 Type *MemType = AI->getType();
23840 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
23841 // there is no benefit in turning such RMWs into loads, and it is actually
23842 // harmful as it introduces a mfence.
23843 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
23844 return nullptr;
23846 auto Builder = IRBuilder<>(AI);
23847 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
23848 auto SSID = AI->getSyncScopeID();
23849 // We must restrict the ordering to avoid generating loads with Release or
23850 // ReleaseAcquire orderings.
23851 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
23852 auto Ptr = AI->getPointerOperand();
23854 // Before the load we need a fence. Here is an example lifted from
23855 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
23856 // is required:
23857 // Thread 0:
23858 // x.store(1, relaxed);
23859 // r1 = y.fetch_add(0, release);
23860 // Thread 1:
23861 // y.fetch_add(42, acquire);
23862 // r2 = x.load(relaxed);
23863 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
23864 // lowered to just a load without a fence. A mfence flushes the store buffer,
23865 // making the optimization clearly correct.
23866 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
23867 // otherwise, we might be able to be more aggressive on relaxed idempotent
23868 // rmw. In practice, they do not look useful, so we don't try to be
23869 // especially clever.
23870 if (SSID == SyncScope::SingleThread)
23871 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
23872 // the IR level, so we must wrap it in an intrinsic.
23873 return nullptr;
23875 if (!Subtarget.hasMFence())
23876 // FIXME: it might make sense to use a locked operation here but on a
23877 // different cache-line to prevent cache-line bouncing. In practice it
23878 // is probably a small win, and x86 processors without mfence are rare
23879 // enough that we do not bother.
23880 return nullptr;
23882 Function *MFence =
23883 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
23884 Builder.CreateCall(MFence, {});
23886 // Finally we can emit the atomic load.
23887 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
23888 AI->getType()->getPrimitiveSizeInBits());
23889 Loaded->setAtomic(Order, SSID);
23890 AI->replaceAllUsesWith(Loaded);
23891 AI->eraseFromParent();
23892 return Loaded;
23893 }
23895 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
23896 SelectionDAG &DAG) {
23897 SDLoc dl(Op);
23898 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
23899 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
23900 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
23901 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
23903 // The only fence that needs an instruction is a sequentially-consistent
23904 // cross-thread fence.
23905 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
23906 FenceSSID == SyncScope::System) {
23907 if (Subtarget.hasMFence())
23908 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
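// Without MFENCE fall back to an idempotent locked operation on the stack
// (effectively 'lock or dword ptr [esp], 0'), which also acts as a full
// memory barrier.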
23910 SDValue Chain = Op.getOperand(0);
23911 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
23912 SDValue Ops[] = {
23913 DAG.getRegister(X86::ESP, MVT::i32), // Base
23914 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
23915 DAG.getRegister(0, MVT::i32), // Index
23916 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
23917 DAG.getRegister(0, MVT::i32), // Segment.
23918 Zero,
23919 Chain
23920 };
23921 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23922 return SDValue(Res, 0);
23925 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23926 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23927 }
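// Lower ATOMIC_CMP_SWAP: pin the expected value in the accumulator register
// (AL/AX/EAX/RAX), emit LCMPXCHG, then read the old value back from that
// register and the success flag from EFLAGS.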
23929 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23930 SelectionDAG &DAG) {
23931 MVT T = Op.getSimpleValueType();
23932 SDLoc DL(Op);
23933 unsigned Reg = 0;
23934 unsigned size = 0;
23935 switch(T.SimpleTy) {
23936 default: llvm_unreachable("Invalid value type!");
23937 case MVT::i8: Reg = X86::AL; size = 1; break;
23938 case MVT::i16: Reg = X86::AX; size = 2; break;
23939 case MVT::i32: Reg = X86::EAX; size = 4; break;
23940 case MVT::i64:
23941 assert(Subtarget.is64Bit() && "Node not type legal!");
23942 Reg = X86::RAX; size = 8;
23943 break;
23944 }
23945 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23946 Op.getOperand(2), SDValue());
23947 SDValue Ops[] = { cpIn.getValue(0),
23948 Op.getOperand(1),
23949 Op.getOperand(3),
23950 DAG.getTargetConstant(size, DL, MVT::i8),
23951 cpIn.getValue(1) };
23952 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23953 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23954 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23955 Ops, T, MMO);
23957 SDValue cpOut =
23958 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23959 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23960 MVT::i32, cpOut.getValue(2));
23961 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23963 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23964 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23965 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23966 return SDValue();
23967 }
23969 // Create MOVMSKB, taking into account whether we need to split for AVX1.
23970 static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG,
23971 const X86Subtarget &Subtarget) {
23972 MVT InVT = V.getSimpleValueType();
23974 if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) {
23975 SDValue Lo, Hi;
23976 std::tie(Lo, Hi) = DAG.SplitVector(V, DL);
23977 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
23978 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
23979 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
23980 DAG.getConstant(16, DL, MVT::i8));
23981 return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
23984 return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
23987 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23988 SelectionDAG &DAG) {
23989 SDValue Src = Op.getOperand(0);
23990 MVT SrcVT = Src.getSimpleValueType();
23991 MVT DstVT = Op.getSimpleValueType();
23993 // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each
23994 // half to v32i1 and concatenating the result.
23995 if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) {
23996 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
23997 assert(Subtarget.hasBWI() && "Expected BWI target");
23998 SDLoc dl(Op);
23999 SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24000 DAG.getIntPtrConstant(0, dl));
24001 Lo = DAG.getBitcast(MVT::v32i1, Lo);
24002 SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24003 DAG.getIntPtrConstant(1, dl));
24004 Hi = DAG.getBitcast(MVT::v32i1, Hi);
24005 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
24008 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
24009 if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector())
24010 return Lower512IntUnary(Op, DAG);
24012 // Use MOVMSK for vector to scalar conversion to prevent scalarization.
24013 if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
24014 assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
24015 MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8;
24016 SDLoc DL(Op);
24017 SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT);
24018 V = getPMOVMSKB(DL, V, DAG, Subtarget);
24019 return DAG.getZExtOrTrunc(V, DL, DstVT);
24022 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
24023 SrcVT == MVT::i64) {
24024 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24025 if (DstVT != MVT::f64)
24026 // This conversion needs to be expanded.
24027 return SDValue();
24029 SmallVector<SDValue, 16> Elts;
24030 SDLoc dl(Op);
24031 unsigned NumElts;
24032 MVT SVT;
24033 if (SrcVT.isVector()) {
24034 NumElts = SrcVT.getVectorNumElements();
24035 SVT = SrcVT.getVectorElementType();
24037 // Widen the vector in input in the case of MVT::v2i32.
24038 // Example: from MVT::v2i32 to MVT::v4i32.
24039 for (unsigned i = 0, e = NumElts; i != e; ++i)
24040 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Src,
24041 DAG.getIntPtrConstant(i, dl)));
24042 } else {
24043 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
24044 "Unexpected source type in LowerBITCAST");
24045 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24046 DAG.getIntPtrConstant(0, dl)));
24047 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Src,
24048 DAG.getIntPtrConstant(1, dl)));
24049 SVT = MVT::i32;
24050 NumElts = 2;
24051 }
24052 // Explicitly mark the extra elements as Undef.
24053 Elts.append(NumElts, DAG.getUNDEF(SVT));
24055 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24056 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
24057 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
24058 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
24059 DAG.getIntPtrConstant(0, dl));
24062 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
24063 Subtarget.hasMMX() && "Unexpected custom BITCAST");
24064 assert((DstVT == MVT::i64 ||
24065 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
24066 "Unexpected custom BITCAST");
24067 // i64 <=> MMX conversions are Legal.
24068 if (SrcVT==MVT::i64 && DstVT.isVector())
24069 return Op;
24070 if (DstVT==MVT::i64 && SrcVT.isVector())
24071 return Op;
24072 // MMX <=> MMX conversions are Legal.
24073 if (SrcVT.isVector() && DstVT.isVector())
24074 return Op;
24075 // All other conversions need to be expanded.
24076 return SDValue();
24077 }
24079 /// Compute the horizontal sum of bytes in V for the elements of VT.
24081 /// Requires V to be a byte vector and VT to be an integer vector type with
24082 /// wider elements than V's type. The width of the elements of VT determines
24083 /// how many bytes of V are summed horizontally to produce each element of the
24085 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
24086 const X86Subtarget &Subtarget,
24087 SelectionDAG &DAG) {
24088 SDLoc DL(V);
24089 MVT ByteVecVT = V.getSimpleValueType();
24090 MVT EltVT = VT.getVectorElementType();
24091 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
24092 "Expected value to have byte element type.");
24093 assert(EltVT != MVT::i8 &&
24094 "Horizontal byte sum only makes sense for wider elements!");
24095 unsigned VecSize = VT.getSizeInBits();
24096 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
24098 // PSADBW instruction horizontally add all bytes and leave the result in i64
24099 // chunks, thus directly computes the pop count for v2i64 and v4i64.
24100 if (EltVT == MVT::i64) {
24101 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24102 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24103 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
24104 return DAG.getBitcast(VT, V);
24107 if (EltVT == MVT::i32) {
24108 // We unpack the low half and high half into i32s interleaved with zeros so
24109 // that we can use PSADBW to horizontally sum them. The most useful part of
24110 // this is that it lines up the results of two PSADBW instructions to be
24111 // two v2i64 vectors which concatenated are the 4 population counts. We can
24112 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
24113 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
24114 SDValue V32 = DAG.getBitcast(VT, V);
24115 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
24116 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
24118 // Do the horizontal sums into two v2i64s.
24119 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
24120 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
24121 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24122 DAG.getBitcast(ByteVecVT, Low), Zeros);
24123 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
24124 DAG.getBitcast(ByteVecVT, High), Zeros);
24126 // Merge them together.
24127 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
24128 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
24129 DAG.getBitcast(ShortVecVT, Low),
24130 DAG.getBitcast(ShortVecVT, High));
24132 return DAG.getBitcast(VT, V);
24135 // The only element type left is i16.
24136 assert(EltVT == MVT::i16 && "Unknown how to handle type");
24138 // To obtain pop count for each i16 element starting from the pop count for
24139 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
24140 // right by 8. It is important to shift as i16s as i8 vector shift isn't
24141 // directly supported.
24142 SDValue ShifterV = DAG.getConstant(8, DL, VT);
24143 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24144 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
24145 DAG.getBitcast(ByteVecVT, V));
24146 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
24149 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
24150 const X86Subtarget &Subtarget,
24151 SelectionDAG &DAG) {
24152 MVT VT = Op.getSimpleValueType();
24153 MVT EltVT = VT.getVectorElementType();
24154 unsigned VecSize = VT.getSizeInBits();
24156 // Implement a lookup table in register by using an algorithm based on:
24157 // http://wm.ite.pl/articles/sse-popcount.html
24159 // The general idea is that every lower byte nibble in the input vector is an
24160 // index into a in-register pre-computed pop count table. We then split up the
24161 // input vector in two new ones: (1) a vector with only the shifted-right
24162 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
24163 // masked out higher ones) for each byte. PSHUFB is used separately with both
24164 // to index the in-register table. Next, both are added and the result is a
24165 // i8 vector where each element contains the pop count for input byte.
24167 // To obtain the pop count for elements != i8, we follow up with the same
24168 // approach and use additional tricks as described below.
24170 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
24171 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
24172 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
24173 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
24175 int NumByteElts = VecSize / 8;
24176 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
24177 SDValue In = DAG.getBitcast(ByteVecVT, Op);
24178 SmallVector<SDValue, 64> LUTVec;
24179 for (int i = 0; i < NumByteElts; ++i)
24180 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
24181 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
24182 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
24185 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
24186 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
24189 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
24191 // The input vector is used as the shuffle mask that index elements into the
24192 // LUT. After counting low and high nibbles, add the vector to obtain the
24193 // final pop count per i8 element.
24194 SDValue HighPopCnt =
24195 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
24196 SDValue LowPopCnt =
24197 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
24198 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
24200 if (EltVT == MVT::i8)
24201 return PopCnt;
24203 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
24204 }
24206 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
24207 const X86Subtarget &Subtarget,
24208 SelectionDAG &DAG) {
24209 MVT VT = Op.getSimpleValueType();
24210 assert(VT.is128BitVector() &&
24211 "Only 128-bit vector bitmath lowering supported.");
24213 int VecSize = VT.getSizeInBits();
24214 MVT EltVT = VT.getVectorElementType();
24215 int Len = EltVT.getSizeInBits();
24217 // This is the vectorized version of the "best" algorithm from
24218 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
24219 // with a minor tweak to use a series of adds + shifts instead of vector
24220 // multiplications. Implemented for all integer vector types. We only use
24221 // this when we don't have SSSE3 which allows a LUT-based lowering that is
24222 // much faster, even faster than using native popcnt instructions.
24224 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
24225 MVT VT = V.getSimpleValueType();
24226 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
24227 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
24229 auto GetMask = [&](SDValue V, APInt Mask) {
24230 MVT VT = V.getSimpleValueType();
24231 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
24232 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
24235 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
24236 // x86, so set the SRL type to have elements at least i16 wide. This is
24237 // correct because all of our SRLs are followed immediately by a mask anyways
24238 // that handles any bits that sneak into the high bits of the byte elements.
24239 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
24241 SDValue V = Op;
24243 // v = v - ((v >> 1) & 0x55555555...)
24244 SDValue Srl =
24245 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
24246 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
24247 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
24249 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
24250 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
24251 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
24252 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
24253 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
24255 // v = (v + (v >> 4)) & 0x0F0F0F0F...
24256 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
24257 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
24258 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
24260 // At this point, V contains the byte-wise population count, and we are
24261 // merely doing a horizontal sum if necessary to get the wider element
24262 // type.
24263 if (EltVT == MVT::i8)
24264 return V;
24266 return LowerHorizontalByteSum(
24267 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
24268 DAG);
24269 }
24271 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
24272 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
24273 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24274 SelectionDAG &DAG) {
24275 MVT VT = Op.getSimpleValueType();
24276 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
24277 "Unknown CTPOP type to handle");
24278 SDLoc DL(Op.getNode());
24279 SDValue Op0 = Op.getOperand(0);
24281 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
24282 if (Subtarget.hasVPOPCNTDQ()) {
24283 unsigned NumElems = VT.getVectorNumElements();
24284 assert((VT.getVectorElementType() == MVT::i8 ||
24285 VT.getVectorElementType() == MVT::i16) && "Unexpected type");
24286 if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) {
24287 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
24288 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
24289 Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
24290 return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
24291 }
24292 }
24294 if (!Subtarget.hasSSSE3()) {
24295 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
24296 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
24297 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
24300 // Decompose 256-bit ops into smaller 128-bit ops.
24301 if (VT.is256BitVector() && !Subtarget.hasInt256())
24302 return Lower256IntUnary(Op, DAG);
24304 // Decompose 512-bit ops into smaller 256-bit ops.
24305 if (VT.is512BitVector() && !Subtarget.hasBWI())
24306 return Lower512IntUnary(Op, DAG);
24308 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
24311 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
24312 SelectionDAG &DAG) {
24313 assert(Op.getSimpleValueType().isVector() &&
24314 "We only do custom lowering for vector population count.");
24315 return LowerVectorCTPOP(Op, Subtarget, DAG);
24318 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
24319 MVT VT = Op.getSimpleValueType();
24320 SDValue In = Op.getOperand(0);
24321 SDLoc DL(Op);
24323 // For scalars, its still beneficial to transfer to/from the SIMD unit to
24324 // perform the BITREVERSE.
24325 if (!VT.isVector()) {
24326 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
24327 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
24328 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
24329 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
24330 DAG.getIntPtrConstant(0, DL));
24333 int NumElts = VT.getVectorNumElements();
24334 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
24336 // Decompose 256-bit ops into smaller 128-bit ops.
24337 if (VT.is256BitVector())
24338 return Lower256IntUnary(Op, DAG);
24340 assert(VT.is128BitVector() &&
24341 "Only 128-bit vector bitreverse lowering supported.");
24343 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
24344 // perform the BSWAP in the shuffle.
24345 // Its best to shuffle using the second operand as this will implicitly allow
24346 // memory folding for multiple vectors.
24347 SmallVector<SDValue, 16> MaskElts;
24348 for (int i = 0; i != NumElts; ++i) {
24349 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
24350 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
24351 int PermuteByte = SourceByte | (2 << 5);
24352 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
24356 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
24357 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
24358 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
24359 Res, Mask);
24360 return DAG.getBitcast(VT, Res);
24363 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
24364 SelectionDAG &DAG) {
24365 MVT VT = Op.getSimpleValueType();
24367 if (Subtarget.hasXOP() && !VT.is512BitVector())
24368 return LowerBITREVERSE_XOP(Op, DAG);
24370 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
24372 SDValue In = Op.getOperand(0);
24375 unsigned NumElts = VT.getVectorNumElements();
24376 assert(VT.getScalarType() == MVT::i8 &&
24377 "Only byte vector BITREVERSE supported");
24379 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
24380 if (VT.is256BitVector() && !Subtarget.hasInt256())
24381 return Lower256IntUnary(Op, DAG);
24383 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
24384 // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
24385 // 0-15 value (shifted into the opposite nibble position).
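// Worked example: for the input byte 0xB4 (0b10110100), Lo = 0x4 and Hi = 0xB;
// LoLUT[0x4] = 0x20 and HiLUT[0xB] = 0x0D, and OR'ing them gives 0x2D
// (0b00101101), which is 0xB4 with its bits reversed.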
24386 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
24387 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
24388 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
24390 const int LoLUT[16] = {
24391 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
24392 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
24393 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
24394 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
24395 const int HiLUT[16] = {
24396 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
24397 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
24398 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
24399 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
24401 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
24402 for (unsigned i = 0; i < NumElts; ++i) {
24403 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
24404 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
24407 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
24408 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
24409 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
24410 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
24411 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
24414 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
24415 const X86Subtarget &Subtarget,
24416 bool AllowIncDec = true) {
24417 unsigned NewOpc = 0;
24418 switch (N->getOpcode()) {
24419 case ISD::ATOMIC_LOAD_ADD:
24420 NewOpc = X86ISD::LADD;
24421 break;
24422 case ISD::ATOMIC_LOAD_SUB:
24423 NewOpc = X86ISD::LSUB;
24424 break;
24425 case ISD::ATOMIC_LOAD_OR:
24426 NewOpc = X86ISD::LOR;
24427 break;
24428 case ISD::ATOMIC_LOAD_XOR:
24429 NewOpc = X86ISD::LXOR;
24430 break;
24431 case ISD::ATOMIC_LOAD_AND:
24432 NewOpc = X86ISD::LAND;
24433 break;
24434 default:
24435 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
24436 }
24438 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
24440 if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
24441 // Convert to inc/dec if they aren't slow or we are optimizing for size.
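// e.g. (atomic_load_add p, 1) becomes LOCK INC [p] and (atomic_load_add p, -1)
// becomes LOCK DEC [p], saving the immediate byte(s) in the encoding.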
24442 if (AllowIncDec && (!Subtarget.slowIncDec() ||
24443 DAG.getMachineFunction().getFunction().optForSize())) {
24444 if ((NewOpc == X86ISD::LADD && C->isOne()) ||
24445 (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
24446 return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
24447 DAG.getVTList(MVT::i32, MVT::Other),
24448 {N->getOperand(0), N->getOperand(1)},
24449 /*MemVT=*/N->getSimpleValueType(0), MMO);
24450 if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
24451 (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
24452 return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
24453 DAG.getVTList(MVT::i32, MVT::Other),
24454 {N->getOperand(0), N->getOperand(1)},
24455 /*MemVT=*/N->getSimpleValueType(0), MMO);
24459 return DAG.getMemIntrinsicNode(
24460 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
24461 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
24462 /*MemVT=*/N->getSimpleValueType(0), MMO);
24465 /// Lower atomic_load_ops into LOCK-prefixed operations.
24466 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
24467 const X86Subtarget &Subtarget) {
24468 SDValue Chain = N->getOperand(0);
24469 SDValue LHS = N->getOperand(1);
24470 SDValue RHS = N->getOperand(2);
24471 unsigned Opc = N->getOpcode();
24472 MVT VT = N->getSimpleValueType(0);
24475 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
24476 // can only be lowered when the result is unused. They should have already
24477 // been transformed into a cmpxchg loop in AtomicExpand.
24478 if (N->hasAnyUseOfValue(0)) {
24479 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
24480 // select LXADD if LOCK_SUB can't be selected.
24481 if (Opc == ISD::ATOMIC_LOAD_SUB) {
24482 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
24483 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
24484 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
24485 RHS, AN->getMemOperand());
24487 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
24488 "Used AtomicRMW ops other than Add should have been expanded!");
24492 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
24493 // RAUW the chain, but don't worry about the result, as it's unused.
24494 assert(!N->hasAnyUseOfValue(0));
24495 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
24499 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
24500 SDNode *Node = Op.getNode();
24502 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
24504 // Convert seq_cst store -> xchg
24505 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
24506 // FIXME: On 32-bit, store -> fist or movq would be more efficient
24507 // (The only way to get a 16-byte store is cmpxchg16b)
24508 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
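// A plain MOV store is not sequentially consistent on its own, but XCHG with a
// memory operand has an implicit LOCK prefix and acts as a full barrier, so
// the swap's data result can simply be ignored.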
24509 if (cast<AtomicSDNode>(Node)->getOrdering() ==
24510 AtomicOrdering::SequentiallyConsistent ||
24511 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
24512 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
24513 cast<AtomicSDNode>(Node)->getMemoryVT(),
24514 Node->getOperand(0),
24515 Node->getOperand(1), Node->getOperand(2),
24516 cast<AtomicSDNode>(Node)->getMemOperand());
24517 return Swap.getValue(1);
24519 // Other atomic stores have a simple pattern.
24523 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
24524 SDNode *N = Op.getNode();
24525 MVT VT = N->getSimpleValueType(0);
24527 // Let legalize expand this if it isn't a legal type yet.
24528 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
24531 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
24534 // Set the carry flag.
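// Adding all-ones to the incoming carry value produces a carry-out (CF = 1)
// exactly when that value is non-zero, re-materializing the carry in EFLAGS
// for the ADC/SBB below.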
24535 SDValue Carry = Op.getOperand(2);
24536 EVT CarryVT = Carry.getValueType();
24537 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
24538 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
24539 Carry, DAG.getConstant(NegOne, DL, CarryVT));
24541 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
24542 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
24543 Op.getOperand(1), Carry.getValue(1));
24545 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
24546 if (N->getValueType(1) == MVT::i1)
24547 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
24549 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
24552 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
24553 SelectionDAG &DAG) {
24554 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
24556 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
24557 // which returns the values as { float, float } (in XMM0) or
24558 // { double, double } (which is returned in XMM0, XMM1).
24560 SDValue Arg = Op.getOperand(0);
24561 EVT ArgVT = Arg.getValueType();
24562 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
24564 TargetLowering::ArgListTy Args;
24565 TargetLowering::ArgListEntry Entry;
24567 Entry.Node = Arg;
24568 Entry.Ty = ArgTy;
24569 Entry.IsSExt = false;
24570 Entry.IsZExt = false;
24571 Args.push_back(Entry);
24573 bool isF64 = ArgVT == MVT::f64;
24574 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
24575 // the small struct {f32, f32} is returned in (eax, edx). For f64,
24576 // the results are returned via SRet in memory.
24577 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24578 RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32;
24579 const char *LibcallName = TLI.getLibcallName(LC);
24580 SDValue Callee =
24581 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
24583 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
24584 : (Type *)VectorType::get(ArgTy, 4);
24586 TargetLowering::CallLoweringInfo CLI(DAG);
24587 CLI.setDebugLoc(dl)
24588 .setChain(DAG.getEntryNode())
24589 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
24591 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
24593 if (isF64)
24594 // Returned in xmm0 and xmm1.
24595 return CallResult.first;
24597 // Returned in bits 0:31 and 32:63 of xmm0.
24598 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24599 CallResult.first, DAG.getIntPtrConstant(0, dl));
24600 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
24601 CallResult.first, DAG.getIntPtrConstant(1, dl));
24602 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
24603 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
24606 /// Widen a vector input to a vector of NVT. The
24607 /// input vector must have the same element type as NVT.
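/// For example, widening a v2f32 value to v8f32 inserts it into the low lanes
/// of an otherwise undef (or zero, when FillWithZeroes is set) v8f32 vector.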
24608 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
24609 bool FillWithZeroes = false) {
24610 // Check if InOp already has the right width.
24611 MVT InVT = InOp.getSimpleValueType();
24612 if (InVT == NVT)
24613 return InOp;
24615 if (InOp.isUndef())
24616 return DAG.getUNDEF(NVT);
24618 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
24619 "input and widen element type must match");
24621 unsigned InNumElts = InVT.getVectorNumElements();
24622 unsigned WidenNumElts = NVT.getVectorNumElements();
24623 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
24624 "Unexpected request for vector widening");
24627 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
24628 InOp.getNumOperands() == 2) {
24629 SDValue N1 = InOp.getOperand(1);
24630 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
24631 N1.isUndef()) {
24632 InOp = InOp.getOperand(0);
24633 InVT = InOp.getSimpleValueType();
24634 InNumElts = InVT.getVectorNumElements();
24637 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
24638 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
24639 SmallVector<SDValue, 16> Ops;
24640 for (unsigned i = 0; i < InNumElts; ++i)
24641 Ops.push_back(InOp.getOperand(i));
24643 EVT EltVT = InOp.getOperand(0).getValueType();
24645 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
24646 DAG.getUNDEF(EltVT);
24647 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
24648 Ops.push_back(FillVal);
24649 return DAG.getBuildVector(NVT, dl, Ops);
24651 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
24652 DAG.getUNDEF(NVT);
24653 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
24654 InOp, DAG.getIntPtrConstant(0, dl));
24657 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
24658 SelectionDAG &DAG) {
24659 assert(Subtarget.hasAVX512() &&
24660 "MGATHER/MSCATTER are supported on AVX-512 arch only");
24662 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
24663 SDValue Src = N->getValue();
24664 MVT VT = Src.getSimpleValueType();
24665 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
24668 SDValue Scale = N->getScale();
24669 SDValue Index = N->getIndex();
24670 SDValue Mask = N->getMask();
24671 SDValue Chain = N->getChain();
24672 SDValue BasePtr = N->getBasePtr();
24674 if (VT == MVT::v2f32) {
24675 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24676 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24677 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24678 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24679 DAG.getUNDEF(MVT::v2f32));
24680 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24681 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24682 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24683 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24684 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24685 return SDValue(NewScatter.getNode(), 1);
24690 if (VT == MVT::v2i32) {
24691 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24692 Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
24693 DAG.getUNDEF(MVT::v2i32));
24694 // If the index is v2i64 and we have VLX we can use xmm for data and index.
24695 if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) {
24696 SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
24697 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24698 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24699 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24700 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24701 return SDValue(NewScatter.getNode(), 1);
24703 // Custom widen all the operands to avoid promotion.
24704 EVT NewIndexVT = EVT::getVectorVT(
24705 *DAG.getContext(), Index.getValueType().getVectorElementType(), 4);
24706 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
24707 DAG.getUNDEF(Index.getValueType()));
24708 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24709 DAG.getConstant(0, dl, MVT::v2i1));
24710 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24711 return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), N->getMemoryVT(), dl,
24712 Ops, N->getMemOperand());
24715 MVT IndexVT = Index.getSimpleValueType();
24716 MVT MaskVT = Mask.getSimpleValueType();
24718 // If the index is v2i32, we're being called by type legalization and we
24719 // should just let the default handling take care of it.
24720 if (IndexVT == MVT::v2i32)
24723 // If we don't have VLX and neither the passthru nor the index is 512 bits,
24724 // we need to widen until one is.
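// For example, scattering v4f32 data (128 bits) with a v4i64 index (256 bits)
// on an AVX-512F-only target uses Factor = min(4, 2) = 2, widening to v8f32
// data and a v8i64 index so the 512-bit scatter instruction can be used.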
24725 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
24726 !Index.getSimpleValueType().is512BitVector()) {
24727 // Determine how much we need to widen by to get a 512-bit type.
24728 unsigned Factor = std::min(512/VT.getSizeInBits(),
24729 512/IndexVT.getSizeInBits());
24730 unsigned NumElts = VT.getVectorNumElements() * Factor;
24732 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
24733 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
24734 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24736 Src = ExtendToType(Src, VT, DAG);
24737 Index = ExtendToType(Index, IndexVT, DAG);
24738 Mask = ExtendToType(Mask, MaskVT, DAG, true);
24741 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
24742 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
24743 SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
24744 VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
24745 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
24746 return SDValue(NewScatter.getNode(), 1);
24749 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
24750 SelectionDAG &DAG) {
24752 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
24753 MVT VT = Op.getSimpleValueType();
24754 MVT ScalarVT = VT.getScalarType();
24755 SDValue Mask = N->getMask();
24758 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
24759 "Expanding masked load is supported on AVX-512 target only!");
24761 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
24762 "Expanding masked load is supported for 32 and 64-bit types only!");
24764 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24765 "Cannot lower masked load op.");
24767 assert((ScalarVT.getSizeInBits() >= 32 ||
24768 (Subtarget.hasBWI() &&
24769 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24770 "Unsupported masked load op.");
24772 // This operation is legal for targets with VLX, but without
24773 // VLX the vector should be widened to 512 bits.
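// For example, a v8f32 masked load is widened to a v16f32 masked load whose
// extra mask lanes are zero, and the original v8f32 result is extracted from
// the low lanes afterwards.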
24774 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
24775 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24776 SDValue Src0 = N->getSrc0();
24777 Src0 = ExtendToType(Src0, WideDataVT, DAG);
24779 // Mask element has to be i1.
24780 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
24781 "Unexpected mask type");
24783 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
24785 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24786 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
24787 N->getBasePtr(), Mask, Src0,
24788 N->getMemoryVT(), N->getMemOperand(),
24789 N->getExtensionType(),
24790 N->isExpandingLoad());
24792 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
24793 NewLoad.getValue(0),
24794 DAG.getIntPtrConstant(0, dl));
24795 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
24796 return DAG.getMergeValues(RetOps, dl);
24799 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
24800 SelectionDAG &DAG) {
24801 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
24802 SDValue DataToStore = N->getValue();
24803 MVT VT = DataToStore.getSimpleValueType();
24804 MVT ScalarVT = VT.getScalarType();
24805 SDValue Mask = N->getMask();
24808 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
24809 "Compressing masked store is supported on AVX-512 target only!");
24811 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
24812 "Compressing masked store is supported for 32 and 64-bit types only!");
24814 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24815 "Cannot lower masked store op.");
24817 assert((ScalarVT.getSizeInBits() >= 32 ||
24818 (Subtarget.hasBWI() &&
24819 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
24820 "Unsupported masked store op.");
24822 // This operation is legal for targets with VLX, but without
24823 // VLX the vector should be widened to 512 bits.
24824 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
24825 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
24827 // Mask element has to be i1.
24828 assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 &&
24829 "Unexpected mask type");
24831 MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
24833 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
24834 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
24835 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
24836 Mask, N->getMemoryVT(), N->getMemOperand(),
24837 N->isTruncatingStore(), N->isCompressingStore());
24840 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
24841 SelectionDAG &DAG) {
24842 assert(Subtarget.hasAVX2() &&
24843 "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
24845 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
24847 MVT VT = Op.getSimpleValueType();
24848 SDValue Index = N->getIndex();
24849 SDValue Mask = N->getMask();
24850 SDValue Src0 = N->getValue();
24851 MVT IndexVT = Index.getSimpleValueType();
24852 MVT MaskVT = Mask.getSimpleValueType();
24854 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
24856 // If the index is v2i32, we're being called by type legalization.
24857 if (IndexVT == MVT::v2i32)
24860 // If we don't have VLX and neither the passthru nor the index is 512 bits,
24861 // we need to widen until one is.
24862 MVT OrigVT = VT;
24863 if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
24864 !IndexVT.is512BitVector()) {
24865 // Determine how much we need to widen by to get a 512-bit type.
24866 unsigned Factor = std::min(512/VT.getSizeInBits(),
24867 512/IndexVT.getSizeInBits());
24869 unsigned NumElts = VT.getVectorNumElements() * Factor;
24871 VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
24872 IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
24873 MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
24875 Src0 = ExtendToType(Src0, VT, DAG);
24876 Index = ExtendToType(Index, IndexVT, DAG);
24877 Mask = ExtendToType(Mask, MaskVT, DAG, true);
24880 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index,
24881 N->getScale() };
24882 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24883 DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
24884 N->getMemOperand());
24885 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
24886 NewGather, DAG.getIntPtrConstant(0, dl));
24887 return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
24890 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24891 SelectionDAG &DAG) const {
24892 // TODO: Eventually, the lowering of these nodes should be informed by or
24893 // deferred to the GC strategy for the function in which they appear. For
24894 // now, however, they must be lowered to something. Since they are logically
24895 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24896 // require special handling for these nodes), lower them as literal NOOPs for
24897 // the time being.
24898 SmallVector<SDValue, 2> Ops;
24900 Ops.push_back(Op.getOperand(0));
24901 if (Op->getGluedNode())
24902 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24905 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24906 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24911 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24912 SelectionDAG &DAG) const {
24913 // TODO: Eventually, the lowering of these nodes should be informed by or
24914 // deferred to the GC strategy for the function in which they appear. For
24915 // now, however, they must be lowered to something. Since they are logically
24916 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24917 // require special handling for these nodes), lower them as literal NOOPs for
24918 // the time being.
24919 SmallVector<SDValue, 2> Ops;
24921 Ops.push_back(Op.getOperand(0));
24922 if (Op->getGluedNode())
24923 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24926 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24927 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24932 /// Provide custom lowering hooks for some operations.
24933 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24934 switch (Op.getOpcode()) {
24935 default: llvm_unreachable("Should not custom lower this!");
24936 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24937 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24938 return LowerCMP_SWAP(Op, Subtarget, DAG);
24939 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24940 case ISD::ATOMIC_LOAD_ADD:
24941 case ISD::ATOMIC_LOAD_SUB:
24942 case ISD::ATOMIC_LOAD_OR:
24943 case ISD::ATOMIC_LOAD_XOR:
24944 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24945 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24946 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24947 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24948 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24949 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24950 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24951 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24952 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24953 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24954 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24955 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24956 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24957 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24958 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24959 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24960 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24961 case ISD::SHL_PARTS:
24962 case ISD::SRA_PARTS:
24963 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24964 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24965 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24966 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24967 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24968 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24969 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24970 case ISD::ZERO_EXTEND_VECTOR_INREG:
24971 case ISD::SIGN_EXTEND_VECTOR_INREG:
24972 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24973 case ISD::FP_TO_SINT:
24974 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24975 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24976 case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
24977 case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
24979 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24980 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24981 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24982 case ISD::SETCC: return LowerSETCC(Op, DAG);
24983 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24984 case ISD::SELECT: return LowerSELECT(Op, DAG);
24985 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24986 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24987 case ISD::VASTART: return LowerVASTART(Op, DAG);
24988 case ISD::VAARG: return LowerVAARG(Op, DAG);
24989 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24990 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24991 case ISD::INTRINSIC_VOID:
24992 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24993 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24994 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24995 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24996 case ISD::FRAME_TO_ARGS_OFFSET:
24997 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24998 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24999 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
25000 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
25001 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
25002 case ISD::EH_SJLJ_SETUP_DISPATCH:
25003 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
25004 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
25005 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
25006 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
25008 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
25010 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
25011 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
25013 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
25014 case ISD::UMUL_LOHI:
25015 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
25017 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
25020 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
25026 case ISD::UMULO: return LowerXALUO(Op, DAG);
25027 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
25028 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
25029 case ISD::ADDCARRY:
25030 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
25032 case ISD::SUB: return LowerADD_SUB(Op, DAG);
25036 case ISD::UMIN: return LowerMINMAX(Op, DAG);
25037 case ISD::ABS: return LowerABS(Op, DAG);
25038 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
25039 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
25040 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
25041 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
25042 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
25043 case ISD::GC_TRANSITION_START:
25044 return LowerGC_TRANSITION_START(Op, DAG);
25045 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
25049 /// Places new result values for the node in Results (their number
25050 /// and types must exactly match those of the original return values of
25051 /// the node), or leaves Results empty, which indicates that the node is not
25052 /// to be custom lowered after all.
25053 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
25054 SmallVectorImpl<SDValue> &Results,
25055 SelectionDAG &DAG) const {
25056 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
25058 if (!Res.getNode())
25061 assert((N->getNumValues() <= Res->getNumValues()) &&
25062 "Lowering returned the wrong number of results!");
25064 // Place new result values based on N's result number.
25065 // In some cases (LowerSINT_TO_FP for example) Res has more result values
25066 // than the original node; the chain (the last value) should be dropped.
25067 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
25068 Results.push_back(Res.getValue(I));
25071 /// Replace a node with an illegal result type with a new node built out of
25072 /// custom code.
25073 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
25074 SmallVectorImpl<SDValue>&Results,
25075 SelectionDAG &DAG) const {
25077 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
25078 switch (N->getOpcode()) {
25079 default:
25080 llvm_unreachable("Do not know how to custom type legalize this operation!");
25081 case X86ISD::AVG: {
25082 // Legalize types for X86ISD::AVG by expanding vectors.
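// e.g. a v4i16 input (64 bits) uses NumConcat = 2, so each operand is
// concatenated with one undef v4i16 to form a v8i16, the AVG is performed on
// v8i16, and the low v4i16 is extracted afterwards if needed.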
25083 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25085 auto InVT = N->getValueType(0);
25086 assert(InVT.getSizeInBits() < 128);
25087 assert(128 % InVT.getSizeInBits() == 0);
25088 unsigned NumConcat = 128 / InVT.getSizeInBits();
25090 EVT RegVT = EVT::getVectorVT(*DAG.getContext(),
25091 InVT.getVectorElementType(),
25092 NumConcat * InVT.getVectorNumElements());
25094 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
25095 Ops[0] = N->getOperand(0);
25096 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25097 Ops[0] = N->getOperand(1);
25098 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
25100 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
25101 if (getTypeAction(*DAG.getContext(), InVT) != TypeWidenVector)
25102 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
25103 DAG.getIntPtrConstant(0, dl));
25104 Results.push_back(Res);
25107 case ISD::SETCC: {
25108 // Widen v2i32 (setcc v2f32). This is really needed for AVX512VL when
25109 // setCC result type is v2i1 because type legalization will end up with
25110 // a v4i1 setcc plus an extend.
25111 assert(N->getValueType(0) == MVT::v2i32 && "Unexpected type");
25112 if (N->getOperand(0).getValueType() != MVT::v2f32)
25114 SDValue UNDEF = DAG.getUNDEF(MVT::v2f32);
25115 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25116 N->getOperand(0), UNDEF);
25117 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25118 N->getOperand(1), UNDEF);
25119 SDValue Res = DAG.getNode(ISD::SETCC, dl, MVT::v4i32, LHS, RHS,
25121 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25122 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25123 DAG.getIntPtrConstant(0, dl));
25124 Results.push_back(Res);
25127 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
25128 case X86ISD::FMINC:
25130 case X86ISD::FMAXC:
25131 case X86ISD::FMAX: {
25132 EVT VT = N->getValueType(0);
25133 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
25134 SDValue UNDEF = DAG.getUNDEF(VT);
25135 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25136 N->getOperand(0), UNDEF);
25137 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25138 N->getOperand(1), UNDEF);
25139 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
25147 case ISD::UDIVREM: {
25148 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
25149 Results.push_back(V);
25152 case ISD::FP_TO_SINT:
25153 case ISD::FP_TO_UINT: {
25154 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
25156 if (N->getValueType(0) == MVT::v2i32) {
25157 assert((IsSigned || Subtarget.hasAVX512()) &&
25158 "Can only handle signed conversion without AVX512");
25159 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25160 SDValue Src = N->getOperand(0);
25161 if (Src.getValueType() == MVT::v2f64) {
25162 MVT ResVT = MVT::v4i32;
25163 unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
25164 if (!IsSigned && !Subtarget.hasVLX()) {
25165 // Widen to 512-bits.
25166 ResVT = MVT::v8i32;
25167 Opc = ISD::FP_TO_UINT;
25168 Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
25169 DAG.getUNDEF(MVT::v8f64),
25170 Src, DAG.getIntPtrConstant(0, dl));
25172 SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
25173 bool WidenType = getTypeAction(*DAG.getContext(),
25174 MVT::v2i32) == TypeWidenVector;
25175 ResVT = WidenType ? MVT::v4i32 : MVT::v2i32;
25176 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
25177 DAG.getIntPtrConstant(0, dl));
25178 Results.push_back(Res);
25181 if (Src.getValueType() == MVT::v2f32) {
25182 SDValue Idx = DAG.getIntPtrConstant(0, dl);
25183 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
25184 DAG.getUNDEF(MVT::v2f32));
25185 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
25186 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
25187 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25188 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
25189 Results.push_back(Res);
25193 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
25194 // so early out here.
25198 std::pair<SDValue,SDValue> Vals =
25199 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
25200 SDValue FIST = Vals.first, StackSlot = Vals.second;
25201 if (FIST.getNode()) {
25202 EVT VT = N->getValueType(0);
25203 // Return a load from the stack slot.
25204 if (StackSlot.getNode())
25206 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
25208 Results.push_back(FIST);
25212 case ISD::SINT_TO_FP: {
25213 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
25214 SDValue Src = N->getOperand(0);
25215 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
25217 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
25220 case ISD::UINT_TO_FP: {
25221 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25222 EVT VT = N->getValueType(0);
25223 if (VT != MVT::v2f32)
25225 SDValue Src = N->getOperand(0);
25226 EVT SrcVT = Src.getValueType();
25227 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
25228 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
25231 if (SrcVT != MVT::v2i32)
25233 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
25234 SDValue VBias =
25235 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
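// 0x4330000000000000 is the double constant 2^52. OR'ing a zero-extended
// 32-bit value into its low mantissa bits yields exactly the double 2^52 + x,
// so subtracting 2^52 below recovers x as a double, which is then rounded to
// f32 by the VFPROUND.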
25236 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
25237 DAG.getBitcast(MVT::v2i64, VBias));
25238 Or = DAG.getBitcast(MVT::v2f64, Or);
25239 // TODO: Are there any fast-math-flags to propagate here?
25240 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
25241 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
25244 case ISD::FP_ROUND: {
25245 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
25247 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
25248 Results.push_back(V);
25251 case ISD::FP_EXTEND: {
25252 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
25253 // No other ValueType for FP_EXTEND should reach this point.
25254 assert(N->getValueType(0) == MVT::v2f32 &&
25255 "Do not know how to legalize this Node");
25258 case ISD::INTRINSIC_W_CHAIN: {
25259 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
25261 default : llvm_unreachable("Do not know how to custom type "
25262 "legalize this intrinsic operation!");
25263 case Intrinsic::x86_rdtsc:
25264 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25266 case Intrinsic::x86_rdtscp:
25267 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
25269 case Intrinsic::x86_rdpmc:
25270 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
25272 case Intrinsic::x86_xgetbv:
25273 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
25276 case ISD::INTRINSIC_WO_CHAIN: {
25277 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
25278 Results.push_back(V);
25281 case ISD::READCYCLECOUNTER: {
25282 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
25285 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
25286 EVT T = N->getValueType(0);
25287 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
25288 bool Regs64bit = T == MVT::i128;
25289 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
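// CMPXCHG8B/CMPXCHG16B compare EDX:EAX (RDX:RAX) with the memory operand and,
// on a match, store ECX:EBX (RCX:RBX) back to memory; ZF reports success. The
// expected and replacement values are therefore split into halves and copied
// into those register pairs below.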
25290 SDValue cpInL, cpInH;
25291 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25292 DAG.getConstant(0, dl, HalfT));
25293 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
25294 DAG.getConstant(1, dl, HalfT));
25295 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
25296 Regs64bit ? X86::RAX : X86::EAX,
25298 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
25299 Regs64bit ? X86::RDX : X86::EDX,
25300 cpInH, cpInL.getValue(1));
25301 SDValue swapInL, swapInH;
25302 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25303 DAG.getConstant(0, dl, HalfT));
25304 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
25305 DAG.getConstant(1, dl, HalfT));
25306 swapInH =
25307 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
25308 swapInH, cpInH.getValue(1));
25309 // If the current function needs the base pointer, RBX,
25310 // we shouldn't use cmpxchg directly.
25311 // Indeed, the lowering of that instruction will clobber
25312 // that register, and since RBX will be a reserved register,
25313 // the register allocator will not make sure its value is
25314 // properly saved and restored around this live range.
25315 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25316 SDValue Result;
25317 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
25318 unsigned BasePtr = TRI->getBaseRegister();
25319 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
25320 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
25321 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
25322 // ISel prefers the LCMPXCHG64 variant.
25323 // If that assert breaks, that means it is not the case anymore,
25324 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
25325 // not just EBX. This is a matter of accepting i64 input for that
25326 // pseudo, and restoring into the register of the right width
25327 // in the expand pseudo. Everything else should just work.
25328 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
25329 "Saving only half of the RBX");
25330 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
25331 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
25332 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
25333 Regs64bit ? X86::RBX : X86::EBX,
25334 HalfT, swapInH.getValue(1));
25335 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
25337 /*Glue*/ RBXSave.getValue(2)};
25338 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25339 } else {
25340 unsigned Opcode =
25341 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
25342 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
25343 Regs64bit ? X86::RBX : X86::EBX, swapInL,
25344 swapInH.getValue(1));
25345 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
25346 swapInL.getValue(1)};
25347 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
25349 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
25350 Regs64bit ? X86::RAX : X86::EAX,
25351 HalfT, Result.getValue(1));
25352 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
25353 Regs64bit ? X86::RDX : X86::EDX,
25354 HalfT, cpOutL.getValue(2));
25355 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
25357 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
25358 MVT::i32, cpOutH.getValue(2));
25359 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
25360 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
25362 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
25363 Results.push_back(Success);
25364 Results.push_back(EFLAGS.getValue(1));
25367 case ISD::ATOMIC_SWAP:
25368 case ISD::ATOMIC_LOAD_ADD:
25369 case ISD::ATOMIC_LOAD_SUB:
25370 case ISD::ATOMIC_LOAD_AND:
25371 case ISD::ATOMIC_LOAD_OR:
25372 case ISD::ATOMIC_LOAD_XOR:
25373 case ISD::ATOMIC_LOAD_NAND:
25374 case ISD::ATOMIC_LOAD_MIN:
25375 case ISD::ATOMIC_LOAD_MAX:
25376 case ISD::ATOMIC_LOAD_UMIN:
25377 case ISD::ATOMIC_LOAD_UMAX:
25378 case ISD::ATOMIC_LOAD: {
25379 // Delegate to generic TypeLegalization. Situations we can really handle
25380 // should have already been dealt with by AtomicExpandPass.cpp.
25383 case ISD::BITCAST: {
25384 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
25385 EVT DstVT = N->getValueType(0);
25386 EVT SrcVT = N->getOperand(0).getValueType();
25388 // If this is a bitcast from a v64i1 k-register to an i64 on a 32-bit target,
25389 // we can split using the k-register rather than memory.
25390 if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) {
25391 assert(!Subtarget.is64Bit() && "Expected 32-bit mode");
25392 SDValue Lo, Hi;
25393 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
25394 Lo = DAG.getBitcast(MVT::i32, Lo);
25395 Hi = DAG.getBitcast(MVT::i32, Hi);
25396 SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
25397 Results.push_back(Res);
25401 // Custom splitting for BWI types when AVX512F is available but BWI isn't.
25402 if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
25403 SrcVT.isVector() && isTypeLegal(SrcVT)) {
25404 SDValue Res = Lower512IntUnary(SDValue(N, 0), DAG);
25405 Results.push_back(Res);
25409 if (SrcVT != MVT::f64 ||
25410 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
25413 unsigned NumElts = DstVT.getVectorNumElements();
25414 EVT SVT = DstVT.getVectorElementType();
25415 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
25416 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
25417 MVT::v2f64, N->getOperand(0));
25418 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
25420 if (getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector) {
25421 // If we are legalizing vectors by widening, we already have the desired
25422 // legal vector type, just return it.
25423 Results.push_back(ToVecInt);
25427 SmallVector<SDValue, 8> Elts;
25428 for (unsigned i = 0, e = NumElts; i != e; ++i)
25429 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
25430 ToVecInt, DAG.getIntPtrConstant(i, dl)));
25432 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
25435 case ISD::MGATHER: {
25436 EVT VT = N->getValueType(0);
25437 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25438 auto *Gather = cast<MaskedGatherSDNode>(N);
25439 SDValue Index = Gather->getIndex();
25440 if (Index.getValueType() != MVT::v2i64)
25442 SDValue Mask = Gather->getMask();
25443 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25444 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
25445 Gather->getValue(),
25446 DAG.getUNDEF(MVT::v2f32));
25447 if (!Subtarget.hasVLX()) {
25448 // We need to widen the mask, but the instruction will only use 2
25449 // of its elements. So we can use undef.
25450 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25451 DAG.getUNDEF(MVT::v2i1));
25452 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25454 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25455 Index, Gather->getScale() };
25456 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25457 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
25458 Gather->getMemoryVT(), Gather->getMemOperand());
25459 Results.push_back(Res);
25460 Results.push_back(Res.getValue(2));
25463 if (VT == MVT::v2i32) {
25464 auto *Gather = cast<MaskedGatherSDNode>(N);
25465 SDValue Index = Gather->getIndex();
25466 SDValue Mask = Gather->getMask();
25467 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
25468 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
25469 Gather->getValue(),
25470 DAG.getUNDEF(MVT::v2i32));
25471 // If the index is v2i64 we can use it directly.
25472 if (Index.getValueType() == MVT::v2i64 &&
25473 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
25474 if (!Subtarget.hasVLX()) {
25475 // We need to widen the mask, but the instruction will only use 2
25476 // of its elements. So we can use undef.
25477 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25478 DAG.getUNDEF(MVT::v2i1));
25479 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
25481 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25482 Index, Gather->getScale() };
25483 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
25484 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
25485 Gather->getMemoryVT(), Gather->getMemOperand());
25486 SDValue Chain = Res.getValue(2);
25487 if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector)
25488 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25489 DAG.getIntPtrConstant(0, dl));
25490 Results.push_back(Res);
25491 Results.push_back(Chain);
25494 EVT IndexVT = Index.getValueType();
25495 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
25496 IndexVT.getScalarType(), 4);
25497 // Otherwise we need to custom widen everything to avoid promotion.
25498 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
25499 DAG.getUNDEF(IndexVT));
25500 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
25501 DAG.getConstant(0, dl, MVT::v2i1));
25502 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
25503 Index, Gather->getScale() };
25504 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
25505 Gather->getMemoryVT(), dl, Ops,
25506 Gather->getMemOperand());
25507 SDValue Chain = Res.getValue(1);
25508 if (getTypeAction(*DAG.getContext(), MVT::v2i32) != TypeWidenVector)
25509 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
25510 DAG.getIntPtrConstant(0, dl));
25511 Results.push_back(Res);
25512 Results.push_back(Chain);
25520 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
25521 switch ((X86ISD::NodeType)Opcode) {
25522 case X86ISD::FIRST_NUMBER: break;
25523 case X86ISD::BSF: return "X86ISD::BSF";
25524 case X86ISD::BSR: return "X86ISD::BSR";
25525 case X86ISD::SHLD: return "X86ISD::SHLD";
25526 case X86ISD::SHRD: return "X86ISD::SHRD";
25527 case X86ISD::FAND: return "X86ISD::FAND";
25528 case X86ISD::FANDN: return "X86ISD::FANDN";
25529 case X86ISD::FOR: return "X86ISD::FOR";
25530 case X86ISD::FXOR: return "X86ISD::FXOR";
25531 case X86ISD::FILD: return "X86ISD::FILD";
25532 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
25533 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
25534 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
25535 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
25536 case X86ISD::FLD: return "X86ISD::FLD";
25537 case X86ISD::FST: return "X86ISD::FST";
25538 case X86ISD::CALL: return "X86ISD::CALL";
25539 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
25540 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
25541 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
25542 case X86ISD::BT: return "X86ISD::BT";
25543 case X86ISD::CMP: return "X86ISD::CMP";
25544 case X86ISD::COMI: return "X86ISD::COMI";
25545 case X86ISD::UCOMI: return "X86ISD::UCOMI";
25546 case X86ISD::CMPM: return "X86ISD::CMPM";
25547 case X86ISD::CMPMU: return "X86ISD::CMPMU";
25548 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
25549 case X86ISD::SETCC: return "X86ISD::SETCC";
25550 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
25551 case X86ISD::FSETCC: return "X86ISD::FSETCC";
25552 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
25553 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
25554 case X86ISD::CMOV: return "X86ISD::CMOV";
25555 case X86ISD::BRCOND: return "X86ISD::BRCOND";
25556 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
25557 case X86ISD::IRET: return "X86ISD::IRET";
25558 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
25559 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
25560 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
25561 case X86ISD::Wrapper: return "X86ISD::Wrapper";
25562 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
25563 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
25564 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
25565 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
25566 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
25567 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
25568 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
25569 case X86ISD::PINSRB: return "X86ISD::PINSRB";
25570 case X86ISD::PINSRW: return "X86ISD::PINSRW";
25571 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
25572 case X86ISD::ANDNP: return "X86ISD::ANDNP";
25573 case X86ISD::BLENDI: return "X86ISD::BLENDI";
25574 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
25575 case X86ISD::ADDUS: return "X86ISD::ADDUS";
25576 case X86ISD::SUBUS: return "X86ISD::SUBUS";
25577 case X86ISD::HADD: return "X86ISD::HADD";
25578 case X86ISD::HSUB: return "X86ISD::HSUB";
25579 case X86ISD::FHADD: return "X86ISD::FHADD";
25580 case X86ISD::FHSUB: return "X86ISD::FHSUB";
25581 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
25582 case X86ISD::FMAX: return "X86ISD::FMAX";
25583 case X86ISD::FMAXS: return "X86ISD::FMAXS";
25584 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
25585 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
25586 case X86ISD::FMIN: return "X86ISD::FMIN";
25587 case X86ISD::FMINS: return "X86ISD::FMINS";
25588 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
25589 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
25590 case X86ISD::FMAXC: return "X86ISD::FMAXC";
25591 case X86ISD::FMINC: return "X86ISD::FMINC";
25592 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
25593 case X86ISD::FRCP: return "X86ISD::FRCP";
25594 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25595 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25596 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25597 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25598 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25599 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25600 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25601 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25602 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25603 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25604 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25605 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25606 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25607 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25608 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25609 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25610 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25611 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25612 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25613 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25614 case X86ISD::LADD: return "X86ISD::LADD";
25615 case X86ISD::LSUB: return "X86ISD::LSUB";
25616 case X86ISD::LOR: return "X86ISD::LOR";
25617 case X86ISD::LXOR: return "X86ISD::LXOR";
25618 case X86ISD::LAND: return "X86ISD::LAND";
25619 case X86ISD::LINC: return "X86ISD::LINC";
25620 case X86ISD::LDEC: return "X86ISD::LDEC";
25621 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25622 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25623 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25624 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25625 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25626 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25627 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25628 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25629 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25630 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25631 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25632 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25633 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25634 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25635 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25636 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25637 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25638 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25639 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25640 case X86ISD::VSHL: return "X86ISD::VSHL";
25641 case X86ISD::VSRL: return "X86ISD::VSRL";
25642 case X86ISD::VSRA: return "X86ISD::VSRA";
25643 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25644 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25645 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25646 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25647 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25648 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25649 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25650 case X86ISD::CMPP: return "X86ISD::CMPP";
25651 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25652 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25653 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25654 case X86ISD::ADD: return "X86ISD::ADD";
25655 case X86ISD::SUB: return "X86ISD::SUB";
25656 case X86ISD::ADC: return "X86ISD::ADC";
25657 case X86ISD::SBB: return "X86ISD::SBB";
25658 case X86ISD::SMUL: return "X86ISD::SMUL";
25659 case X86ISD::UMUL: return "X86ISD::UMUL";
25660 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25661 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25662 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25663 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25664 case X86ISD::INC: return "X86ISD::INC";
25665 case X86ISD::DEC: return "X86ISD::DEC";
25666 case X86ISD::OR: return "X86ISD::OR";
25667 case X86ISD::XOR: return "X86ISD::XOR";
25668 case X86ISD::AND: return "X86ISD::AND";
25669 case X86ISD::BEXTR: return "X86ISD::BEXTR";
25670 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25671 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25672 case X86ISD::PTEST: return "X86ISD::PTEST";
25673 case X86ISD::TESTP: return "X86ISD::TESTP";
25674 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25675 case X86ISD::KTEST: return "X86ISD::KTEST";
25676 case X86ISD::KADD: return "X86ISD::KADD";
25677 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25678 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25679 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25680 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25681 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25682 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25683 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25684 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25685 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25686 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25687 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25688 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25689 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25690 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25691 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25692 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25693 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25694 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25695 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25696 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25697 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25698 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25699 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25700 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25701 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25702 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25703 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25704 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25705 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25706 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25707 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25708 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25709 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25710 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25711 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25712 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25713 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25714 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25715 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25716 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25717 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25718 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25719 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25720 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25721 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25722 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25723 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25724 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25725 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25726 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25727 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25728 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25729 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25730 case X86ISD::SAHF: return "X86ISD::SAHF";
25731 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25732 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25733 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25734 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25735 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25736 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25737 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25738 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25739 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25740 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25741 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25742 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25743 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25744 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25745 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25746 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25747 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25748 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25749 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25750 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25751 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25752 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25753 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25754 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25755 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25756 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25757 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25758 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25759 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25760 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25761 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25762 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25763 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25764 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25765 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25766 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25767 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25768 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25769 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25770 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25771 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25772 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25773 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25774 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25775 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25776 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25777 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25778 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25779 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25780 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25781 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25782 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25783 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25784 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25785 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25786 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25787 case X86ISD::XTEST: return "X86ISD::XTEST";
25788 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25789 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25790 case X86ISD::SELECT: return "X86ISD::SELECT";
25791 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25792 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25793 case X86ISD::RCP14: return "X86ISD::RCP14";
25794 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25795 case X86ISD::RCP28: return "X86ISD::RCP28";
25796 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25797 case X86ISD::EXP2: return "X86ISD::EXP2";
25798 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25799 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25800 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25801 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25802 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25803 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25804 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25805 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25806 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25807 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25808 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25809 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25810 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25811 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25812 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25813 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25814 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25815 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25816 case X86ISD::ADDS: return "X86ISD::ADDS";
25817 case X86ISD::SUBS: return "X86ISD::SUBS";
25818 case X86ISD::AVG: return "X86ISD::AVG";
25819 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25820 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25821 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25822 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25823 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25824 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25825 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25826 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25827 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25828 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25829 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25830 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25831 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25832 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25833 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25834 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25835 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25836 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25837 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25838 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25839 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25840 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25841 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25842 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25843 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25844 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25845 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25846 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25847 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25848 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25849 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25850 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25851 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25852 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25853 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25854 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25859 /// Return true if the addressing mode represented by AM is legal for this
25860 /// target, for a load/store of the specified type.
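/// For example, a memory operand such as "4(%rbx,%rcx,8)" corresponds to
/// BaseReg + 8*IndexReg + 4, which is accepted below, while a scale of 3, 5
/// or 9 is only accepted when no base register is present (it can then be
/// formed as scalereg + scalereg*{2,4,8}).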
25861 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25862 const AddrMode &AM, Type *Ty,
                                              unsigned AS,
                                              Instruction *I) const {
25865 // X86 supports extremely general addressing modes.
25866 CodeModel::Model M = getTargetMachine().getCodeModel();
25868 // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;
  }

  // If lower 4G is not available, then we must use rip-relative addressing.
  if ((M != CodeModel::Small || isPositionIndependent()) &&
      Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
    return false;

  switch (AM.Scale) {
  case 0: case 1: case 2: case 4: case 8:
    // These scales always work.
    break;
  case 3: case 5: case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}
25913 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25914 unsigned Bits = Ty->getScalarSizeInBits();
25916 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25917 // particularly cheaper than those without.
25921 // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
25922 if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
25923 (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
25926 // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
25927 // shifts just as cheap as scalar ones.
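  // (VPSLLVD/VPSLLVQ read a per-element shift count directly from a vector
  // register, so there is no need to broadcast a scalar amount first.)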
25928 if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
25931 // AVX512BW has shifts such as vpsllvw.
25932 if (Subtarget.hasBWI() && Bits == 16)
25935 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25936 // fully general vector.
25940 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25941 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25943 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25944 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25945 return NumBits1 > NumBits2;
25948 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25949 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25952 if (!isTypeLegal(EVT::getEVT(Ty1)))
25955 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25957 // Assuming the caller doesn't have a zeroext or signext return parameter,
25958 // truncation all the way down to i1 is valid.
25962 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25963 return isInt<32>(Imm);
25966 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25967 // Can also use sub to handle negated immediates.
25968 return isInt<32>(Imm);
25971 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25972 if (!VT1.isInteger() || !VT2.isInteger())
25974 unsigned NumBits1 = VT1.getSizeInBits();
25975 unsigned NumBits2 = VT2.getSizeInBits();
25976 return NumBits1 > NumBits2;
25979 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25980 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
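  // For example, "movl %ecx, %eax" already clears the upper 32 bits of %rax,
  // so a subsequent i32 -> i64 zext needs no extra instruction.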
25981 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25984 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25985 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25986 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25989 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25990 EVT VT1 = Val.getValueType();
25991 if (isZExtFree(VT1, VT2))
25994 if (Val.getOpcode() != ISD::LOAD)
25997 if (!VT1.isSimple() || !VT1.isInteger() ||
25998 !VT2.isSimple() || !VT2.isInteger())
26001 switch (VT1.getSimpleVT().SimpleTy) {
26006 // X86 has 8, 16, and 32-bit zero-extending loads.
26013 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
26014 EVT SrcVT = ExtVal.getOperand(0).getValueType();
26016 // There is no extending load for vXi1.
26017 if (SrcVT.getScalarType() == MVT::i1)
26024 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
26025 if (!Subtarget.hasAnyFMA())
26028 VT = VT.getScalarType();
26030 if (!VT.isSimple())
26033 switch (VT.getSimpleVT().SimpleTy) {
26044 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
26045 // i16 instructions are longer (0x66 prefix) and potentially slower.
26046 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
26049 /// Targets can use this to indicate that they only support *some*
26050 /// VECTOR_SHUFFLE operations, those with specific masks.
26051 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
26052 /// are assumed to be legal.
26053 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
26054 if (!VT.isSimple())
26057 // Not for i1 vectors
26058 if (VT.getSimpleVT().getScalarType() == MVT::i1)
26061 // Very little shuffling can be done for 64-bit vectors right now.
26062 if (VT.getSimpleVT().getSizeInBits() == 64)
26065 // We only care that the types being shuffled are legal. The lowering can
26066 // handle any possible shuffle mask that results.
26067 return isTypeLegal(VT.getSimpleVT());
26071 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
26073 // Just delegate to the generic legality, clear masks aren't special.
26074 return isShuffleMaskLegal(Mask, VT);
26077 bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
26078 // If the subtarget is using retpolines, we need to not generate jump tables.
26079 if (Subtarget.useRetpoline())
26082 // Otherwise, fallback on the generic logic.
26083 return TargetLowering::areJTsAllowed(Fn);
26086 //===----------------------------------------------------------------------===//
26087 // X86 Scheduler Hooks
26088 //===----------------------------------------------------------------------===//
26090 /// Utility function to emit xbegin specifying the start of an RTM region.
26091 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
26092 const TargetInstrInfo *TII) {
26093 DebugLoc DL = MI.getDebugLoc();
26095 const BasicBlock *BB = MBB->getBasicBlock();
26096 MachineFunction::iterator I = ++MBB->getIterator();
  // For the v = xbegin(), we generate
  //
  //   thisMBB: xbegin fallMBB
  //   mainMBB: s0 = -1
  //   fallMBB: eax = # XABORT_DEF; s1 = eax
  //   sinkMBB: v = phi(s0/mainBB, s1/fallBB)
26113 MachineBasicBlock *thisMBB = MBB;
26114 MachineFunction *MF = MBB->getParent();
26115 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26116 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
26117 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26118 MF->insert(I, mainMBB);
26119 MF->insert(I, fallMBB);
26120 MF->insert(I, sinkMBB);
26122 // Transfer the remainder of BB and its successor edges to sinkMBB.
26123 sinkMBB->splice(sinkMBB->begin(), MBB,
26124 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26125 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26127 MachineRegisterInfo &MRI = MF->getRegInfo();
26128 unsigned DstReg = MI.getOperand(0).getReg();
26129 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26130 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26131 unsigned fallDstReg = MRI.createVirtualRegister(RC);
26135 // # fallthrough to mainMBB
  // # abort to fallMBB
26137 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
26138 thisMBB->addSuccessor(mainMBB);
26139 thisMBB->addSuccessor(fallMBB);
26142 // mainDstReg := -1
26143 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
26144 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26145 mainMBB->addSuccessor(sinkMBB);
26148 // ; pseudo instruction to model hardware's definition from XABORT
26149 // EAX := XABORT_DEF
26150 // fallDstReg := EAX
26151 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
26152 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
26154 fallMBB->addSuccessor(sinkMBB);
26157 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
26158 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
26159 .addReg(mainDstReg).addMBB(mainMBB)
26160 .addReg(fallDstReg).addMBB(fallMBB);
26162 MI.eraseFromParent();
26166 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX, all of this code can be replaced with that
26168 // in the .td file.
26169 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
26170 const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
26173 default: llvm_unreachable("illegal opcode!");
26174 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
26175 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
26176 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
26177 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
26178 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
26179 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
26180 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
26181 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
26184 DebugLoc dl = MI.getDebugLoc();
26185 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
26187 unsigned NumArgs = MI.getNumOperands();
26188 for (unsigned i = 1; i < NumArgs; ++i) {
26189 MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
26193 if (MI.hasOneMemOperand())
26194 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
26196 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26197 .addReg(X86::XMM0);
26199 MI.eraseFromParent();
26203 // FIXME: Custom handling because TableGen doesn't support multiple implicit
26204 // defs in an instruction pattern
26205 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
26206 const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
26209 default: llvm_unreachable("illegal opcode!");
26210 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
26211 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
26212 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
26213 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
26214 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
26215 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
26216 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
26217 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
26220 DebugLoc dl = MI.getDebugLoc();
26221 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
26223 unsigned NumArgs = MI.getNumOperands(); // remove the results
26224 for (unsigned i = 1; i < NumArgs; ++i) {
26225 MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
26229 if (MI.hasOneMemOperand())
26230 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
26232 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26235 MI.eraseFromParent();
26239 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26240 const X86Subtarget &Subtarget) {
26241 DebugLoc dl = MI.getDebugLoc();
26242 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26244 // insert input VAL into EAX
26245 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
26246 .addReg(MI.getOperand(0).getReg());
26247 // insert zero to ECX
26248 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26250 // insert zero to EDX
26251 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
26253 // insert WRPKRU instruction
26254 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
26256 MI.eraseFromParent(); // The pseudo is gone now.
26260 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
26261 const X86Subtarget &Subtarget) {
26262 DebugLoc dl = MI.getDebugLoc();
26263 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26265 // insert zero to ECX
26266 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
26268 // insert RDPKRU instruction
26269 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
26270 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
26273 MI.eraseFromParent(); // The pseudo is gone now.
26277 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
26278 const X86Subtarget &Subtarget,
26280 DebugLoc dl = MI.getDebugLoc();
26281 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26282 // Address into RAX/EAX, other two args into ECX, EDX.
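  // (MONITOR-style instructions take the linear address implicitly in
  // RAX/EAX, the extensions in ECX and the hints in EDX, which is why the
  // pseudo's explicit operands are copied into those fixed registers here.)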
26283 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26284 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26285 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26286 for (int i = 0; i < X86::AddrNumOperands; ++i)
26287 MIB.add(MI.getOperand(i));
26289 unsigned ValOps = X86::AddrNumOperands;
26290 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
26291 .addReg(MI.getOperand(ValOps).getReg());
26292 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
26293 .addReg(MI.getOperand(ValOps + 1).getReg());
26295 // The instruction doesn't actually take any operands though.
26296 BuildMI(*BB, MI, dl, TII->get(Opc));
26298 MI.eraseFromParent(); // The pseudo is gone now.
26302 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
26303 const X86Subtarget &Subtarget) {
26304 DebugLoc dl = MI->getDebugLoc();
26305 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26306 // Address into RAX/EAX
26307 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
26308 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
26309 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
26310 for (int i = 0; i < X86::AddrNumOperands; ++i)
26311 MIB.add(MI->getOperand(i));
26313 // The instruction doesn't actually take any operands though.
26314 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
26316 MI->eraseFromParent(); // The pseudo is gone now.
26322 MachineBasicBlock *
26323 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
26324 MachineBasicBlock *MBB) const {
26325 // Emit va_arg instruction on X86-64.
26327 // Operands to this pseudo-instruction:
26328 // 0 ) Output : destination address (reg)
26329 // 1-5) Input : va_list address (addr, i64mem)
26330 // 6 ) ArgSize : Size (in bytes) of vararg type
26331 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
26332 // 8 ) Align : Alignment of type
26333 // 9 ) EFLAGS (implicit-def)
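  //
  // As a rough example, a SysV "va_arg(ap, double)" is lowered to this pseudo
  // with ArgSize = 8, ArgMode = 2 (use fp_offset) and Align = 8.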
26335 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
26336 static_assert(X86::AddrNumOperands == 5,
26337 "VAARG_64 assumes 5 address operands");
26339 unsigned DestReg = MI.getOperand(0).getReg();
26340 MachineOperand &Base = MI.getOperand(1);
26341 MachineOperand &Scale = MI.getOperand(2);
26342 MachineOperand &Index = MI.getOperand(3);
26343 MachineOperand &Disp = MI.getOperand(4);
26344 MachineOperand &Segment = MI.getOperand(5);
26345 unsigned ArgSize = MI.getOperand(6).getImm();
26346 unsigned ArgMode = MI.getOperand(7).getImm();
26347 unsigned Align = MI.getOperand(8).getImm();
26349 // Memory Reference
26350 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
26351 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26352 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26354 // Machine Information
26355 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26356 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
26357 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
26358 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
26359 DebugLoc DL = MI.getDebugLoc();
  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8
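  //
  // For example, pulling the third integer argument from the register save
  // area finds gp_offset = 16, loads from reg_save_area + 16, and then bumps
  // gp_offset to 24; once gp_offset reaches 48 (6 GPRs * 8 bytes) the
  // argument is read from overflow_area instead.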
26370 unsigned TotalNumIntRegs = 6;
26371 unsigned TotalNumXMMRegs = 8;
26372 bool UseGPOffset = (ArgMode == 1);
26373 bool UseFPOffset = (ArgMode == 2);
26374 unsigned MaxOffset = TotalNumIntRegs * 8 +
26375 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
26377 /* Align ArgSize to a multiple of 8 */
26378 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
26379 bool NeedsAlign = (Align > 8);
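  // E.g. for ArgMode == 2, MaxOffset = 6*8 + 8*16 = 176, and an ArgSize of 12
  // rounds up to ArgSizeA8 = 16.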
26381 MachineBasicBlock *thisMBB = MBB;
26382 MachineBasicBlock *overflowMBB;
26383 MachineBasicBlock *offsetMBB;
26384 MachineBasicBlock *endMBB;
26386 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
26387 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
26388 unsigned OffsetReg = 0;
26390 if (!UseGPOffset && !UseFPOffset) {
26391 // If we only pull from the overflow region, we don't create a branch.
26392 // We don't need to alter control flow.
26393 OffsetDestReg = 0; // unused
26394 OverflowDestReg = DestReg;
26396 offsetMBB = nullptr;
26397 overflowMBB = thisMBB;
26400 // First emit code to check if gp_offset (or fp_offset) is below the bound.
26401 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
26402 // If not, pull from overflow_area. (branch to overflowMBB)
  //
  //   thisMBB -> { offsetMBB, overflowMBB } -> endMBB
26412 // Registers for the PHI in endMBB
26413 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
26414 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
26416 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26417 MachineFunction *MF = MBB->getParent();
26418 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26419 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26420 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26422 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26424 // Insert the new basic blocks
26425 MF->insert(MBBIter, offsetMBB);
26426 MF->insert(MBBIter, overflowMBB);
26427 MF->insert(MBBIter, endMBB);
26429 // Transfer the remainder of MBB and its successor edges to endMBB.
26430 endMBB->splice(endMBB->begin(), thisMBB,
26431 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
26432 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
26434 // Make offsetMBB and overflowMBB successors of thisMBB
26435 thisMBB->addSuccessor(offsetMBB);
26436 thisMBB->addSuccessor(overflowMBB);
26438 // endMBB is a successor of both offsetMBB and overflowMBB
26439 offsetMBB->addSuccessor(endMBB);
26440 overflowMBB->addSuccessor(endMBB);
26442 // Load the offset value into a register
26443 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26444 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
26448 .addDisp(Disp, UseFPOffset ? 4 : 0)
26450 .setMemRefs(MMOBegin, MMOEnd);
26452 // Check if there is enough room left to pull this argument.
26453 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
26455 .addImm(MaxOffset + 8 - ArgSizeA8);
26457 // Branch to "overflowMBB" if offset >= max
26458 // Fall through to "offsetMBB" otherwise
26459 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
26460 .addMBB(overflowMBB);
26463 // In offsetMBB, emit code to use the reg_save_area.
26465 assert(OffsetReg != 0);
26467 // Read the reg_save_area address.
26468 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
26469 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
26475 .setMemRefs(MMOBegin, MMOEnd);
26477 // Zero-extend the offset
26478 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
26479 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
26482 .addImm(X86::sub_32bit);
26484 // Add the offset to the reg_save_area to get the final address.
26485 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
26486 .addReg(OffsetReg64)
26487 .addReg(RegSaveReg);
26489 // Compute the offset for the next argument
26490 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
26491 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
26493 .addImm(UseFPOffset ? 16 : 8);
26495 // Store it back into the va_list.
26496 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
26500 .addDisp(Disp, UseFPOffset ? 4 : 0)
26502 .addReg(NextOffsetReg)
26503 .setMemRefs(MMOBegin, MMOEnd);
26506 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
26511 // Emit code to use overflow area
26514 // Load the overflow_area address into a register.
26515 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
26516 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
26522 .setMemRefs(MMOBegin, MMOEnd);
26524 // If we need to align it, do so. Otherwise, just copy the address
26525 // to OverflowDestReg.
26527 // Align the overflow address
26528 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
26529 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
26531 // aligned_addr = (addr + (align-1)) & ~(align-1)
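    // e.g. with Align = 16, an overflow address of 0x1008 becomes
    // 0x1008 + 15 = 0x1017, which masks down to 0x1010.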
26532 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
26533 .addReg(OverflowAddrReg)
26536 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
26538 .addImm(~(uint64_t)(Align-1));
26540 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
26541 .addReg(OverflowAddrReg);
26544 // Compute the next overflow address after this argument.
26545 // (the overflow address should be kept 8-byte aligned)
26546 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
26547 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
26548 .addReg(OverflowDestReg)
26549 .addImm(ArgSizeA8);
26551 // Store the new overflow address.
26552 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
26558 .addReg(NextAddrReg)
26559 .setMemRefs(MMOBegin, MMOEnd);
26561 // If we branched, emit the PHI to the front of endMBB.
26563 BuildMI(*endMBB, endMBB->begin(), DL,
26564 TII->get(X86::PHI), DestReg)
26565 .addReg(OffsetDestReg).addMBB(offsetMBB)
26566 .addReg(OverflowDestReg).addMBB(overflowMBB);
26569 // Erase the pseudo instruction
26570 MI.eraseFromParent();
26575 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
26576 MachineInstr &MI, MachineBasicBlock *MBB) const {
26577 // Emit code to save XMM registers to the stack. The ABI says that the
26578 // number of registers to save is given in %al, so it's theoretically
26579 // possible to do an indirect jump trick to avoid saving all of them,
26580 // however this code takes a simpler approach and just executes all
26581 // of the stores if %al is non-zero. It's less code, and it's probably
26582 // easier on the hardware branch predictor, and stores aren't all that
26583 // expensive anyway.
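  //
  // (On the SysV AMD64 ABI, %al carries an upper bound on the number of
  // vector registers used by the varargs call, so %al == 0 means no XMM
  // argument registers need to be saved at all.)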
26585 // Create the new basic blocks. One block contains all the XMM stores,
26586 // and one block is the final destination regardless of whether any
26587 // stores were performed.
26588 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
26589 MachineFunction *F = MBB->getParent();
26590 MachineFunction::iterator MBBIter = ++MBB->getIterator();
26591 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
26592 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
26593 F->insert(MBBIter, XMMSaveMBB);
26594 F->insert(MBBIter, EndMBB);
26596 // Transfer the remainder of MBB and its successor edges to EndMBB.
26597 EndMBB->splice(EndMBB->begin(), MBB,
26598 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26599 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
26601 // The original block will now fall through to the XMM save block.
26602 MBB->addSuccessor(XMMSaveMBB);
26603 // The XMMSaveMBB will fall through to the end block.
26604 XMMSaveMBB->addSuccessor(EndMBB);
26606 // Now add the instructions.
26607 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26608 DebugLoc DL = MI.getDebugLoc();
26610 unsigned CountReg = MI.getOperand(0).getReg();
26611 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
26612 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
26614 if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
26615 // If %al is 0, branch around the XMM save block.
26616 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26617 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26618 MBB->addSuccessor(EndMBB);
26621 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26622 // that was just emitted, but clearly shouldn't be "saved".
26623 assert((MI.getNumOperands() <= 3 ||
26624 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26625 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26626 "Expected last argument to be EFLAGS");
26627 unsigned MOVOpc = Subtarget.hasAVX() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26628 // In the XMM save block, save all the XMM argument registers.
26629 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26630 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26631 MachineMemOperand *MMO = F->getMachineMemOperand(
26632 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26633 MachineMemOperand::MOStore,
26634 /*Size=*/16, /*Align=*/16);
26635 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26636 .addFrameIndex(RegSaveFrameIndex)
26637 .addImm(/*Scale=*/1)
26638 .addReg(/*IndexReg=*/0)
26639 .addImm(/*Disp=*/Offset)
26640 .addReg(/*Segment=*/0)
26641 .addReg(MI.getOperand(i).getReg())
26642 .addMemOperand(MMO);
26645 MI.eraseFromParent(); // The pseudo instruction is gone now.
26650 // The EFLAGS operand of SelectItr might be missing a kill marker
26651 // because there were multiple uses of EFLAGS, and ISel didn't know
26652 // which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
26655 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26656 MachineBasicBlock* BB,
26657 const TargetRegisterInfo* TRI) {
26658 // Scan forward through BB for a use/def of EFLAGS.
26659 MachineBasicBlock::iterator miI(std::next(SelectItr));
26660 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26661 const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
26664 if (mi.definesRegister(X86::EFLAGS))
26665 break; // Should have kill-flag - update below.
  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
26670 if (miI == BB->end()) {
26671 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26672 sEnd = BB->succ_end();
26673 sItr != sEnd; ++sItr) {
26674 MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
26680 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26681 // out. SelectMI should have a kill flag on EFLAGS.
26682 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26686 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26687 // together with other CMOV pseudo-opcodes into a single basic-block with
26688 // conditional jump around it.
26689 static bool isCMOVPseudo(MachineInstr &MI) {
26690 switch (MI.getOpcode()) {
26691 case X86::CMOV_FR32:
26692 case X86::CMOV_FR64:
26693 case X86::CMOV_GR8:
26694 case X86::CMOV_GR16:
26695 case X86::CMOV_GR32:
26696 case X86::CMOV_RFP32:
26697 case X86::CMOV_RFP64:
26698 case X86::CMOV_RFP80:
26699 case X86::CMOV_V2F64:
26700 case X86::CMOV_V2I64:
26701 case X86::CMOV_V4F32:
26702 case X86::CMOV_V4F64:
26703 case X86::CMOV_V4I64:
26704 case X86::CMOV_V16F32:
26705 case X86::CMOV_V8F32:
26706 case X86::CMOV_V8F64:
26707 case X86::CMOV_V8I64:
26708 case X86::CMOV_V8I1:
26709 case X86::CMOV_V16I1:
26710 case X86::CMOV_V32I1:
26711 case X86::CMOV_V64I1:
26719 // Helper function, which inserts PHI functions into SinkMBB:
26720 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
// the last PHI function inserted.
26724 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26725 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26726 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26727 MachineBasicBlock *SinkMBB) {
26728 MachineFunction *MF = TrueMBB->getParent();
26729 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26730 DebugLoc DL = MIItBegin->getDebugLoc();
26732 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26733 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26735 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26737 // As we are creating the PHIs, we have to be careful if there is more than
26738 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26739 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26740 // That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the registers that went into that PHI.
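//
// For example, if an earlier CMOV defined %t2 from (%a, %b), a later CMOV
// that uses %t2 must instead pick up %a or %b (depending on which incoming
// edge is taken) in its PHI; RegRewriteTable records that pair keyed by %t2.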
26743 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26744 MachineInstrBuilder MIB;
26746 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26747 unsigned DestReg = MIIt->getOperand(0).getReg();
26748 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26749 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26751 // If this CMOV we are generating is the opposite condition from
26752 // the jump we generated, then we have to swap the operands for the
26753 // PHI that is going to be generated.
26754 if (MIIt->getOperand(3).getImm() == OppCC)
26755 std::swap(Op1Reg, Op2Reg);
26757 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26758 Op1Reg = RegRewriteTable[Op1Reg].first;
26760 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26761 Op2Reg = RegRewriteTable[Op2Reg].second;
26763 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26769 // Add this PHI to the rewrite table.
26770 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
// Lower cascaded selects of the form (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
26777 MachineBasicBlock *
26778 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26779 MachineInstr &SecondCascadedCMOV,
26780 MachineBasicBlock *ThisMBB) const {
26781 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26782 DebugLoc DL = FirstCMOV.getDebugLoc();
26784 // We lower cascaded CMOVs such as
26786 // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26788 // to two successive branches.
26790 // Without this, we would add a PHI between the two jumps, which ends up
26791 // creating a few copies all around. For instance, for
26793 // (sitofp (zext (fcmp une)))
26795 // we would generate:
  //   ucomiss %xmm1, %xmm0
  //   movss   <1.0f>, %xmm0
  //   movaps  %xmm0, %xmm1
  //   <branch on the first condition>
  //   xorps   %xmm1, %xmm1
  //   <branch on the second condition>
  //   movaps  %xmm1, %xmm0
  //
  // because this custom-inserter would have generated two back-to-back
  // diamonds (blocks A..E) with a PHI at each join point:
  //
  //   A: X = ...; Y = ...
  //   C: Z = PHI [X, A], [Y, B]
  //   E: PHI [X, C], [Z, D]
  //
  // If we lower both CMOVs in a single step, we can instead generate a single
  // sink block E with three predecessors and only one PHI:
  //
  //   A: X = ...; Y = ...
  //   E: PHI [X, A], [X, C], [Y, D]
  //
  // Which, in our sitofp/fcmp example, gives us something like:
  //
  //   ucomiss %xmm1, %xmm0
  //   movss   <1.0f>, %xmm0
  //   <branches straight to the join block>
  //   xorps   %xmm0, %xmm0
26853 // We lower cascaded CMOV into two successive branches to the same block.
26854 // EFLAGS is used by both, so mark it as live in the second.
26855 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26856 MachineFunction *F = ThisMBB->getParent();
26857 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26858 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26859 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26861 MachineFunction::iterator It = ++ThisMBB->getIterator();
26862 F->insert(It, FirstInsertedMBB);
26863 F->insert(It, SecondInsertedMBB);
26864 F->insert(It, SinkMBB);
26866 // For a cascaded CMOV, we lower it to two successive branches to
26867 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26868 // the FirstInsertedMBB.
26869 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26871 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26872 // live into the sink and copy blocks.
26873 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26874 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26875 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26876 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26877 SinkMBB->addLiveIn(X86::EFLAGS);
26880 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26881 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26882 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26884 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26886 // Fallthrough block for ThisMBB.
26887 ThisMBB->addSuccessor(FirstInsertedMBB);
26888 // The true block target of the first branch is always SinkMBB.
26889 ThisMBB->addSuccessor(SinkMBB);
26890 // Fallthrough block for FirstInsertedMBB.
26891 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26892 // The true block for the branch of FirstInsertedMBB.
26893 FirstInsertedMBB->addSuccessor(SinkMBB);
26894 // This is fallthrough.
26895 SecondInsertedMBB->addSuccessor(SinkMBB);
26897 // Create the conditional branch instructions.
26898 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26899 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26900 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26902 X86::CondCode SecondCC =
26903 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26904 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26905 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26908 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26909 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26910 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26911 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26912 MachineInstrBuilder MIB =
26913 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
26915 .addMBB(SecondInsertedMBB)
  // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB
  // (the True operand of the SELECT_CC/CMOV nodes).
26921 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26922 // Copy the PHI result to the register defined by the second CMOV.
26923 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26924 TII->get(TargetOpcode::COPY),
26925 SecondCascadedCMOV.getOperand(0).getReg())
26926 .addReg(FirstCMOV.getOperand(0).getReg());
26928 // Now remove the CMOVs.
26929 FirstCMOV.eraseFromParent();
26930 SecondCascadedCMOV.eraseFromParent();
26935 MachineBasicBlock *
26936 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26937 MachineBasicBlock *ThisMBB) const {
26938 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26939 DebugLoc DL = MI.getDebugLoc();
26941 // To "insert" a SELECT_CC instruction, we actually have to insert the
26942 // diamond control-flow pattern. The incoming instruction knows the
26943 // destination vreg to set, the condition code register to branch on, the
26944 // true/false values to select between and a branch opcode to use.
  //   ThisMBB:
  //     cmpTY ccX, r1, r2
  //     bCC SinkMBB
  //     fallthrough --> FalseMBB
26953 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26954 // as described above, by inserting a BB, and then making a PHI at the join
26955 // point to select the true and false operands of the CMOV in the PHI.
  // The code also handles two different cases of multiple CMOV opcodes
  // in a row.
  //
  // Case 1:
  //   In this case, there are multiple CMOVs in a row, all of which are based
  // on the same condition setting (or the exact opposite condition setting).
  // In this case we can lower all the CMOVs using a single inserted BB, and
  // then make a number of PHIs at the join point to model the CMOVs. The only
  // trickiness here is that in a case like:
26967 // t2 = CMOV cond1 t1, f1
26968 // t3 = CMOV cond1 t2, f2
26970 // when rewriting this into PHIs, we have to perform some renaming on the
26971 // temps since you cannot have a PHI operand refer to a PHI result earlier
26972 // in the same block. The "simple" but wrong lowering would be:
26974 // t2 = PHI t1(BB1), f1(BB2)
26975 // t3 = PHI t2(BB1), f2(BB2)
26977 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26978 // renaming is to note that on the path through BB1, t2 is really just a
26979 // copy of t1, and do that renaming, properly generating:
26981 // t2 = PHI t1(BB1), f1(BB2)
26982 // t3 = PHI t1(BB1), f2(BB2)
  // Case 2:
  //   A cascaded CMOV of the form CMOV ((CMOV F, T, cc1), T, cc2) is checked
  // here and handled by a separate function - EmitLoweredCascadedSelect.
26988 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26989 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26990 MachineInstr *LastCMOV = &MI;
26991 MachineBasicBlock::iterator NextMIIt =
26992 std::next(MachineBasicBlock::iterator(MI));
26994 // Check for case 1, where there are multiple CMOVs with the same condition
26995 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26996 // number of jumps the most.
26998 if (isCMOVPseudo(MI)) {
26999 // See if we have a string of CMOVS with the same condition.
27000 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
27001 (NextMIIt->getOperand(3).getImm() == CC ||
27002 NextMIIt->getOperand(3).getImm() == OppCC)) {
27003 LastCMOV = &*NextMIIt;
  // This checks for case 2, but only does so if we didn't already find
  // case 1, as indicated by LastCMOV == &MI.
27010 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
27011 NextMIIt->getOpcode() == MI.getOpcode() &&
27012 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
27013 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
27014 NextMIIt->getOperand(1).isKill()) {
27015 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
27018 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
27019 MachineFunction *F = ThisMBB->getParent();
27020 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
27021 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
27023 MachineFunction::iterator It = ++ThisMBB->getIterator();
27024 F->insert(It, FalseMBB);
27025 F->insert(It, SinkMBB);
27027 // If the EFLAGS register isn't dead in the terminator, then claim that it's
27028 // live into the sink and copy blocks.
27029 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27030 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
27031 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
27032 FalseMBB->addLiveIn(X86::EFLAGS);
27033 SinkMBB->addLiveIn(X86::EFLAGS);
27036 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
27037 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
27038 std::next(MachineBasicBlock::iterator(LastCMOV)),
27040 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
27042 // Fallthrough block for ThisMBB.
27043 ThisMBB->addSuccessor(FalseMBB);
  // The true block target of the first (or only) branch is always SinkMBB.
27045 ThisMBB->addSuccessor(SinkMBB);
27046 // Fallthrough block for FalseMBB.
27047 FalseMBB->addSuccessor(SinkMBB);
27049 // Create the conditional branch instruction.
27050 unsigned Opc = X86::GetCondBranchFromCond(CC);
27051 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
27054 // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
27056 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
27057 MachineBasicBlock::iterator MIItEnd =
27058 std::next(MachineBasicBlock::iterator(LastCMOV));
27059 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
27061 // Now remove the CMOV(s).
27062 ThisMBB->erase(MIItBegin, MIItEnd);
27067 MachineBasicBlock *
27068 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
27069 MachineBasicBlock *BB) const {
27070 // Combine the following atomic floating-point modification pattern:
27071 // a.store(reg OP a.load(acquire), release)
27072 // Transform them into:
27073 // OPss (%gpr), %xmm
27074 // movss %xmm, (%gpr)
27075 // Or sd equivalent for 64-bit operations.
  unsigned FOp, MOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
27088 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27089 DebugLoc DL = MI.getDebugLoc();
27090 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
27091 unsigned ValOpIdx = X86::AddrNumOperands;
27092 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
27093 MachineInstrBuilder MIB =
27094 BuildMI(*BB, MI, DL, TII->get(FOp),
27095 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
27097 for (int i = 0; i < X86::AddrNumOperands; ++i) {
27098 MachineOperand &Operand = MI.getOperand(i);
27099 // Clear any kill flags on register operands as we'll create a second
27100 // instruction using the same address operands.
27101 if (Operand.isReg())
27102 Operand.setIsKill(false);
27105 MachineInstr *FOpMI = MIB;
27106 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
27107 for (int i = 0; i < X86::AddrNumOperands; ++i)
27108 MIB.add(MI.getOperand(i));
27109 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
27110 MI.eraseFromParent(); // The pseudo instruction is gone now.
27114 MachineBasicBlock *
27115 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
27116 MachineBasicBlock *BB) const {
27117 MachineFunction *MF = BB->getParent();
27118 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27119 DebugLoc DL = MI.getDebugLoc();
27120 const BasicBlock *LLVM_BB = BB->getBasicBlock();
27122 assert(MF->shouldSplitStack());
27124 const bool Is64Bit = Subtarget.is64Bit();
27125 const bool IsLP64 = Subtarget.isTarget64BitLP64();
27127 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
27128 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
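  // (These match the libgcc split-stack convention, in which the current
  // stacklet bound is kept in the thread control block, e.g. at %fs:0x70 for
  // LP64 and %gs:0x30 for 32-bit.)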
  // BB:
  //   ... [Till the alloca]
  //   If stacklet is not large enough, jump to mallocMBB
  // bumpMBB:
  //   Allocate by subtracting from RSP
  //   Jump to continueMBB
  // mallocMBB:
  //   Allocate by call to runtime
  // continueMBB:
  //   [rest of original BB]
27146 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27147 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27148 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
27150 MachineRegisterInfo &MRI = MF->getRegInfo();
27151 const TargetRegisterClass *AddrRegClass =
27152 getRegClassFor(getPointerTy(MF->getDataLayout()));
27154 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27155 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
27156 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
27157 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
27158 sizeVReg = MI.getOperand(1).getReg(),
           physSPReg =
               IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
27162 MachineFunction::iterator MBBIter = ++BB->getIterator();
27164 MF->insert(MBBIter, bumpMBB);
27165 MF->insert(MBBIter, mallocMBB);
27166 MF->insert(MBBIter, continueMBB);
27168 continueMBB->splice(continueMBB->begin(), BB,
27169 std::next(MachineBasicBlock::iterator(MI)), BB->end());
27170 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
27172 // Add code to the main basic block to check if the stack limit has been hit,
27173 // and if so, jump to mallocMBB otherwise to bumpMBB.
27174 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
27175 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
27176 .addReg(tmpSPVReg).addReg(sizeVReg);
27177 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
27178 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
27179 .addReg(SPLimitVReg);
27180 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
27182 // bumpMBB simply decreases the stack pointer, since we know the current
27183 // stacklet has enough space.
27184 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
27185 .addReg(SPLimitVReg);
27186 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
27187 .addReg(SPLimitVReg);
27188 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27190 // Calls into a routine in libgcc to allocate more space from the heap.
27191 const uint32_t *RegMask =
27192 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
  if (IsLP64) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
        .addReg(sizeVReg);
27196 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27197 .addExternalSymbol("__morestack_allocate_stack_space")
27198 .addRegMask(RegMask)
27199 .addReg(X86::RDI, RegState::Implicit)
27200 .addReg(X86::RAX, RegState::ImplicitDefine);
27201 } else if (Is64Bit) {
    BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
        .addReg(sizeVReg);
27204 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
27205 .addExternalSymbol("__morestack_allocate_stack_space")
27206 .addRegMask(RegMask)
27207 .addReg(X86::EDI, RegState::Implicit)
27208 .addReg(X86::EAX, RegState::ImplicitDefine);
27210 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
27212 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
27213 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
27214 .addExternalSymbol("__morestack_allocate_stack_space")
27215 .addRegMask(RegMask)
27216 .addReg(X86::EAX, RegState::ImplicitDefine);
27220 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
27223 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
27224 .addReg(IsLP64 ? X86::RAX : X86::EAX);
27225 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
27227 // Set up the CFG correctly.
27228 BB->addSuccessor(bumpMBB);
27229 BB->addSuccessor(mallocMBB);
27230 mallocMBB->addSuccessor(continueMBB);
27231 bumpMBB->addSuccessor(continueMBB);
27233 // Take care of the PHI nodes.
27234 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
27235 MI.getOperand(0).getReg())
27236 .addReg(mallocPtrVReg)
27238 .addReg(bumpSPPtrVReg)
27241 // Delete the original pseudo instruction.
27242 MI.eraseFromParent();
27245 return continueMBB;
27248 MachineBasicBlock *
27249 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
27250 MachineBasicBlock *BB) const {
27251 MachineFunction *MF = BB->getParent();
27252 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27253 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
27254 DebugLoc DL = MI.getDebugLoc();
27256 assert(!isAsynchronousEHPersonality(
27257 classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
27258 "SEH does not use catchret!");
27260 // Only 32-bit EH needs to worry about manually restoring stack pointers.
  if (!Subtarget.is32Bit())
    return BB;
27264 // C++ EH creates a new target block to hold the restore code, and wires up
27265 // the new block to the return destination with a normal JMP_4.
27266 MachineBasicBlock *RestoreMBB =
27267 MF->CreateMachineBasicBlock(BB->getBasicBlock());
27268 assert(BB->succ_size() == 1);
27269 MF->insert(std::next(BB->getIterator()), RestoreMBB);
27270 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
27271 BB->addSuccessor(RestoreMBB);
27272 MI.getOperand(0).setMBB(RestoreMBB);
27274 auto RestoreMBBI = RestoreMBB->begin();
27275 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
27276 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
27280 MachineBasicBlock *
27281 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
27282 MachineBasicBlock *BB) const {
27283 MachineFunction *MF = BB->getParent();
27284 const Constant *PerFn = MF->getFunction().getPersonalityFn();
27285 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
27286 // Only 32-bit SEH requires special handling for catchpad.
27287 if (IsSEH && Subtarget.is32Bit()) {
27288 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27289 DebugLoc DL = MI.getDebugLoc();
27290 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
27292 MI.eraseFromParent();
27296 MachineBasicBlock *
27297 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
27298 MachineBasicBlock *BB) const {
27299 // So, here we replace TLSADDR with the sequence:
27300 // adjust_stackdown -> TLSADDR -> adjust_stackup.
27301 // We need this because TLSADDR is lowered into calls
27302 // inside MC, therefore without the two markers shrink-wrapping
27303 // may push the prologue/epilogue past them.
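// Schematically the block becomes:
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLSADDR ...               ; expanded to the actual call during MC lowering
//   ADJCALLSTACKUP 0, 0
// (illustrative; the concrete setup/destroy opcodes come from the
// TargetInstrInfo queries below).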
27304 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
27305 DebugLoc DL = MI.getDebugLoc();
27306 MachineFunction &MF = *BB->getParent();
27308 // Emit CALLSEQ_START right before the instruction.
27309 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
27310 MachineInstrBuilder CallseqStart =
27311 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
27312 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
27314 // Emit CALLSEQ_END right after the instruction.
27315 // We don't call erase from parent because we want to keep the
27316 // original instruction around.
27317 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
27318 MachineInstrBuilder CallseqEnd =
27319 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
27320 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
27325 MachineBasicBlock *
27326 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
27327 MachineBasicBlock *BB) const {
27328 // This is pretty easy. We're taking the value that we received from
27329 // our load from the relocation, sticking it in either RDI (x86-64)
27330 // or EAX and doing an indirect call. The return value will then
27331 // be in the normal return register.
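// E.g. on x86-64 Darwin this lowers to roughly:
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// leaving the address of the TLS variable in RAX.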
27332 MachineFunction *F = BB->getParent();
27333 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27334 DebugLoc DL = MI.getDebugLoc();
27336 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
27337 assert(MI.getOperand(3).isGlobal() && "This should be a global");
27339 // Get a register mask for the lowered call.
27340 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
27341 // proper register mask.
27342 const uint32_t *RegMask =
27343 Subtarget.is64Bit() ?
27344 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
27345 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
27346 if (Subtarget.is64Bit()) {
27347 MachineInstrBuilder MIB =
27348 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
27352 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27353 MI.getOperand(3).getTargetFlags())
27355 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
27356 addDirectMem(MIB, X86::RDI);
27357 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
27358 } else if (!isPositionIndependent()) {
27359 MachineInstrBuilder MIB =
27360 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27364 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27365 MI.getOperand(3).getTargetFlags())
27367 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27368 addDirectMem(MIB, X86::EAX);
27369 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27370 } else {
27371 MachineInstrBuilder MIB =
27372 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
27373 .addReg(TII->getGlobalBaseReg(F))
27376 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
27377 MI.getOperand(3).getTargetFlags())
27379 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
27380 addDirectMem(MIB, X86::EAX);
27381 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
27384 MI.eraseFromParent(); // The pseudo instruction is gone now.
27388 static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
27389 switch (RPOpc) {
27390 case X86::RETPOLINE_CALL32:
27391 return X86::CALLpcrel32;
27392 case X86::RETPOLINE_CALL64:
27393 return X86::CALL64pcrel32;
27394 case X86::RETPOLINE_TCRETURN32:
27395 return X86::TCRETURNdi;
27396 case X86::RETPOLINE_TCRETURN64:
27397 return X86::TCRETURNdi64;
27399 llvm_unreachable("not retpoline opcode");
27402 static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
27403 unsigned Reg) {
27404 if (Subtarget.useRetpolineExternalThunk()) {
27405 // When using an external thunk for retpolines, we pick names that match the
27406 // names GCC happens to use as well. This helps simplify the implementation
27407 // of the thunks for kernels where they have no easy ability to create
27408 // aliases and are doing non-trivial configuration of the thunk's body. For
27409 // example, the Linux kernel will do boot-time hot patching of the thunk
27410 // bodies and cannot easily export aliases of these to loaded modules.
27412 // Note that at any point in the future, we may need to change the semantics
27413 // of how we implement retpolines and at that time will likely change the
27414 // name of the called thunk. Essentially, there is no hard guarantee that
27415 // LLVM will generate calls to specific thunks, we merely make a best-effort
27416 // attempt to help out kernels and other systems where duplicating the
27417 // thunks is costly.
27418 switch (Reg) {
27419 case X86::EAX:
27420 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27421 return "__x86_indirect_thunk_eax";
27422 case X86::ECX:
27423 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27424 return "__x86_indirect_thunk_ecx";
27425 case X86::EDX:
27426 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27427 return "__x86_indirect_thunk_edx";
27428 case X86::EDI:
27429 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27430 return "__x86_indirect_thunk_edi";
27431 case X86::R11:
27432 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27433 return "__x86_indirect_thunk_r11";
27434 }
27435 llvm_unreachable("unexpected reg for retpoline");
27436 }
27438 // When targeting an internal COMDAT thunk use an LLVM-specific name.
27439 switch (Reg) {
27440 case X86::EAX:
27441 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27442 return "__llvm_retpoline_eax";
27443 case X86::ECX:
27444 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27445 return "__llvm_retpoline_ecx";
27446 case X86::EDX:
27447 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27448 return "__llvm_retpoline_edx";
27449 case X86::EDI:
27450 assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
27451 return "__llvm_retpoline_edi";
27452 case X86::R11:
27453 assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
27454 return "__llvm_retpoline_r11";
27455 }
27456 llvm_unreachable("unexpected reg for retpoline");
27457 }
27459 MachineBasicBlock *
27460 X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
27461 MachineBasicBlock *BB) const {
27462 // Copy the virtual register into the R11 physical register and
27463 // call the retpoline thunk.
27464 DebugLoc DL = MI.getDebugLoc();
27465 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27466 unsigned CalleeVReg = MI.getOperand(0).getReg();
27467 unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
27469 // Find an available scratch register to hold the callee. On 64-bit, we can
27470 // just use R11, but we scan for uses anyway to ensure we don't generate
27471 // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't
27472 // already a register use operand to the call to hold the callee. If none
27473 // are available, use EDI instead. EDI is chosen because EBX is the PIC base
27474 // register and ESI is the base pointer to realigned stack frames with VLAs.
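// E.g. a 32-bit indirect call through a virtual register becomes, roughly:
//   %eax = COPY %callee_vreg
//   calll __llvm_retpoline_eax     ; or the __x86_indirect_thunk_* name
// (illustrative only; the register actually chosen depends on the scan
// below).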
27475 SmallVector<unsigned, 3> AvailableRegs;
27476 if (Subtarget.is64Bit())
27477 AvailableRegs.push_back(X86::R11);
27478 else
27479 AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI});
27481 // Zero out any registers that are already used.
27482 for (const auto &MO : MI.operands()) {
27483 if (MO.isReg() && MO.isUse())
27484 for (unsigned &Reg : AvailableRegs)
27485 if (Reg == MO.getReg())
27486 Reg = 0;
27487 }
27489 // Choose the first remaining non-zero available register.
27490 unsigned AvailableReg = 0;
27491 for (unsigned MaybeReg : AvailableRegs) {
27492 if (MaybeReg) {
27493 AvailableReg = MaybeReg;
27494 break;
27495 }
27496 }
27497 if (!AvailableReg)
27498 report_fatal_error("calling convention incompatible with retpoline, no "
27499 "available registers");
27501 const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
27503 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
27504 .addReg(CalleeVReg);
27505 MI.getOperand(0).ChangeToES(Symbol);
27506 MI.setDesc(TII->get(Opc));
27507 MachineInstrBuilder(*BB->getParent(), &MI)
27508 .addReg(AvailableReg, RegState::Implicit | RegState::Kill);
27512 MachineBasicBlock *
27513 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
27514 MachineBasicBlock *MBB) const {
27515 DebugLoc DL = MI.getDebugLoc();
27516 MachineFunction *MF = MBB->getParent();
27517 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27518 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
27519 MachineRegisterInfo &MRI = MF->getRegInfo();
27521 const BasicBlock *BB = MBB->getBasicBlock();
27522 MachineFunction::iterator I = ++MBB->getIterator();
27524 // Memory Reference
27525 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27526 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27529 unsigned MemOpndSlot = 0;
27531 unsigned CurOp = 0;
27533 DstReg = MI.getOperand(CurOp++).getReg();
27534 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
27535 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
27537 unsigned mainDstReg = MRI.createVirtualRegister(RC);
27538 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
27540 MemOpndSlot = CurOp;
27542 MVT PVT = getPointerTy(MF->getDataLayout());
27543 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27544 "Invalid Pointer Size!");
27546 // For v = setjmp(buf), we generate
27547 //
27548 // thisMBB:
27549 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
27550 // SjLjSetup restoreMBB
27551 //
27552 // mainMBB:
27553 // v_main = 0
27554 //
27555 // sinkMBB:
27556 // v = phi(main, restore)
27557 //
27558 // restoreMBB:
27559 // if base pointer is used, load it from frame
27560 // v_restore = 1
27562 MachineBasicBlock *thisMBB = MBB;
27563 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
27564 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
27565 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
27566 MF->insert(I, mainMBB);
27567 MF->insert(I, sinkMBB);
27568 MF->push_back(restoreMBB);
27569 restoreMBB->setHasAddressTaken();
27571 MachineInstrBuilder MIB;
27573 // Transfer the remainder of BB and its successor edges to sinkMBB.
27574 sinkMBB->splice(sinkMBB->begin(), MBB,
27575 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
27576 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
27579 unsigned PtrStoreOpc = 0;
27580 unsigned LabelReg = 0;
27581 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27582 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27583 !isPositionIndependent();
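// In other words: with the small code model and no PIC the restoreMBB
// address fits in a 32-bit immediate and can be stored directly
// (MOV64mi32/MOV32mi); otherwise it is first materialized into LabelReg
// with an LEA below.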
27585 // Prepare IP either in reg or imm.
27586 if (!UseImmLabel) {
27587 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27588 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
27589 LabelReg = MRI.createVirtualRegister(PtrRC);
27590 if (Subtarget.is64Bit()) {
27591 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
27595 .addMBB(restoreMBB)
27598 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
27599 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
27600 .addReg(XII->getGlobalBaseReg(MF))
27603 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
27607 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27609 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
27610 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27611 if (i == X86::AddrDisp)
27612 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
27613 else
27614 MIB.add(MI.getOperand(MemOpndSlot + i));
27615 }
27616 if (!UseImmLabel)
27617 MIB.addReg(LabelReg);
27618 else
27619 MIB.addMBB(restoreMBB);
27620 MIB.setMemRefs(MMOBegin, MMOEnd);
27622 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
27623 .addMBB(restoreMBB);
27625 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27626 MIB.addRegMask(RegInfo->getNoPreservedMask());
27627 thisMBB->addSuccessor(mainMBB);
27628 thisMBB->addSuccessor(restoreMBB);
27632 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
27633 mainMBB->addSuccessor(sinkMBB);
27636 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
27637 TII->get(X86::PHI), DstReg)
27638 .addReg(mainDstReg).addMBB(mainMBB)
27639 .addReg(restoreDstReg).addMBB(restoreMBB);
27642 if (RegInfo->hasBasePointer(*MF)) {
27643 const bool Uses64BitFramePtr =
27644 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27645 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
27646 X86FI->setRestoreBasePointer(MF);
27647 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
27648 unsigned BasePtr = RegInfo->getBaseRegister();
27649 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
27650 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
27651 FramePtr, true, X86FI->getRestoreBasePointerOffset())
27652 .setMIFlag(MachineInstr::FrameSetup);
27653 }
27654 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
27655 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
27656 restoreMBB->addSuccessor(sinkMBB);
27658 MI.eraseFromParent();
27662 MachineBasicBlock *
27663 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
27664 MachineBasicBlock *MBB) const {
27665 DebugLoc DL = MI.getDebugLoc();
27666 MachineFunction *MF = MBB->getParent();
27667 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27668 MachineRegisterInfo &MRI = MF->getRegInfo();
27670 // Memory Reference
27671 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
27672 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
27674 MVT PVT = getPointerTy(MF->getDataLayout());
27675 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
27676 "Invalid Pointer Size!");
27678 const TargetRegisterClass *RC =
27679 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27680 unsigned Tmp = MRI.createVirtualRegister(RC);
27681 // Since FP is only updated here but NOT referenced, it's treated as GPR.
27682 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
27683 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
27684 unsigned SP = RegInfo->getStackRegister();
27686 MachineInstrBuilder MIB;
27688 const int64_t LabelOffset = 1 * PVT.getStoreSize();
27689 const int64_t SPOffset = 2 * PVT.getStoreSize();
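// Layout of the setjmp buffer as used here (pointer-sized slots):
//   buf[0] = frame pointer, buf[1] = resume address, buf[2] = stack pointer.
// The loads below reload FP, the target IP and SP in that order and then
// jump through the reloaded address.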
27691 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
27692 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
27695 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
27696 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
27697 MIB.add(MI.getOperand(i));
27698 MIB.setMemRefs(MMOBegin, MMOEnd);
27700 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
27701 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27702 if (i == X86::AddrDisp)
27703 MIB.addDisp(MI.getOperand(i), LabelOffset);
27705 MIB.add(MI.getOperand(i));
27707 MIB.setMemRefs(MMOBegin, MMOEnd);
27709 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
27710 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
27711 if (i == X86::AddrDisp)
27712 MIB.addDisp(MI.getOperand(i), SPOffset);
27714 MIB.add(MI.getOperand(i));
27716 MIB.setMemRefs(MMOBegin, MMOEnd);
27718 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
27720 MI.eraseFromParent();
27724 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
27725 MachineBasicBlock *MBB,
27726 MachineBasicBlock *DispatchBB,
27727 int FI) const {
27728 DebugLoc DL = MI.getDebugLoc();
27729 MachineFunction *MF = MBB->getParent();
27730 MachineRegisterInfo *MRI = &MF->getRegInfo();
27731 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27733 MVT PVT = getPointerTy(MF->getDataLayout());
27734 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
27739 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27740 !isPositionIndependent();
27742 if (UseImmLabel)
27743 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27744 else {
27745 const TargetRegisterClass *TRC =
27746 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27747 VR = MRI->createVirtualRegister(TRC);
27748 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27750 if (Subtarget.is64Bit())
27751 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27755 .addMBB(DispatchBB)
27758 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27759 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27762 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
27766 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27767 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27768 if (UseImmLabel)
27769 MIB.addMBB(DispatchBB);
27770 else
27771 MIB.addReg(VR);
27772 }
27774 MachineBasicBlock *
27775 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27776 MachineBasicBlock *BB) const {
27777 DebugLoc DL = MI.getDebugLoc();
27778 MachineFunction *MF = BB->getParent();
27779 MachineFrameInfo &MFI = MF->getFrameInfo();
27780 MachineRegisterInfo *MRI = &MF->getRegInfo();
27781 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27782 int FI = MFI.getFunctionContextIndex();
27784 // Get a mapping of the call site numbers to all of the landing pads they're
27785 // associated with.
27786 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27787 unsigned MaxCSNum = 0;
27788 for (auto &MBB : *MF) {
27789 if (!MBB.isEHPad())
27790 continue;
27792 MCSymbol *Sym = nullptr;
27793 for (const auto &MI : MBB) {
27794 if (MI.isDebugValue())
27795 continue;
27797 assert(MI.isEHLabel() && "expected EH_LABEL");
27798 Sym = MI.getOperand(0).getMCSymbol();
27799 break;
27800 }
27802 if (!MF->hasCallSiteLandingPad(Sym))
27803 continue;
27805 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27806 CallSiteNumToLPad[CSI].push_back(&MBB);
27807 MaxCSNum = std::max(MaxCSNum, CSI);
27811 // Get an ordered list of the machine basic blocks for the jump table.
27812 std::vector<MachineBasicBlock *> LPadList;
27813 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27814 LPadList.reserve(CallSiteNumToLPad.size());
27816 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27817 for (auto &LP : CallSiteNumToLPad[CSI]) {
27818 LPadList.push_back(LP);
27819 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27823 assert(!LPadList.empty() &&
27824 "No landing pad destinations for the dispatch jump table!");
27826 // Create the MBBs for the dispatch code.
27828 // Shove the dispatch's address into the return slot in the function context.
27829 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27830 DispatchBB->setIsEHPad(true);
27832 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27833 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27834 DispatchBB->addSuccessor(TrapBB);
27836 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27837 DispatchBB->addSuccessor(DispContBB);
27840 MF->push_back(DispatchBB);
27841 MF->push_back(DispContBB);
27842 MF->push_back(TrapBB);
27844 // Insert code into the entry block that creates and registers the function
27846 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27848 // Create the jump table and associated information
27849 unsigned JTE = getJumpTableEncoding();
27850 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27851 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27853 const X86RegisterInfo &RI = TII->getRegisterInfo();
27854 // Add a register mask with no preserved registers. This results in all
27855 // registers being marked as clobbered.
27856 if (RI.hasBasePointer(*MF)) {
27857 const bool FPIs64Bit =
27858 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27859 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27860 MFI->setRestoreBasePointer(MF);
27862 unsigned FP = RI.getFrameRegister(*MF);
27863 unsigned BP = RI.getBaseRegister();
27864 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27865 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27866 MFI->getRestoreBasePointerOffset())
27867 .addRegMask(RI.getNoPreservedMask());
27868 } else {
27869 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27870 .addRegMask(RI.getNoPreservedMask());
27871 }
27873 // IReg is used as an index in a memory operand and therefore can't be SP
27874 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
27875 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27876 Subtarget.is64Bit() ? 8 : 4);
27877 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27878 .addReg(IReg)
27879 .addImm(LPadList.size());
27880 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
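// The dispatch block therefore implements, roughly:
//   if (call_site >= LPadList.size()) trap();
//   else goto *jump_table[call_site];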
27882 if (Subtarget.is64Bit()) {
27883 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27884 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27886 // leaq .LJTI0_0(%rip), BReg
27887 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27891 .addJumpTableIndex(MJTI)
27893 // movzx IReg64, IReg
27894 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27897 .addImm(X86::sub_32bit);
27899 switch (JTE) {
27900 case MachineJumpTableInfo::EK_BlockAddress:
27901 // jmpq *(BReg,IReg64,8)
27902 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
27909 case MachineJumpTableInfo::EK_LabelDifference32: {
27910 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27911 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27912 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27914 // movl (BReg,IReg64,4), OReg
27915 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27921 // movsx OReg64, OReg
27922 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27923 // addq BReg, OReg64, TReg
27924 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27928 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27929 break;
27930 }
27931 default:
27932 llvm_unreachable("Unexpected jump table encoding");
27933 }
27934 } else {
27935 // jmpl *.LJTI0_0(,IReg,4)
27936 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27940 .addJumpTableIndex(MJTI)
27944 // Add the jump table entries as successors to the MBB.
27945 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27946 for (auto &LP : LPadList)
27947 if (SeenMBBs.insert(LP).second)
27948 DispContBB->addSuccessor(LP);
27950 // N.B. the order the invoke BBs are processed in doesn't matter here.
27951 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27952 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27953 for (MachineBasicBlock *MBB : InvokeBBs) {
27954 // Remove the landing pad successor from the invoke block and replace it
27955 // with the new dispatch block.
27956 // Keep a copy of Successors since it's modified inside the loop.
27957 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27959 // FIXME: Avoid quadratic complexity.
27960 for (auto MBBS : Successors) {
27961 if (MBBS->isEHPad()) {
27962 MBB->removeSuccessor(MBBS);
27963 MBBLPads.push_back(MBBS);
27967 MBB->addSuccessor(DispatchBB);
27969 // Find the invoke call and mark all of the callee-saved registers as
27970 // 'implicit defined' so that they're spilled. This prevents code from
27971 // moving instructions to before the EH block, where they will never be
27972 // executed.
27973 for (auto &II : reverse(*MBB)) {
27974 if (!II.isCall())
27975 continue;
27977 DenseMap<unsigned, bool> DefRegs;
27978 for (auto &MOp : II.operands())
27979 if (MOp.isReg())
27980 DefRegs[MOp.getReg()] = true;
27982 MachineInstrBuilder MIB(*MF, &II);
27983 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27984 unsigned Reg = SavedRegs[RI];
27985 if (!DefRegs[Reg])
27986 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27993 // Mark all former landing pads as non-landing pads. The dispatch is the only
27994 // landing pad now.
27995 for (auto &LP : MBBLPads)
27996 LP->setIsEHPad(false);
27998 // The instruction is gone now.
27999 MI.eraseFromParent();
28003 MachineBasicBlock *
28004 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
28005 MachineBasicBlock *BB) const {
28006 MachineFunction *MF = BB->getParent();
28007 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
28008 DebugLoc DL = MI.getDebugLoc();
28010 switch (MI.getOpcode()) {
28011 default: llvm_unreachable("Unexpected instr type to insert");
28012 case X86::TLS_addr32:
28013 case X86::TLS_addr64:
28014 case X86::TLS_base_addr32:
28015 case X86::TLS_base_addr64:
28016 return EmitLoweredTLSAddr(MI, BB);
28017 case X86::RETPOLINE_CALL32:
28018 case X86::RETPOLINE_CALL64:
28019 case X86::RETPOLINE_TCRETURN32:
28020 case X86::RETPOLINE_TCRETURN64:
28021 return EmitLoweredRetpoline(MI, BB);
28022 case X86::CATCHRET:
28023 return EmitLoweredCatchRet(MI, BB);
28024 case X86::CATCHPAD:
28025 return EmitLoweredCatchPad(MI, BB);
28026 case X86::SEG_ALLOCA_32:
28027 case X86::SEG_ALLOCA_64:
28028 return EmitLoweredSegAlloca(MI, BB);
28029 case X86::TLSCall_32:
28030 case X86::TLSCall_64:
28031 return EmitLoweredTLSCall(MI, BB);
28032 case X86::CMOV_FR32:
28033 case X86::CMOV_FR64:
28034 case X86::CMOV_FR128:
28035 case X86::CMOV_GR8:
28036 case X86::CMOV_GR16:
28037 case X86::CMOV_GR32:
28038 case X86::CMOV_RFP32:
28039 case X86::CMOV_RFP64:
28040 case X86::CMOV_RFP80:
28041 case X86::CMOV_V2F64:
28042 case X86::CMOV_V2I64:
28043 case X86::CMOV_V4F32:
28044 case X86::CMOV_V4F64:
28045 case X86::CMOV_V4I64:
28046 case X86::CMOV_V16F32:
28047 case X86::CMOV_V8F32:
28048 case X86::CMOV_V8F64:
28049 case X86::CMOV_V8I64:
28050 case X86::CMOV_V8I1:
28051 case X86::CMOV_V16I1:
28052 case X86::CMOV_V32I1:
28053 case X86::CMOV_V64I1:
28054 return EmitLoweredSelect(MI, BB);
28056 case X86::RDFLAGS32:
28057 case X86::RDFLAGS64: {
28058 unsigned PushF =
28059 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
28060 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
28061 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
28062 // Permit reads of the FLAGS register without it being defined.
28063 // This intrinsic exists to read external processor state in flags, such as
28064 // the trap flag, interrupt flag, and direction flag, none of which are
28065 // modeled by the backend.
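// The expansion is simply "pushf ; pop %dst" (pushfq/popq on 64-bit).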
28066 Push->getOperand(2).setIsUndef();
28067 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
28069 MI.eraseFromParent(); // The pseudo is gone now.
28073 case X86::WRFLAGS32:
28074 case X86::WRFLAGS64: {
28075 unsigned Push =
28076 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
28077 unsigned PopF =
28078 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
28079 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
28080 BuildMI(*BB, MI, DL, TII->get(PopF));
28082 MI.eraseFromParent(); // The pseudo is gone now.
28086 case X86::RELEASE_FADD32mr:
28087 case X86::RELEASE_FADD64mr:
28088 return EmitLoweredAtomicFP(MI, BB);
28090 case X86::FP32_TO_INT16_IN_MEM:
28091 case X86::FP32_TO_INT32_IN_MEM:
28092 case X86::FP32_TO_INT64_IN_MEM:
28093 case X86::FP64_TO_INT16_IN_MEM:
28094 case X86::FP64_TO_INT32_IN_MEM:
28095 case X86::FP64_TO_INT64_IN_MEM:
28096 case X86::FP80_TO_INT16_IN_MEM:
28097 case X86::FP80_TO_INT32_IN_MEM:
28098 case X86::FP80_TO_INT64_IN_MEM: {
28099 // Change the floating point control register to use "round towards zero"
28100 // mode when truncating to an integer value.
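// Roughly the emitted sequence is:
//   FNSTCW  <slot>            ; save the current FP control word
//   MOV16rm OldCW, <slot>     ; remember it in a vreg
//   MOV16mi <slot>, <rz-bits> ; select "round towards zero"
//   FLDCW   <slot>            ; activate the new control word
//   MOV16mr <slot>, OldCW     ; restore the memory image
//   IST_Fp* <addr>            ; the truncating store itself
//   FLDCW   <slot>            ; reload the original control word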
28101 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
28102 addFrameReference(BuildMI(*BB, MI, DL,
28103 TII->get(X86::FNSTCW16m)), CWFrameIdx);
28105 // Load the old value of the high byte of the control word...
28106 unsigned OldCW =
28107 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
28108 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
28109 CWFrameIdx);
28111 // Set the high part to be round to zero...
28112 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
28113 .addImm(0xC7F);
28115 // Reload the modified control word now...
28116 addFrameReference(BuildMI(*BB, MI, DL,
28117 TII->get(X86::FLDCW16m)), CWFrameIdx);
28119 // Restore the memory image of control word to original value
28120 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
28121 .addReg(OldCW);
28123 // Get the X86 opcode to use.
28124 unsigned Opc;
28125 switch (MI.getOpcode()) {
28126 default: llvm_unreachable("illegal opcode!");
28127 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
28128 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
28129 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
28130 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
28131 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
28132 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
28133 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
28134 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
28135 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
28138 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28139 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
28140 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
28142 // Reload the original control word now.
28143 addFrameReference(BuildMI(*BB, MI, DL,
28144 TII->get(X86::FLDCW16m)), CWFrameIdx);
28146 MI.eraseFromParent(); // The pseudo instruction is gone now.
28149 // String/text processing lowering.
28150 case X86::PCMPISTRM128REG:
28151 case X86::VPCMPISTRM128REG:
28152 case X86::PCMPISTRM128MEM:
28153 case X86::VPCMPISTRM128MEM:
28154 case X86::PCMPESTRM128REG:
28155 case X86::VPCMPESTRM128REG:
28156 case X86::PCMPESTRM128MEM:
28157 case X86::VPCMPESTRM128MEM:
28158 assert(Subtarget.hasSSE42() &&
28159 "Target must have SSE4.2 or AVX features enabled");
28160 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
28162 // String/text processing lowering.
28163 case X86::PCMPISTRIREG:
28164 case X86::VPCMPISTRIREG:
28165 case X86::PCMPISTRIMEM:
28166 case X86::VPCMPISTRIMEM:
28167 case X86::PCMPESTRIREG:
28168 case X86::VPCMPESTRIREG:
28169 case X86::PCMPESTRIMEM:
28170 case X86::VPCMPESTRIMEM:
28171 assert(Subtarget.hasSSE42() &&
28172 "Target must have SSE4.2 or AVX features enabled");
28173 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
28175 // Thread synchronization.
28176 case X86::MONITOR:
28177 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
28178 case X86::MONITORX:
28179 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
28182 case X86::CLZERO:
28183 return emitClzero(&MI, BB, Subtarget);
28186 case X86::WRPKRU:
28187 return emitWRPKRU(MI, BB, Subtarget);
28188 case X86::RDPKRU:
28189 return emitRDPKRU(MI, BB, Subtarget);
28191 case X86::XBEGIN:
28192 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
28194 case X86::VASTART_SAVE_XMM_REGS:
28195 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
28197 case X86::VAARG_64:
28198 return EmitVAARG64WithCustomInserter(MI, BB);
28200 case X86::EH_SjLj_SetJmp32:
28201 case X86::EH_SjLj_SetJmp64:
28202 return emitEHSjLjSetJmp(MI, BB);
28204 case X86::EH_SjLj_LongJmp32:
28205 case X86::EH_SjLj_LongJmp64:
28206 return emitEHSjLjLongJmp(MI, BB);
28208 case X86::Int_eh_sjlj_setup_dispatch:
28209 return EmitSjLjDispatchBlock(MI, BB);
28211 case TargetOpcode::STATEPOINT:
28212 // As an implementation detail, STATEPOINT shares the STACKMAP format at
28213 // this point in the process. We diverge later.
28214 return emitPatchPoint(MI, BB);
28216 case TargetOpcode::STACKMAP:
28217 case TargetOpcode::PATCHPOINT:
28218 return emitPatchPoint(MI, BB);
28220 case TargetOpcode::PATCHABLE_EVENT_CALL:
28221 return emitXRayCustomEvent(MI, BB);
28223 case X86::LCMPXCHG8B: {
28224 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
28225 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
28226 // requires a memory operand. If the current architecture is i686 and the
28227 // current function needs a base pointer (which is ESI on i686), the
28228 // register allocator would not be able to allocate registers for an
28229 // address of the form X(%reg, %reg, Y): there would never be enough
28230 // unreserved registers during regalloc (without the base pointer the only
28231 // option would be X(%edi, %esi, Y)).
28232 // We give the register allocator a hand by precomputing the address in
28233 // a new vreg using LEA.
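// E.g. an operand such as 12(%esi,%ecx,4) is rewritten as
//   leal 12(%esi,%ecx,4), %vreg
//   cmpxchg8b (%vreg)
// so the CMPXCHG8B memory operand no longer needs an index register
// (illustrative).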
28235 // If it is not i686 or there is no base pointer - nothing to do here.
28236 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
28237 return BB;
28239 // Even though this code does not necessarily need the base pointer to
28240 // be ESI, we check for that. The reason: if this assert fails, something
28241 // has changed in the compiler's base pointer handling, and that change
28242 // most probably has to be addressed here as well.
28243 assert(TRI->getBaseRegister() == X86::ESI &&
28244 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
28245 "base pointer in mind");
28247 MachineRegisterInfo &MRI = MF->getRegInfo();
28248 MVT SPTy = getPointerTy(MF->getDataLayout());
28249 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
28250 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
28252 X86AddressMode AM = getAddressFromInstr(&MI, 0);
28253 // Regalloc does not need any help when the memory operand of CMPXCHG8B
28254 // does not use index register.
28255 if (AM.IndexReg == X86::NoRegister)
28256 return BB;
28258 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
28259 // four operand definitions that are E[ABCD] registers. We skip them and
28260 // then insert the LEA.
28261 MachineBasicBlock::iterator MBBI(MI);
28262 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
28263 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
28264 --MBBI;
28265 addFullAddress(
28266 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
28268 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
28272 case X86::LCMPXCHG16B:
28273 return BB;
28274 case X86::LCMPXCHG8B_SAVE_EBX:
28275 case X86::LCMPXCHG16B_SAVE_RBX: {
28276 unsigned BasePtr =
28277 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
28278 if (!BB->isLiveIn(BasePtr))
28279 BB->addLiveIn(BasePtr);
28285 //===----------------------------------------------------------------------===//
28286 // X86 Optimization Hooks
28287 //===----------------------------------------------------------------------===//
28289 bool
28290 X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
28291 const APInt &Demanded,
28292 TargetLoweringOpt &TLO) const {
28293 // Only optimize Ands to prevent shrinking a constant that could be
28294 // matched by movzx.
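// For example, with only the low 8 bits demanded, (x & 0xFFF) is rewritten
// as (x & 0xFF), which can later be selected as a movzbl instead of an
// and-with-immediate (a hypothetical illustration of the rule below).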
28295 if (Op.getOpcode() != ISD::AND)
28296 return false;
28298 EVT VT = Op.getValueType();
28301 if (VT.isVector())
28302 return false;
28304 unsigned Size = VT.getSizeInBits();
28306 // Make sure the RHS really is a constant.
28307 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
28308 if (!C)
28309 return false;
28311 const APInt &Mask = C->getAPIntValue();
28313 // Clear all non-demanded bits initially.
28314 APInt ShrunkMask = Mask & Demanded;
28316 // Find the width of the shrunk mask.
28317 unsigned Width = ShrunkMask.getActiveBits();
28319 // If the mask is all 0s there's nothing to do here.
28320 if (Width == 0)
28321 return false;
28323 // Find the next power of 2 width, rounding up to a byte.
28324 Width = PowerOf2Ceil(std::max(Width, 8U));
28325 // Truncate the width to size to handle illegal types.
28326 Width = std::min(Width, Size);
28328 // Calculate a possible zero extend mask for this constant.
28329 APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
28331 // If we aren't changing the mask, just return true to keep it and prevent
28332 // the caller from optimizing.
28333 if (ZeroExtendMask == Mask)
28334 return true;
28336 // Make sure the new mask can be represented by a combination of mask bits
28337 // and non-demanded bits.
28338 if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
28339 return false;
28341 // Replace the constant with the zero extend mask.
28342 SDLoc DL(Op);
28343 SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT);
28344 SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC);
28345 return TLO.CombineTo(Op, NewOp);
28346 }
28348 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
28349 KnownBits &Known,
28350 const APInt &DemandedElts,
28351 const SelectionDAG &DAG,
28352 unsigned Depth) const {
28353 unsigned BitWidth = Known.getBitWidth();
28354 unsigned Opc = Op.getOpcode();
28355 EVT VT = Op.getValueType();
28356 assert((Opc >= ISD::BUILTIN_OP_END ||
28357 Opc == ISD::INTRINSIC_WO_CHAIN ||
28358 Opc == ISD::INTRINSIC_W_CHAIN ||
28359 Opc == ISD::INTRINSIC_VOID) &&
28360 "Should use MaskedValueIsZero if you don't know whether Op"
28361 " is a target node!");
28363 Known.resetAll();
28364 switch (Opc) {
28365 default: break;
28366 case X86ISD::SETCC:
28367 Known.Zero.setBitsFrom(1);
28369 case X86ISD::MOVMSK: {
28370 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
28371 Known.Zero.setBitsFrom(NumLoBits);
28374 case X86ISD::PEXTRB:
28375 case X86ISD::PEXTRW: {
28376 SDValue Src = Op.getOperand(0);
28377 EVT SrcVT = Src.getValueType();
28378 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
28379 Op.getConstantOperandVal(1));
28380 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
28381 Known = Known.zextOrTrunc(BitWidth);
28382 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
28385 case X86ISD::VSHLI:
28386 case X86ISD::VSRLI: {
28387 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
28388 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
28389 Known.setAllZero();
28393 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
28394 unsigned ShAmt = ShiftImm->getZExtValue();
28395 if (Opc == X86ISD::VSHLI) {
28396 Known.Zero <<= ShAmt;
28397 Known.One <<= ShAmt;
28398 // Low bits are known zero.
28399 Known.Zero.setLowBits(ShAmt);
28400 } else {
28401 Known.Zero.lshrInPlace(ShAmt);
28402 Known.One.lshrInPlace(ShAmt);
28403 // High bits are known zero.
28404 Known.Zero.setHighBits(ShAmt);
28409 case X86ISD::VZEXT: {
28410 // TODO: Add DemandedElts support.
28411 SDValue N0 = Op.getOperand(0);
28412 unsigned NumElts = VT.getVectorNumElements();
28414 EVT SrcVT = N0.getValueType();
28415 unsigned InNumElts = SrcVT.getVectorNumElements();
28416 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
28417 assert(InNumElts >= NumElts && "Illegal VZEXT input");
28419 Known = KnownBits(InBitWidth);
28420 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
28421 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
28422 Known = Known.zext(BitWidth);
28423 Known.Zero.setBitsFrom(InBitWidth);
28426 case X86ISD::CMOV: {
28427 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
28428 // If we don't know any bits, early out.
28429 if (Known.isUnknown())
28430 break;
28431 KnownBits Known2;
28432 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
28434 // Only known if known in both the LHS and RHS.
28435 Known.One &= Known2.One;
28436 Known.Zero &= Known2.Zero;
28439 case X86ISD::UDIVREM8_ZEXT_HREG:
28440 // TODO: Support more than just the zero extended bits?
28441 if (Op.getResNo() != 1)
28443 // The remainder is zero extended.
28444 Known.Zero.setBitsFrom(8);
28449 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
28450 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
28451 unsigned Depth) const {
28452 unsigned VTBits = Op.getScalarValueSizeInBits();
28453 unsigned Opcode = Op.getOpcode();
28454 switch (Opcode) {
28455 case X86ISD::SETCC_CARRY:
28456 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
28457 return VTBits;
28459 case X86ISD::VSEXT: {
28460 // TODO: Add DemandedElts support.
28461 SDValue Src = Op.getOperand(0);
28462 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28463 Tmp += VTBits - Src.getScalarValueSizeInBits();
28464 return Tmp;
28465 }
28467 case X86ISD::VTRUNC: {
28468 // TODO: Add DemandedElts support.
28469 SDValue Src = Op.getOperand(0);
28470 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
28471 assert(VTBits < NumSrcBits && "Illegal truncation input type");
28472 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
28473 if (Tmp > (NumSrcBits - VTBits))
28474 return Tmp - (NumSrcBits - VTBits);
28478 case X86ISD::PACKSS: {
28479 // PACKSS is just a truncation if the sign bits extend to the packed size.
28480 // TODO: Add DemandedElts support.
28481 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
28482 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
28483 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
28484 unsigned Tmp = std::min(Tmp0, Tmp1);
28485 if (Tmp > (SrcBits - VTBits))
28486 return Tmp - (SrcBits - VTBits);
28490 case X86ISD::VSHLI: {
28491 SDValue Src = Op.getOperand(0);
28492 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28493 if (ShiftVal.uge(VTBits))
28494 return VTBits; // Shifted all bits out --> zero.
28495 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28496 if (ShiftVal.uge(Tmp))
28497 return 1; // Shifted all sign bits out --> unknown.
28498 return Tmp - ShiftVal.getZExtValue();
28501 case X86ISD::VSRAI: {
28502 SDValue Src = Op.getOperand(0);
28503 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
28504 if (ShiftVal.uge(VTBits - 1))
28505 return VTBits; // Sign splat.
28506 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
28507 ShiftVal += Tmp;
28508 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
28509 }
28511 case X86ISD::PCMPGT:
28512 case X86ISD::PCMPEQ:
28514 case X86ISD::VPCOM:
28515 case X86ISD::VPCOMU:
28516 // Vector compares return zero/all-bits result values.
28517 return VTBits;
28519 case X86ISD::CMOV: {
28520 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
28521 if (Tmp0 == 1) return 1; // Early out.
28522 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
28523 return std::min(Tmp0, Tmp1);
28525 case X86ISD::SDIVREM8_SEXT_HREG:
28526 // TODO: Support more than just the sign extended bits?
28527 if (Op.getResNo() != 1)
28528 break;
28529 // The remainder is sign extended.
28530 return 9;
28537 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
28538 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
28539 return N->getOperand(0);
28540 return N;
28541 }
28543 /// Returns true (and the GlobalValue and the offset) if the node is a
28544 /// GlobalAddress + offset.
28545 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
28546 const GlobalValue* &GA,
28547 int64_t &Offset) const {
28548 if (N->getOpcode() == X86ISD::Wrapper) {
28549 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
28550 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
28551 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
28552 return true;
28553 }
28554 }
28555 return TargetLowering::isGAPlusOffset(N, GA, Offset);
28556 }
28558 // Attempt to match a combined shuffle mask against supported unary shuffle
28559 // instructions.
28560 // TODO: Investigate sharing more of this with shuffle lowering.
28561 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28562 bool AllowFloatDomain, bool AllowIntDomain,
28563 SDValue &V1, const SDLoc &DL,
28564 SelectionDAG &DAG,
28565 const X86Subtarget &Subtarget,
28566 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
28567 unsigned NumMaskElts = Mask.size();
28568 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
28570 // Match against a VZEXT_MOVL vXi32 zero-extending instruction.
28571 if (MaskEltSize == 32 && isUndefOrEqual(Mask[0], 0) &&
28572 isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) {
28573 Shuffle = X86ISD::VZEXT_MOVL;
28574 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28578 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
28579 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
28580 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
28581 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
28582 unsigned MaxScale = 64 / MaskEltSize;
28583 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
28584 bool Match = true;
28585 unsigned NumDstElts = NumMaskElts / Scale;
28586 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
28587 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
28588 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
28589 }
28590 if (Match) {
28591 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
28592 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
28593 MVT::getIntegerVT(MaskEltSize);
28594 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
28596 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
28597 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
28598 Shuffle = unsigned(X86ISD::VZEXT);
28599 } else
28600 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
28602 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
28603 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
28609 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
28610 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
28611 isUndefOrEqual(Mask[0], 0) &&
28612 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
28613 Shuffle = X86ISD::VZEXT_MOVL;
28614 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
28618 // Check if we have SSE3 which will let us use MOVDDUP etc. The
28619 // instructions are no slower than UNPCKLPD but has the option to
28620 // fold the input operand into even an unaligned memory load.
28621 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
28622 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
28623 Shuffle = X86ISD::MOVDDUP;
28624 SrcVT = DstVT = MVT::v2f64;
28627 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28628 Shuffle = X86ISD::MOVSLDUP;
28629 SrcVT = DstVT = MVT::v4f32;
28632 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
28633 Shuffle = X86ISD::MOVSHDUP;
28634 SrcVT = DstVT = MVT::v4f32;
28639 if (MaskVT.is256BitVector() && AllowFloatDomain) {
28640 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
28641 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
28642 Shuffle = X86ISD::MOVDDUP;
28643 SrcVT = DstVT = MVT::v4f64;
28646 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28647 Shuffle = X86ISD::MOVSLDUP;
28648 SrcVT = DstVT = MVT::v8f32;
28651 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
28652 Shuffle = X86ISD::MOVSHDUP;
28653 SrcVT = DstVT = MVT::v8f32;
28658 if (MaskVT.is512BitVector() && AllowFloatDomain) {
28659 assert(Subtarget.hasAVX512() &&
28660 "AVX512 required for 512-bit vector shuffles");
28661 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
28662 Shuffle = X86ISD::MOVDDUP;
28663 SrcVT = DstVT = MVT::v8f64;
28666 if (isTargetShuffleEquivalent(
28667 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
28668 Shuffle = X86ISD::MOVSLDUP;
28669 SrcVT = DstVT = MVT::v16f32;
28672 if (isTargetShuffleEquivalent(
28673 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
28674 Shuffle = X86ISD::MOVSHDUP;
28675 SrcVT = DstVT = MVT::v16f32;
28680 // Attempt to match against broadcast-from-vector.
28681 if (Subtarget.hasAVX2()) {
28682 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
28683 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
28684 SrcVT = DstVT = MaskVT;
28685 Shuffle = X86ISD::VBROADCAST;
28693 // Attempt to match a combined shuffle mask against supported unary immediate
28694 // permute instructions.
28695 // TODO: Investigate sharing more of this with shuffle lowering.
28696 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28697 const APInt &Zeroable,
28698 bool AllowFloatDomain,
28699 bool AllowIntDomain,
28700 const X86Subtarget &Subtarget,
28701 unsigned &Shuffle, MVT &ShuffleVT,
28702 unsigned &PermuteImm) {
28703 unsigned NumMaskElts = Mask.size();
28704 unsigned InputSizeInBits = MaskVT.getSizeInBits();
28705 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
28706 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
28708 bool ContainsZeros =
28709 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28711 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
28712 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
28713 // Check for lane crossing permutes.
28714 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
28715 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
28716 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
28717 Shuffle = X86ISD::VPERMI;
28718 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
28719 PermuteImm = getV4X86ShuffleImm(Mask);
28722 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
28723 SmallVector<int, 4> RepeatedMask;
28724 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
28725 Shuffle = X86ISD::VPERMI;
28726 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
28727 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
28731 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
28732 // VPERMILPD can permute with a non-repeating shuffle.
28733 Shuffle = X86ISD::VPERMILPI;
28734 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
28735 PermuteImm = 0;
28736 for (int i = 0, e = Mask.size(); i != e; ++i) {
28737 int M = Mask[i];
28738 if (M == SM_SentinelUndef)
28739 continue;
28740 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
28741 PermuteImm |= (M & 1) << i;
28747 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
28748 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
28749 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
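// E.g. a repeated v4f32 mask {2,3,0,1} becomes a single PSHUFD/VPERMILPS
// with immediate 0x4E (illustrative).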
28750 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
28751 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
28752 SmallVector<int, 4> RepeatedMask;
28753 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28754 // Narrow the repeated mask to create 32-bit element permutes.
28755 SmallVector<int, 4> WordMask = RepeatedMask;
28756 if (MaskScalarSizeInBits == 64)
28757 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
28759 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
28760 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
28761 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
28762 PermuteImm = getV4X86ShuffleImm(WordMask);
28767 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
28768 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
28769 SmallVector<int, 4> RepeatedMask;
28770 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
28771 ArrayRef<int> LoMask(Mask.data() + 0, 4);
28772 ArrayRef<int> HiMask(Mask.data() + 4, 4);
28774 // PSHUFLW: permute lower 4 elements only.
28775 if (isUndefOrInRange(LoMask, 0, 4) &&
28776 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
28777 Shuffle = X86ISD::PSHUFLW;
28778 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28779 PermuteImm = getV4X86ShuffleImm(LoMask);
28783 // PSHUFHW: permute upper 4 elements only.
28784 if (isUndefOrInRange(HiMask, 4, 8) &&
28785 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
28786 // Offset the HiMask so that we can create the shuffle immediate.
28787 int OffsetHiMask[4];
28788 for (int i = 0; i != 4; ++i)
28789 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
28791 Shuffle = X86ISD::PSHUFHW;
28792 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
28793 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
28799 // Attempt to match against byte/bit shifts.
28800 // FIXME: Add 512-bit support.
28801 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28802 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28803 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28804 MaskScalarSizeInBits, Mask,
28805 0, Zeroable, Subtarget);
28806 if (0 < ShiftAmt) {
28807 PermuteImm = (unsigned)ShiftAmt;
28815 // Attempt to match a combined unary shuffle mask against supported binary
28816 // shuffle instructions.
28817 // TODO: Investigate sharing more of this with shuffle lowering.
28818 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28819 bool AllowFloatDomain, bool AllowIntDomain,
28820 SDValue &V1, SDValue &V2, const SDLoc &DL,
28821 SelectionDAG &DAG,
28822 const X86Subtarget &Subtarget,
28823 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28824 bool &IsUnary) {
28825 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28827 if (MaskVT.is128BitVector()) {
28828 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28830 V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1);
28831 Shuffle = X86ISD::MOVLHPS;
28832 SrcVT = DstVT = MVT::v4f32;
28835 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28837 Shuffle = X86ISD::MOVHLPS;
28838 SrcVT = DstVT = MVT::v4f32;
28841 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28842 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28844 Shuffle = X86ISD::MOVSD;
28845 SrcVT = DstVT = MaskVT;
28848 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28849 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28850 Shuffle = X86ISD::MOVSS;
28851 SrcVT = DstVT = MaskVT;
28856 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28857 // TODO add support for 256/512-bit types.
28858 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28859 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28866 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28867 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28868 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28869 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28870 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28871 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28872 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28874 SrcVT = DstVT = MaskVT;
28875 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28876 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
28884 static bool matchBinaryPermuteVectorShuffle(
28885 MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable,
28886 bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2,
28887 const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget,
28888 unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) {
28889 unsigned NumMaskElts = Mask.size();
28890 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28892 // Attempt to match against PALIGNR byte rotate.
28893 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28894 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28895 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28896 if (0 < ByteRotation) {
28897 Shuffle = X86ISD::PALIGNR;
28898 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28899 PermuteImm = ByteRotation;
28904 // Attempt to combine to X86ISD::BLENDI.
28905 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28906 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28907 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28908 uint64_t BlendMask = 0;
28909 bool ForceV1Zero = false, ForceV2Zero = false;
28910 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28911 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28913 if (MaskVT == MVT::v16i16) {
28914 // We can only use v16i16 PBLENDW if the lanes are repeated.
28915 SmallVector<int, 8> RepeatedMask;
28916 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28918 assert(RepeatedMask.size() == 8 &&
28919 "Repeated mask size doesn't match!");
28921 for (int i = 0; i < 8; ++i)
28922 if (RepeatedMask[i] >= 8)
28923 PermuteImm |= 1 << i;
28924 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28925 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28926 Shuffle = X86ISD::BLENDI;
28927 ShuffleVT = MaskVT;
28931 // Determine a type compatible with X86ISD::BLENDI.
28932 ShuffleVT = MaskVT;
28933 if (Subtarget.hasAVX2()) {
28934 if (ShuffleVT == MVT::v4i64)
28935 ShuffleVT = MVT::v8i32;
28936 else if (ShuffleVT == MVT::v2i64)
28937 ShuffleVT = MVT::v4i32;
28938 } else {
28939 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28940 ShuffleVT = MVT::v8i16;
28941 else if (ShuffleVT == MVT::v4i64)
28942 ShuffleVT = MVT::v4f64;
28943 else if (ShuffleVT == MVT::v8i32)
28944 ShuffleVT = MVT::v8f32;
28947 if (!ShuffleVT.isFloatingPoint()) {
28948 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28950 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28951 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28952 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28955 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28956 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28957 PermuteImm = (unsigned)BlendMask;
28958 Shuffle = X86ISD::BLENDI;
28964 // Attempt to combine to INSERTPS.
28965 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28966 MaskVT.is128BitVector()) {
28967 if (Zeroable.getBoolValue() &&
28968 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28969 Shuffle = X86ISD::INSERTPS;
28970 ShuffleVT = MVT::v4f32;
28975 // Attempt to combine to SHUFPD.
28976 if (AllowFloatDomain && EltSizeInBits == 64 &&
28977 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28978 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28979 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28980 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28981 Shuffle = X86ISD::SHUFP;
28982 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28987 // Attempt to combine to SHUFPS.
28988 if (AllowFloatDomain && EltSizeInBits == 32 &&
28989 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28990 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28991 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28992 SmallVector<int, 4> RepeatedMask;
28993 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28994       // Match each half of the repeated mask to determine whether it just
28995       // references one of the vectors, is zeroable, or is entirely undef.
28996 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28997 int M0 = RepeatedMask[Offset];
28998 int M1 = RepeatedMask[Offset + 1];
29000 if (isUndefInRange(RepeatedMask, Offset, 2)) {
29001 return DAG.getUNDEF(MaskVT);
29002 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
29003 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
29004 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
29005 return getZeroVector(MaskVT, Subtarget, DAG, DL);
29006 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
29007 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29008 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29010 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
29011 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
29012 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
29019 int ShufMask[4] = {-1, -1, -1, -1};
29020 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
29021 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
29026 Shuffle = X86ISD::SHUFP;
29027 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
29028 PermuteImm = getV4X86ShuffleImm(ShufMask);
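      // For example (illustrative): ShufMask <1,0,2,3> packs into the 8-bit
      // SHUFPS immediate as 1 | (0 << 2) | (2 << 4) | (3 << 6) = 0xE1.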
29037 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
29038 /// possible.
29040 /// This is the leaf of the recursive combine below. When we have found some
29041 /// chain of single-use x86 shuffle instructions and accumulated the combined
29042 /// shuffle mask represented by them, this will try to pattern match that mask
29043 /// into either a single instruction if there is a special purpose instruction
29044 /// for this operation, or into a PSHUFB instruction which is a fully general
29045 /// instruction but should only be used to replace chains over a certain depth.
29046 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
29047 ArrayRef<int> BaseMask, int Depth,
29048 bool HasVariableMask, SelectionDAG &DAG,
29049 TargetLowering::DAGCombinerInfo &DCI,
29050 const X86Subtarget &Subtarget) {
29051 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
29052 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
29053 "Unexpected number of shuffle inputs!");
29055 // Find the inputs that enter the chain. Note that multiple uses are OK
29056 // here; we're not going to remove the operands we find.
29057 bool UnaryShuffle = (Inputs.size() == 1);
29058 SDValue V1 = peekThroughBitcasts(Inputs[0]);
29059 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
29060 : peekThroughBitcasts(Inputs[1]));
29062 MVT VT1 = V1.getSimpleValueType();
29063 MVT VT2 = V2.getSimpleValueType();
29064 MVT RootVT = Root.getSimpleValueType();
29065 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
29066 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
29067 "Vector size mismatch");
29072 unsigned NumBaseMaskElts = BaseMask.size();
29073 if (NumBaseMaskElts == 1) {
29074 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
29075 return DAG.getBitcast(RootVT, V1);
29078 unsigned RootSizeInBits = RootVT.getSizeInBits();
29079 unsigned NumRootElts = RootVT.getVectorNumElements();
29080 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
29081 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
29082 (RootVT.isFloatingPoint() && Depth >= 2) ||
29083 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
29085 // Don't combine if we are an AVX512/EVEX target and the mask element size
29086 // is different from the root element size - this would prevent writemasks
29087 // from being reused.
29088 // TODO - this currently prevents all lane shuffles from occurring.
29089 // TODO - check for writemasks usage instead of always preventing combining.
29090 // TODO - attempt to narrow Mask back to writemask size.
29091 bool IsEVEXShuffle =
29092 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
29094 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
29096 // Handle 128-bit lane shuffles of 256-bit vectors.
29097 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
29098 // we need to use the zeroing feature.
29099 // TODO - this should support binary shuffles.
29100 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
29101 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
29102 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
29103 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
29104 return SDValue(); // Nothing to do!
29105 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
29106 unsigned PermMask = 0;
29107 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
29108 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
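    // For example (illustrative): BaseMask <1,0> gives PermMask = 0x01, which
    // swaps the two 128-bit halves of V1; a negative (zeroable) index instead
    // sets the zeroing bit (bit 3 or bit 7) for that half.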
29110 Res = DAG.getBitcast(ShuffleVT, V1);
29111 DCI.AddToWorklist(Res.getNode());
29112 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
29113 DAG.getUNDEF(ShuffleVT),
29114 DAG.getConstant(PermMask, DL, MVT::i8));
29115 DCI.AddToWorklist(Res.getNode());
29116 return DAG.getBitcast(RootVT, Res);
29119 // For masks that have been widened to 128-bit elements or more,
29120 // narrow back down to 64-bit elements.
29121 SmallVector<int, 64> Mask;
29122 if (BaseMaskEltSizeInBits > 64) {
29123 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
29124 int MaskScale = BaseMaskEltSizeInBits / 64;
29125 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
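    // For example (illustrative): the widened 128-bit element BaseMask <1,0>
    // of a 256-bit root is rescaled here to the 64-bit element mask <2,3,0,1>.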
29127 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
29130 unsigned NumMaskElts = Mask.size();
29131 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
29133 // Determine the effective mask value type.
29134 FloatDomain &= (32 <= MaskEltSizeInBits);
29135 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
29136 : MVT::getIntegerVT(MaskEltSizeInBits);
29137 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
29139 // Only allow legal mask types.
29140 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
29143 // Attempt to match the mask against known shuffle patterns.
29144 MVT ShuffleSrcVT, ShuffleVT;
29145 unsigned Shuffle, PermuteImm;
29147 // Which shuffle domains are permitted?
29148 // Permit domain crossing at higher combine depths.
29149 bool AllowFloatDomain = FloatDomain || (Depth > 3);
29150 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
29151 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
29153 // Determine zeroable mask elements.
29154 APInt Zeroable(NumMaskElts, 0);
29155 for (unsigned i = 0; i != NumMaskElts; ++i)
29156 if (isUndefOrZero(Mask[i]))
29157 Zeroable.setBit(i);
29159 if (UnaryShuffle) {
29160     // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
29161 // directly if we don't shuffle the lower element and we shuffle the upper
29162 // (zero) elements within themselves.
29163 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
29164 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
29165 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
29166 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
29167 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
29168 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
29169 return DAG.getBitcast(RootVT, V1);
29173 SDValue NewV1 = V1; // Save operand in case early exit happens.
29174 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29175 NewV1, DL, DAG, Subtarget, Shuffle,
29176 ShuffleSrcVT, ShuffleVT) &&
29177 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29178 if (Depth == 1 && Root.getOpcode() == Shuffle)
29179 return SDValue(); // Nothing to do!
29180 Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
29181 DCI.AddToWorklist(Res.getNode());
29182 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
29183 DCI.AddToWorklist(Res.getNode());
29184 return DAG.getBitcast(RootVT, Res);
29187 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
29188 AllowIntDomain, Subtarget, Shuffle,
29189 ShuffleVT, PermuteImm) &&
29190 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29191 if (Depth == 1 && Root.getOpcode() == Shuffle)
29192 return SDValue(); // Nothing to do!
29193 Res = DAG.getBitcast(ShuffleVT, V1);
29194 DCI.AddToWorklist(Res.getNode());
29195 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
29196 DAG.getConstant(PermuteImm, DL, MVT::i8));
29197 DCI.AddToWorklist(Res.getNode());
29198 return DAG.getBitcast(RootVT, Res);
29202 SDValue NewV1 = V1; // Save operands in case early exit happens.
29203 SDValue NewV2 = V2;
29204 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
29205 NewV1, NewV2, DL, DAG, Subtarget, Shuffle,
29206 ShuffleSrcVT, ShuffleVT, UnaryShuffle) &&
29207 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29208 if (Depth == 1 && Root.getOpcode() == Shuffle)
29209 return SDValue(); // Nothing to do!
29210 NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
29211 DCI.AddToWorklist(NewV1.getNode());
29212 NewV2 = DAG.getBitcast(ShuffleSrcVT, NewV2);
29213 DCI.AddToWorklist(NewV2.getNode());
29214 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2);
29215 DCI.AddToWorklist(Res.getNode());
29216 return DAG.getBitcast(RootVT, Res);
29219 NewV1 = V1; // Save operands in case early exit happens.
29221 if (matchBinaryPermuteVectorShuffle(
29222 MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
29223 NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
29224 (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
29225 if (Depth == 1 && Root.getOpcode() == Shuffle)
29226 return SDValue(); // Nothing to do!
29227 NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
29228 DCI.AddToWorklist(NewV1.getNode());
29229 NewV2 = DAG.getBitcast(ShuffleVT, NewV2);
29230 DCI.AddToWorklist(NewV2.getNode());
29231 Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2,
29232 DAG.getConstant(PermuteImm, DL, MVT::i8));
29233 DCI.AddToWorklist(Res.getNode());
29234 return DAG.getBitcast(RootVT, Res);
29237 // Typically from here on, we need an integer version of MaskVT.
29238 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
29239 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
29241 // Annoyingly, SSE4A instructions don't map into the above match helpers.
29242 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
29243 uint64_t BitLen, BitIdx;
29244 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
29246 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
29247 return SDValue(); // Nothing to do!
29248 V1 = DAG.getBitcast(IntMaskVT, V1);
29249 DCI.AddToWorklist(V1.getNode());
29250 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
29251 DAG.getConstant(BitLen, DL, MVT::i8),
29252 DAG.getConstant(BitIdx, DL, MVT::i8));
29253 DCI.AddToWorklist(Res.getNode());
29254 return DAG.getBitcast(RootVT, Res);
29257 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
29258 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
29259 return SDValue(); // Nothing to do!
29260 V1 = DAG.getBitcast(IntMaskVT, V1);
29261 DCI.AddToWorklist(V1.getNode());
29262 V2 = DAG.getBitcast(IntMaskVT, V2);
29263 DCI.AddToWorklist(V2.getNode());
29264 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
29265 DAG.getConstant(BitLen, DL, MVT::i8),
29266 DAG.getConstant(BitIdx, DL, MVT::i8));
29267 DCI.AddToWorklist(Res.getNode());
29268 return DAG.getBitcast(RootVT, Res);
29272 // Don't try to re-form single instruction chains under any circumstances now
29273 // that we've done encoding canonicalization for them.
29277 // Depth threshold above which we can efficiently use variable mask shuffles.
29278 int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 2 : 3;
29279 bool AllowVariableMask = (Depth >= VariableShuffleDepth) || HasVariableMask;
29281 bool MaskContainsZeros =
29282 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
29284 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
29285 // If we have a single input lane-crossing shuffle then lower to VPERMV.
29286 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29287 ((Subtarget.hasAVX2() &&
29288 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29289 (Subtarget.hasAVX512() &&
29290 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29291 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29292 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29293 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29294 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29295 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29296 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29297 DCI.AddToWorklist(VPermMask.getNode());
29298 Res = DAG.getBitcast(MaskVT, V1);
29299 DCI.AddToWorklist(Res.getNode());
29300 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
29301 DCI.AddToWorklist(Res.getNode());
29302 return DAG.getBitcast(RootVT, Res);
29305 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
29306 // vector as the second source.
29307 if (UnaryShuffle && AllowVariableMask &&
29308 ((Subtarget.hasAVX512() &&
29309 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29310 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29311 (Subtarget.hasVLX() &&
29312 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29313 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29314 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29315 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29316 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29317 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29318 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
29319 for (unsigned i = 0; i != NumMaskElts; ++i)
29320 if (Mask[i] == SM_SentinelZero)
29321 Mask[i] = NumMaskElts + i;
29323 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29324 DCI.AddToWorklist(VPermMask.getNode());
29325 Res = DAG.getBitcast(MaskVT, V1);
29326 DCI.AddToWorklist(Res.getNode());
29327 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
29328 DCI.AddToWorklist(Zero.getNode());
29329 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
29330 DCI.AddToWorklist(Res.getNode());
29331 return DAG.getBitcast(RootVT, Res);
29334 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
29335 if (AllowVariableMask && !MaskContainsZeros &&
29336 ((Subtarget.hasAVX512() &&
29337 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
29338 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
29339 (Subtarget.hasVLX() &&
29340 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
29341 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
29342 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
29343 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
29344 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
29345 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
29346 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
29347 DCI.AddToWorklist(VPermMask.getNode());
29348 V1 = DAG.getBitcast(MaskVT, V1);
29349 DCI.AddToWorklist(V1.getNode());
29350 V2 = DAG.getBitcast(MaskVT, V2);
29351 DCI.AddToWorklist(V2.getNode());
29352 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
29353 DCI.AddToWorklist(Res.getNode());
29354 return DAG.getBitcast(RootVT, Res);
29359 // See if we can combine a single input shuffle with zeros to a bit-mask,
29360 // which is much simpler than any shuffle.
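  // For example (illustrative): a v4i32 mask <0, SM_SentinelZero, 2,
  // SM_SentinelZero> becomes an AND with the constant <-1, 0, -1, 0>,
  // clearing lanes 1 and 3.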
29361 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
29362 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
29363 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
29364 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
29365 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
29366 APInt UndefElts(NumMaskElts, 0);
29367 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
29368 for (unsigned i = 0; i != NumMaskElts; ++i) {
29370 if (M == SM_SentinelUndef) {
29371 UndefElts.setBit(i);
29374 if (M == SM_SentinelZero)
29376 EltBits[i] = AllOnes;
29378 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
29379 DCI.AddToWorklist(BitMask.getNode());
29380 Res = DAG.getBitcast(MaskVT, V1);
29381 DCI.AddToWorklist(Res.getNode());
29382 unsigned AndOpcode =
29383 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
29384 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
29385 DCI.AddToWorklist(Res.getNode());
29386 return DAG.getBitcast(RootVT, Res);
29389   // If we have a single input shuffle with different shuffle patterns in
29390   // the 128-bit lanes, use a variable VPERMILPS mask.
29391   // TODO: Combine other mask types at higher depths.
29392 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
29393 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
29394 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
29395 SmallVector<SDValue, 16> VPermIdx;
29396 for (int M : Mask) {
29398 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
29399 VPermIdx.push_back(Idx);
29401 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
29402 DCI.AddToWorklist(VPermMask.getNode());
29403 Res = DAG.getBitcast(MaskVT, V1);
29404 DCI.AddToWorklist(Res.getNode());
29405 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
29406 DCI.AddToWorklist(Res.getNode());
29407 return DAG.getBitcast(RootVT, Res);
29410 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
29411 // to VPERMIL2PD/VPERMIL2PS.
29412 if (AllowVariableMask && Subtarget.hasXOP() &&
29413 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
29414 MaskVT == MVT::v8f32)) {
29415 // VPERMIL2 Operation.
29416 // Bits[3] - Match Bit.
29417 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
29418 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
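    // For example (illustrative): with v8f32, mask element 9 (element 1 of the
    // second source) produces the selector (9 % 4) + (9 / 8) * 4 = 5, while a
    // zeroable element pushes the ZERO selector 8.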
29419 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
29420 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
29421 SmallVector<int, 8> VPerm2Idx;
29422 unsigned M2ZImm = 0;
29423 for (int M : Mask) {
29424 if (M == SM_SentinelUndef) {
29425 VPerm2Idx.push_back(-1);
29428 if (M == SM_SentinelZero) {
29430 VPerm2Idx.push_back(8);
29433 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
29434 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
29435 VPerm2Idx.push_back(Index);
29437 V1 = DAG.getBitcast(MaskVT, V1);
29438 DCI.AddToWorklist(V1.getNode());
29439 V2 = DAG.getBitcast(MaskVT, V2);
29440 DCI.AddToWorklist(V2.getNode());
29441 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
29442 DCI.AddToWorklist(VPerm2MaskOp.getNode());
29443 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
29444 DAG.getConstant(M2ZImm, DL, MVT::i8));
29445 DCI.AddToWorklist(Res.getNode());
29446 return DAG.getBitcast(RootVT, Res);
29449 // If we have 3 or more shuffle instructions or a chain involving a variable
29450 // mask, we can replace them with a single PSHUFB instruction profitably.
29451   // Intel's manuals suggest only using PSHUFB if doing so replaces 5
29452 // instructions, but in practice PSHUFB tends to be *very* fast so we're
29453 // more aggressive.
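  // For example (illustrative): a v4i32 mask <1, 0, SM_SentinelUndef,
  // SM_SentinelZero> expands with Ratio = 4 to the byte mask
  // <4,5,6,7, 0,1,2,3, u,u,u,u, 255,255,255,255>, where the set high bit
  // in 255 makes PSHUFB write a zero byte.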
29454 if (UnaryShuffle && AllowVariableMask &&
29455 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
29456 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
29457 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
29458 SmallVector<SDValue, 16> PSHUFBMask;
29459 int NumBytes = RootVT.getSizeInBits() / 8;
29460 int Ratio = NumBytes / NumMaskElts;
29461 for (int i = 0; i < NumBytes; ++i) {
29462 int M = Mask[i / Ratio];
29463 if (M == SM_SentinelUndef) {
29464 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
29467 if (M == SM_SentinelZero) {
29468 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
29471 M = Ratio * M + i % Ratio;
29472 assert((M / 16) == (i / 16) && "Lane crossing detected");
29473 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29475 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
29476 Res = DAG.getBitcast(ByteVT, V1);
29477 DCI.AddToWorklist(Res.getNode());
29478 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
29479 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
29480 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
29481 DCI.AddToWorklist(Res.getNode());
29482 return DAG.getBitcast(RootVT, Res);
29485 // With XOP, if we have a 128-bit binary input shuffle we can always combine
29486 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
29487 // slower than PSHUFB on targets that support both.
29488 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
29489 // VPPERM Mask Operation
29490 // Bits[4:0] - Byte Index (0 - 31)
29491 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
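    // For example (illustrative): selector 18 reads byte 2 of the second
    // source, while 128 (0x80) sets the ZERO operation used below for
    // SM_SentinelZero elements.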
29492 SmallVector<SDValue, 16> VPPERMMask;
29494 int Ratio = NumBytes / NumMaskElts;
29495 for (int i = 0; i < NumBytes; ++i) {
29496 int M = Mask[i / Ratio];
29497 if (M == SM_SentinelUndef) {
29498 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
29501 if (M == SM_SentinelZero) {
29502 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
29505 M = Ratio * M + i % Ratio;
29506 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
29508 MVT ByteVT = MVT::v16i8;
29509 V1 = DAG.getBitcast(ByteVT, V1);
29510 DCI.AddToWorklist(V1.getNode());
29511 V2 = DAG.getBitcast(ByteVT, V2);
29512 DCI.AddToWorklist(V2.getNode());
29513 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
29514 DCI.AddToWorklist(VPPERMMaskOp.getNode());
29515 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
29516 DCI.AddToWorklist(Res.getNode());
29517 return DAG.getBitcast(RootVT, Res);
29520 // Failed to find any combines.
29524 // Attempt to constant fold all of the constant source ops.
29525 // Returns true if the entire shuffle is folded to a constant.
29526 // TODO: Extend this to merge multiple constant Ops and update the mask.
29527 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
29528 ArrayRef<int> Mask, SDValue Root,
29529 bool HasVariableMask,
29531 TargetLowering::DAGCombinerInfo &DCI,
29532 const X86Subtarget &Subtarget) {
29533 MVT VT = Root.getSimpleValueType();
29535 unsigned SizeInBits = VT.getSizeInBits();
29536 unsigned NumMaskElts = Mask.size();
29537 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
29538 unsigned NumOps = Ops.size();
29540 // Extract constant bits from each source op.
29541 bool OneUseConstantOp = false;
29542 SmallVector<APInt, 16> UndefEltsOps(NumOps);
29543 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
29544 for (unsigned i = 0; i != NumOps; ++i) {
29545 SDValue SrcOp = Ops[i];
29546 OneUseConstantOp |= SrcOp.hasOneUse();
29547 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
29552 // Only fold if at least one of the constants is only used once or
29553   // the combined shuffle has included a variable mask shuffle; this
29554   // avoids constant pool bloat.
29555 if (!OneUseConstantOp && !HasVariableMask)
29558 // Shuffle the constant bits according to the mask.
29559 APInt UndefElts(NumMaskElts, 0);
29560 APInt ZeroElts(NumMaskElts, 0);
29561 APInt ConstantElts(NumMaskElts, 0);
29562 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
29563 APInt::getNullValue(MaskSizeInBits));
29564 for (unsigned i = 0; i != NumMaskElts; ++i) {
29566 if (M == SM_SentinelUndef) {
29567 UndefElts.setBit(i);
29569 } else if (M == SM_SentinelZero) {
29570 ZeroElts.setBit(i);
29573 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
29575 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
29576 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
29578 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
29579 if (SrcUndefElts[SrcMaskIdx]) {
29580 UndefElts.setBit(i);
29584 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
29585 APInt &Bits = SrcEltBits[SrcMaskIdx];
29587 ZeroElts.setBit(i);
29591 ConstantElts.setBit(i);
29592 ConstantBitData[i] = Bits;
29594 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
29596 // Create the constant data.
29598 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
29599 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
29601 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
29603 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
29606 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
29607 DCI.AddToWorklist(CstOp.getNode());
29608 return DAG.getBitcast(VT, CstOp);
29611 /// \brief Fully generic combining of x86 shuffle instructions.
29613 /// This should be the last combine run over the x86 shuffle instructions. Once
29614 /// they have been fully optimized, this will recursively consider all chains
29615 /// of single-use shuffle instructions, build a generic model of the cumulative
29616 /// shuffle operation, and check for simpler instructions which implement this
29617 /// operation. We use this primarily for two purposes:
29619 /// 1) Collapse generic shuffles to specialized single instructions when
29620 /// equivalent. In most cases, this is just an encoding size win, but
29621 /// sometimes we will collapse multiple generic shuffles into a single
29622 /// special-purpose shuffle.
29623 /// 2) Look for sequences of shuffle instructions with 3 or more total
29624 /// instructions, and replace them with the slightly more expensive SSSE3
29625 /// PSHUFB instruction if available. We do this as the last combining step
29626 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
29627 /// a suitable short sequence of other instructions. The PSHUFB will either
29628 /// use a register or have to read from memory and so is slightly (but only
29629 /// slightly) more expensive than the other shuffle instructions.
29631 /// Because this is inherently a quadratic operation (for each shuffle in
29632 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
29633 /// This should never be an issue in practice as the shuffle lowering doesn't
29634 /// produce sequences of more than 8 instructions.
29636 /// FIXME: We will currently miss some cases where the redundant shuffling
29637 /// would simplify under the threshold for PSHUFB formation because of
29638 /// combine-ordering. To fix this, we should do the redundant instruction
29639 /// combining in this recursive walk.
29640 static SDValue combineX86ShufflesRecursively(
29641 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
29642 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
29643 bool HasVariableMask, SelectionDAG &DAG,
29644 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
29645 // Bound the depth of our recursive combine because this is ultimately
29646 // quadratic in nature.
29647 const unsigned MaxRecursionDepth = 8;
29648 if (Depth > MaxRecursionDepth)
29651 // Directly rip through bitcasts to find the underlying operand.
29652 SDValue Op = SrcOps[SrcOpIndex];
29653 Op = peekThroughOneUseBitcasts(Op);
29655 MVT VT = Op.getSimpleValueType();
29656 if (!VT.isVector())
29657 return SDValue(); // Bail if we hit a non-vector.
29659 assert(Root.getSimpleValueType().isVector() &&
29660 "Shuffles operate on vector types!");
29661 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
29662 "Can only combine shuffles of the same vector register size.");
29664 // Extract target shuffle mask and resolve sentinels and inputs.
29665 SmallVector<int, 64> OpMask;
29666 SmallVector<SDValue, 2> OpInputs;
29667 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
29670 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
29671 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
29672 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
29674 // Add the inputs to the Ops list, avoiding duplicates.
29675 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
29677 int InputIdx0 = -1, InputIdx1 = -1;
29678 for (int i = 0, e = Ops.size(); i < e; ++i) {
29679 SDValue BC = peekThroughBitcasts(Ops[i]);
29680 if (Input0 && BC == peekThroughBitcasts(Input0))
29682 if (Input1 && BC == peekThroughBitcasts(Input1))
29686 if (Input0 && InputIdx0 < 0) {
29687 InputIdx0 = SrcOpIndex;
29688 Ops[SrcOpIndex] = Input0;
29690 if (Input1 && InputIdx1 < 0) {
29691 InputIdx1 = Ops.size();
29692 Ops.push_back(Input1);
29695 assert(((RootMask.size() > OpMask.size() &&
29696 RootMask.size() % OpMask.size() == 0) ||
29697 (OpMask.size() > RootMask.size() &&
29698 OpMask.size() % RootMask.size() == 0) ||
29699 OpMask.size() == RootMask.size()) &&
29700 "The smaller number of elements must divide the larger.");
29702 // This function can be performance-critical, so we rely on the power-of-2
29703 // knowledge that we have about the mask sizes to replace div/rem ops with
29704 // bit-masks and shifts.
29705 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
29706 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
29707 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
29708 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
29710 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
29711 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
29712 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
29713 assert((RootRatio == 1 || OpRatio == 1) &&
29714 "Must not have a ratio for both incoming and op masks!");
29716 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
29717 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
29718 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
29719 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
29720 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
29722 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
29724 // Merge this shuffle operation's mask into our accumulated mask. Note that
29725 // this shuffle's mask will be the first applied to the input, followed by the
29726 // root mask to get us all the way to the root value arrangement. The reason
29727 // for this order is that we are recursing up the operation chain.
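  // For example (illustrative): with a 4-element RootMask and an 8-element
  // OpMask, MaskWidth = 8, RootRatio = 2 and OpRatio = 1, so each root mask
  // value R is widened to the index pair 2*R and 2*R+1 before being mapped
  // through OpMask.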
29728 for (unsigned i = 0; i < MaskWidth; ++i) {
29729 unsigned RootIdx = i >> RootRatioLog2;
29730 if (RootMask[RootIdx] < 0) {
29731 // This is a zero or undef lane, we're done.
29732 Mask[i] = RootMask[RootIdx];
29736 unsigned RootMaskedIdx =
29738 ? RootMask[RootIdx]
29739 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
29741 // Just insert the scaled root mask value if it references an input other
29742 // than the SrcOp we're currently inserting.
29743 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
29744 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
29745 Mask[i] = RootMaskedIdx;
29749 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
29750 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
29751 if (OpMask[OpIdx] < 0) {
29752 // The incoming lanes are zero or undef, it doesn't matter which ones we
29754 Mask[i] = OpMask[OpIdx];
29758 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
29759 unsigned OpMaskedIdx =
29762 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
29764 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
29765 if (OpMask[OpIdx] < (int)OpMask.size()) {
29766 assert(0 <= InputIdx0 && "Unknown target shuffle input");
29767 OpMaskedIdx += InputIdx0 * MaskWidth;
29769 assert(0 <= InputIdx1 && "Unknown target shuffle input");
29770 OpMaskedIdx += InputIdx1 * MaskWidth;
29773 Mask[i] = OpMaskedIdx;
29776 // Handle the all undef/zero cases early.
29777 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
29778 return DAG.getUNDEF(Root.getValueType());
29780 // TODO - should we handle the mixed zero/undef case as well? Just returning
29781   // a zero mask will lose information on undef elements, possibly reducing
29782 // future combine possibilities.
29783 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
29784 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
29787 // Remove unused shuffle source ops.
29788 resolveTargetShuffleInputsAndMask(Ops, Mask);
29789 assert(!Ops.empty() && "Shuffle with no inputs detected");
29791 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
29793 // Update the list of shuffle nodes that have been combined so far.
29794 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
29796 CombinedNodes.push_back(Op.getNode());
29798 // See if we can recurse into each shuffle source op (if it's a target
29799 // shuffle). The source op should only be combined if it either has a
29800 // single use (i.e. current Op) or all its users have already been combined.
29801 // Don't recurse if we already have more source ops than we can combine in
29802 // the remaining recursion depth.
29803 if (Ops.size() < (MaxRecursionDepth - Depth)) {
29804 for (int i = 0, e = Ops.size(); i < e; ++i)
29805 if (Ops[i].getNode()->hasOneUse() ||
29806 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29807 if (SDValue Res = combineX86ShufflesRecursively(
29808 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29809 DAG, DCI, Subtarget))
29813 // Attempt to constant fold all of the constant source ops.
29814 if (SDValue Cst = combineX86ShufflesConstants(
29815 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29818 // We can only combine unary and binary shuffle mask cases.
29819 if (Ops.size() > 2)
29822 // Minor canonicalization of the accumulated shuffle mask to make it easier
29823 // to match below. All this does is detect masks with sequential pairs of
29824 // elements, and shrink them to the half-width mask. It does this in a loop
29825 // so it will reduce the size of the mask to the minimal width mask which
29826 // performs an equivalent shuffle.
29827 SmallVector<int, 64> WidenedMask;
29828 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29829 Mask = std::move(WidenedMask);
29832 // Canonicalization of binary shuffle masks to improve pattern matching by
29833 // commuting the inputs.
29834 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29835 ShuffleVectorSDNode::commuteMask(Mask);
29836 std::swap(Ops[0], Ops[1]);
29839 // Finally, try to combine into a single shuffle instruction.
29840 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29844 /// \brief Get the PSHUF-style mask from PSHUF node.
29846 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
29847 /// PSHUF-style masks that can be reused with such instructions.
29848 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29849 MVT VT = N.getSimpleValueType();
29850 SmallVector<int, 4> Mask;
29851 SmallVector<SDValue, 2> Ops;
29854 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29858   // If we have more than 128 bits, only the low 128 bits of the shuffle mask
29859 // matter. Check that the upper masks are repeats and remove them.
29860 if (VT.getSizeInBits() > 128) {
29861 int LaneElts = 128 / VT.getScalarSizeInBits();
29863 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29864 for (int j = 0; j < LaneElts; ++j)
29865 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29866 "Mask doesn't repeat in high 128-bit lanes!");
29868 Mask.resize(LaneElts);
29871 switch (N.getOpcode()) {
29872 case X86ISD::PSHUFD:
29874 case X86ISD::PSHUFLW:
29877 case X86ISD::PSHUFHW:
29878 Mask.erase(Mask.begin(), Mask.begin() + 4);
29879 for (int &M : Mask)
29883 llvm_unreachable("No valid shuffle instruction found!");
29887 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
29889 /// We walk up the chain and look for a combinable shuffle, skipping over
29890 /// shuffles that we could hoist this shuffle's transformation past without
29891 /// altering anything.
29893 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29894 SelectionDAG &DAG) {
29895 assert(N.getOpcode() == X86ISD::PSHUFD &&
29896 "Called with something other than an x86 128-bit half shuffle!");
29899 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29900 // of the shuffles in the chain so that we can form a fresh chain to replace
29902 SmallVector<SDValue, 8> Chain;
29903 SDValue V = N.getOperand(0);
29904 for (; V.hasOneUse(); V = V.getOperand(0)) {
29905 switch (V.getOpcode()) {
29907 return SDValue(); // Nothing combined!
29910 // Skip bitcasts as we always know the type for the target specific
29914 case X86ISD::PSHUFD:
29915 // Found another dword shuffle.
29918 case X86ISD::PSHUFLW:
29919 // Check that the low words (being shuffled) are the identity in the
29920 // dword shuffle, and the high words are self-contained.
29921 if (Mask[0] != 0 || Mask[1] != 1 ||
29922 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29925 Chain.push_back(V);
29928 case X86ISD::PSHUFHW:
29929 // Check that the high words (being shuffled) are the identity in the
29930 // dword shuffle, and the low words are self-contained.
29931 if (Mask[2] != 2 || Mask[3] != 3 ||
29932 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29935 Chain.push_back(V);
29938 case X86ISD::UNPCKL:
29939 case X86ISD::UNPCKH:
29940 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29941 // shuffle into a preceding word shuffle.
29942 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29943 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29946 // Search for a half-shuffle which we can combine with.
29947 unsigned CombineOp =
29948 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
29949 if (V.getOperand(0) != V.getOperand(1) ||
29950 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29952 Chain.push_back(V);
29953 V = V.getOperand(0);
29955 switch (V.getOpcode()) {
29957 return SDValue(); // Nothing to combine.
29959 case X86ISD::PSHUFLW:
29960 case X86ISD::PSHUFHW:
29961 if (V.getOpcode() == CombineOp)
29964 Chain.push_back(V);
29968 V = V.getOperand(0);
29972 } while (V.hasOneUse());
29975 // Break out of the loop if we break out of the switch.
29979 if (!V.hasOneUse())
29980 // We fell out of the loop without finding a viable combining instruction.
29983 // Merge this node's mask and our incoming mask.
29984 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29985 for (int &M : Mask)
29987 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29988 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29990 // Rebuild the chain around this new shuffle.
29991 while (!Chain.empty()) {
29992 SDValue W = Chain.pop_back_val();
29994 if (V.getValueType() != W.getOperand(0).getValueType())
29995 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29997 switch (W.getOpcode()) {
29999 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
30001 case X86ISD::UNPCKL:
30002 case X86ISD::UNPCKH:
30003 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
30006 case X86ISD::PSHUFD:
30007 case X86ISD::PSHUFLW:
30008 case X86ISD::PSHUFHW:
30009 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
30013 if (V.getValueType() != N.getValueType())
30014 V = DAG.getBitcast(N.getValueType(), V);
30016 // Return the new chain to replace N.
30020 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
30021 /// pshufhw.
30023 /// We walk up the chain, skipping shuffles of the other half and looking
30024 /// through shuffles which switch halves, trying to find a shuffle of the same
30025 /// pair of dwords.
30026 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
30028 TargetLowering::DAGCombinerInfo &DCI) {
30030 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
30031 "Called with something other than an x86 128-bit half shuffle!");
30033 unsigned CombineOpcode = N.getOpcode();
30035 // Walk up a single-use chain looking for a combinable shuffle.
30036 SDValue V = N.getOperand(0);
30037 for (; V.hasOneUse(); V = V.getOperand(0)) {
30038 switch (V.getOpcode()) {
30040 return false; // Nothing combined!
30043 // Skip bitcasts as we always know the type for the target specific
30047 case X86ISD::PSHUFLW:
30048 case X86ISD::PSHUFHW:
30049 if (V.getOpcode() == CombineOpcode)
30052 // Other-half shuffles are no-ops.
30055 // Break out of the loop if we break out of the switch.
30059 if (!V.hasOneUse())
30060 // We fell out of the loop without finding a viable combining instruction.
30063 // Combine away the bottom node as its shuffle will be accumulated into
30064 // a preceding shuffle.
30065 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30067 // Record the old value.
30070 // Merge this node's mask and our incoming mask (adjusted to account for all
30071 // the pshufd instructions encountered).
30072 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30073 for (int &M : Mask)
30075 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
30076 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
30078 // Check that the shuffles didn't cancel each other out. If not, we need to
30079 // combine to the new one.
30081 // Replace the combinable shuffle with the combined one, updating all users
30082 // so that we re-evaluate the chain here.
30083 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
30088 /// \brief Try to combine x86 target specific shuffles.
30089 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
30090 TargetLowering::DAGCombinerInfo &DCI,
30091 const X86Subtarget &Subtarget) {
30093 MVT VT = N.getSimpleValueType();
30094 SmallVector<int, 4> Mask;
30095 unsigned Opcode = N.getOpcode();
30097 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
30098 // single instruction.
30099 if (VT.getScalarSizeInBits() == 64 &&
30100 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
30101 Opcode == X86ISD::UNPCKL)) {
30102 auto BC0 = peekThroughBitcasts(N.getOperand(0));
30103 auto BC1 = peekThroughBitcasts(N.getOperand(1));
30104 EVT VT0 = BC0.getValueType();
30105 EVT VT1 = BC1.getValueType();
30106 unsigned Opcode0 = BC0.getOpcode();
30107 unsigned Opcode1 = BC1.getOpcode();
30108 if (Opcode0 == Opcode1 && VT0 == VT1 &&
30109 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
30110 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
30111 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
30113 if (Opcode == X86ISD::MOVSD) {
30114 Lo = BC1.getOperand(0);
30115 Hi = BC0.getOperand(1);
30117 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30118 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
30120 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
30121 DCI.AddToWorklist(Horiz.getNode());
30122 return DAG.getBitcast(VT, Horiz);
30127 case X86ISD::VBROADCAST: {
30128 // If broadcasting from another shuffle, attempt to simplify it.
30129 // TODO - we really need a general SimplifyDemandedVectorElts mechanism.
30130 SDValue Src = N.getOperand(0);
30131 SDValue BC = peekThroughBitcasts(Src);
30132 EVT SrcVT = Src.getValueType();
30133 EVT BCVT = BC.getValueType();
30134 if (isTargetShuffle(BC.getOpcode()) &&
30135 VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) {
30136 unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits();
30137 SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(),
30139 for (unsigned i = 0; i != Scale; ++i)
30140 DemandedMask[i] = i;
30141 if (SDValue Res = combineX86ShufflesRecursively(
30142 {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 1,
30143 /*HasVarMask*/ false, DAG, DCI, Subtarget))
30144 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
30145 DAG.getBitcast(SrcVT, Res));
30149 case X86ISD::PSHUFD:
30150 case X86ISD::PSHUFLW:
30151 case X86ISD::PSHUFHW:
30152 Mask = getPSHUFShuffleMask(N);
30153 assert(Mask.size() == 4);
30155 case X86ISD::UNPCKL: {
30156 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
30157     // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
30158 // moves upper half elements into the lower half part. For example:
30160 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
30162 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
30164 // will be combined to:
30166 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
30168 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
30169 // happen due to advanced instructions.
30170 if (!VT.is128BitVector())
30173 auto Op0 = N.getOperand(0);
30174 auto Op1 = N.getOperand(1);
30175 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
30176 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
30178 unsigned NumElts = VT.getVectorNumElements();
30179 SmallVector<int, 8> ExpectedMask(NumElts, -1);
30180 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
30183 auto ShufOp = Op1.getOperand(0);
30184 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
30185 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
30189 case X86ISD::BLENDI: {
30190 SDValue V0 = N->getOperand(0);
30191 SDValue V1 = N->getOperand(1);
30192 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
30193 "Unexpected input vector types");
30195 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
30196 // operands and changing the mask to 1. This saves us a bunch of
30197 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
30198 // x86InstrInfo knows how to commute this back after instruction selection
30199 // if it would help register allocation.
30201 // TODO: If optimizing for size or a processor that doesn't suffer from
30202 // partial register update stalls, this should be transformed into a MOVSD
30203 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
30205 if (VT == MVT::v2f64)
30206 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
30207 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
30208 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
30209 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
30214 case X86ISD::MOVSD:
30215 case X86ISD::MOVSS: {
30216 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
30217 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
30218 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
30219 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
30220 if (isZero0 && isZero1)
30223 // We often lower to MOVSD/MOVSS from integer as well as native float
30224 // types; remove unnecessary domain-crossing bitcasts if we can to make it
30225 // easier to combine shuffles later on. We've already accounted for the
30226 // domain switching cost when we decided to lower with it.
30227 bool isFloat = VT.isFloatingPoint();
30228 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
30229 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
30230 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
30231 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
30232 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
30233 V0 = DAG.getBitcast(NewVT, V0);
30234 V1 = DAG.getBitcast(NewVT, V1);
30235 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
30240 case X86ISD::INSERTPS: {
30241 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
30242 SDValue Op0 = N.getOperand(0);
30243 SDValue Op1 = N.getOperand(1);
30244 SDValue Op2 = N.getOperand(2);
30245 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
30246 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
30247 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
30248 unsigned ZeroMask = InsertPSMask & 0xF;
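    // The INSERTPS immediate is laid out as [7:6] = source element,
    // [5:4] = destination element, [3:0] = zero mask; for example
    // (illustrative) 0x1D inserts element 0 of Op1 into element 1 of Op0 and
    // zeroes elements 0, 2 and 3.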
30250 // If we zero out all elements from Op0 then we don't need to reference it.
30251 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
30252 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
30253 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30255 // If we zero out the element from Op1 then we don't need to reference it.
30256 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
30257 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30258 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30260 // Attempt to merge insertps Op1 with an inner target shuffle node.
30261 SmallVector<int, 8> TargetMask1;
30262 SmallVector<SDValue, 2> Ops1;
30263 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
30264 int M = TargetMask1[SrcIdx];
30265 if (isUndefOrZero(M)) {
30266 // Zero/UNDEF insertion - zero out element and remove dependency.
30267 InsertPSMask |= (1u << DstIdx);
30268 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
30269 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30271 // Update insertps mask srcidx and reference the source input directly.
30272 assert(0 <= M && M < 8 && "Shuffle index out of range");
30273 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
30274 Op1 = Ops1[M < 4 ? 0 : 1];
30275 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30276 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30279 // Attempt to merge insertps Op0 with an inner target shuffle node.
30280 SmallVector<int, 8> TargetMask0;
30281 SmallVector<SDValue, 2> Ops0;
30282 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
30285 bool Updated = false;
30286 bool UseInput00 = false;
30287 bool UseInput01 = false;
30288 for (int i = 0; i != 4; ++i) {
30289 int M = TargetMask0[i];
30290 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
30291 // No change if element is already zero or the inserted element.
30293 } else if (isUndefOrZero(M)) {
30294 // If the target mask is undef/zero then we must zero the element.
30295 InsertPSMask |= (1u << i);
30300 // The input vector element must be inline.
30301 if (M != i && M != (i + 4))
30304 // Determine which inputs of the target shuffle we're using.
30305 UseInput00 |= (0 <= M && M < 4);
30306 UseInput01 |= (4 <= M);
30309 // If we're not using both inputs of the target shuffle then use the
30310 // referenced input directly.
30311 if (UseInput00 && !UseInput01) {
30314 } else if (!UseInput00 && UseInput01) {
30320 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
30321 DAG.getConstant(InsertPSMask, DL, MVT::i8));
30329 // Nuke no-op shuffles that show up after combining.
30330 if (isNoopShuffleMask(Mask))
30331 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
30333 // Look for simplifications involving one or two shuffle instructions.
30334 SDValue V = N.getOperand(0);
30335 switch (N.getOpcode()) {
30338 case X86ISD::PSHUFLW:
30339 case X86ISD::PSHUFHW:
30340 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
30342 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
30343 return SDValue(); // We combined away this shuffle, so we're done.
30345 // See if this reduces to a PSHUFD which is no more expensive and can
30346 // combine with more operations. Note that it has to at least flip the
30347 // dwords as otherwise it would have been removed as a no-op.
30348 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
30349 int DMask[] = {0, 1, 2, 3};
30350 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
30351 DMask[DOffset + 0] = DOffset + 1;
30352 DMask[DOffset + 1] = DOffset + 0;
30353 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30354 V = DAG.getBitcast(DVT, V);
30355 DCI.AddToWorklist(V.getNode());
30356 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
30357 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
30358 DCI.AddToWorklist(V.getNode());
30359 return DAG.getBitcast(VT, V);
30362 // Look for shuffle patterns which can be implemented as a single unpack.
30363 // FIXME: This doesn't handle the location of the PSHUFD generically, and
30364 // only works when we have a PSHUFD followed by two half-shuffles.
30365 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
30366 (V.getOpcode() == X86ISD::PSHUFLW ||
30367 V.getOpcode() == X86ISD::PSHUFHW) &&
30368 V.getOpcode() != N.getOpcode() &&
30370 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
30371 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
30372 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
30373 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
30374 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30375 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
30377 for (int i = 0; i < 4; ++i) {
30378 WordMask[i + NOffset] = Mask[i] + NOffset;
30379 WordMask[i + VOffset] = VMask[i] + VOffset;
30381 // Map the word mask through the DWord mask.
30383 for (int i = 0; i < 8; ++i)
30384 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
30385 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
30386 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
30387 // We can replace all three shuffles with an unpack.
30388 V = DAG.getBitcast(VT, D.getOperand(0));
30389 DCI.AddToWorklist(V.getNode());
30390 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
30399 case X86ISD::PSHUFD:
30400 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
30409 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD)
30410 /// operation. If true is returned, the operands of the ADDSUB(SUBADD) operation
30411 /// are written to the parameters \p Opnd0 and \p Opnd1.
30413 /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle
30414 /// nodes so that they are easier to match generically. We also insert dummy
30415 /// vector shuffle nodes for the operands which explicitly discard the lanes
30416 /// which are unused by this operation, to try to flow the fact that they're
30417 /// unused through the rest of the combiner.
30418 static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget,
30419 SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1,
30422 EVT VT = N->getValueType(0);
30423 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30424 if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) ||
30425 !VT.getSimpleVT().isFloatingPoint())
30428 // We only handle target-independent shuffles.
30429 // FIXME: It would be easy and harmless to use the target shuffle mask
30430 // extraction tool to support more.
30431 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
30434 SDValue V1 = N->getOperand(0);
30435 SDValue V2 = N->getOperand(1);
30437 // Make sure we have an FADD and an FSUB.
30438 if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) ||
30439 (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) ||
30440 V1.getOpcode() == V2.getOpcode())
30443 // If there are other uses of these operations we can't fold them.
30444 if (!V1->hasOneUse() || !V2->hasOneUse())
30447 // Ensure that both operations have the same operands. Note that we can
30448 // commute the FADD operands.
30450 if (V1.getOpcode() == ISD::FSUB) {
30451 LHS = V1->getOperand(0); RHS = V1->getOperand(1);
30452 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
30453 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
30456 assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode");
30457 LHS = V2->getOperand(0); RHS = V2->getOperand(1);
30458 if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) &&
30459 (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS))
30463 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30465 int ParitySrc[2] = {-1, -1};
30466 unsigned Size = Mask.size();
30467 for (unsigned i = 0; i != Size; ++i) {
30472 // Make sure we are using the matching element from the input.
30473 if ((M % Size) != i)
30476 // Make sure we use the same input for all elements of the same parity.
30477 int Src = M / Size;
30478 if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src)
30480 ParitySrc[i % 2] = Src;
30483 // Make sure each input is used.
30484 if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1])
30487 // It's a subadd if the vector in the even parity is an FADD.
30488 IsSubAdd = ParitySrc[0] == 0 ? V1->getOpcode() == ISD::FADD
30489 : V2->getOpcode() == ISD::FADD;
30496 /// \brief Try to combine a shuffle into a target-specific add-sub or
30497 /// mul-add-sub node.
30498 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
30499 const X86Subtarget &Subtarget,
30500 SelectionDAG &DAG) {
30501 SDValue Opnd0, Opnd1;
30503 if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd))
30506 MVT VT = N->getSimpleValueType(0);
30509 // Try to generate X86ISD::FMADDSUB node here.
30511 if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) {
30512 unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
30513 return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
30519 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
30520 // the ADDSUB idiom has been successfully recognized. There are no known
30521 // X86 targets with 512-bit ADDSUB instructions!
30522 if (VT.is512BitVector())
30525 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
30528 // We are looking for a shuffle where both sources are concatenated with undef
30529 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
30530 // if we can express this as a single-source shuffle, that's preferable.
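// Illustrative example (v8i32), not from the original source: with
// N0 = concat(t1, undef) and N1 = concat(t2, undef), a mask element of 8
// (lane 0 of N1, i.e. t2[0]) becomes 4 in the new mask, since t2 now occupies
// lanes 4-7 of the single concat(t1, t2) source.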
30531 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
30532 const X86Subtarget &Subtarget) {
30533 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
30536 EVT VT = N->getValueType(0);
30538 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
30539 if (!VT.is128BitVector() && !VT.is256BitVector())
30542 if (VT.getVectorElementType() != MVT::i32 &&
30543 VT.getVectorElementType() != MVT::i64 &&
30544 VT.getVectorElementType() != MVT::f32 &&
30545 VT.getVectorElementType() != MVT::f64)
30548 SDValue N0 = N->getOperand(0);
30549 SDValue N1 = N->getOperand(1);
30551 // Check that both sources are concats with undef.
30552 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
30553 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
30554 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
30555 !N1.getOperand(1).isUndef())
30558 // Construct the new shuffle mask. Elements from the first source retain their
30559 // index, but elements from the second source no longer need to skip an undef.
30560 SmallVector<int, 8> Mask;
30561 int NumElts = VT.getVectorNumElements();
30563 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30564 for (int Elt : SVOp->getMask())
30565 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
30568 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
30570 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
30573 /// Eliminate a redundant shuffle of a horizontal math op.
30574 static SDValue foldShuffleOfHorizOp(SDNode *N) {
30575 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
30578 SDValue HOp = N->getOperand(0);
30579 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
30580 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
30583 // 128-bit horizontal math instructions are defined to operate on adjacent
30584 // lanes of each operand as:
30585 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
30586 // ...similarly for v2f64 and v8i16.
30587 // TODO: 256-bit is not the same because...x86.
30588 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
30591 // When the operands of a horizontal math op are identical, the low half of
30592 // the result is the same as the high half. If the shuffle is also replicating
30593 // low and high halves, we don't need the shuffle.
30594 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
30595 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
30596 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
30597 // but this should be tied to whatever horizontal op matching and shuffle
30598 // canonicalization are producing.
30599 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
30600 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
30601 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
30607 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
30608 TargetLowering::DAGCombinerInfo &DCI,
30609 const X86Subtarget &Subtarget) {
30611 EVT VT = N->getValueType(0);
30612 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30613 // If we have legalized the vector types, look for blends of FADD and FSUB
30614 // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node.
30615 if (TLI.isTypeLegal(VT)) {
30616 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
30619 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
30623 // During Type Legalization, when promoting illegal vector types,
30624 // the backend might introduce new shuffle dag nodes and bitcasts.
30626 // This code performs the following transformation:
30627 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
30628 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
30630 // We do this only if both the bitcast and the BINOP dag nodes have
30631 // one use. Also, perform this transformation only if the new binary
30632 // operation is legal. This is to avoid introducing dag nodes that
30633 // potentially need to be further expanded (or custom lowered) into a
30634 // less optimal sequence of dag nodes.
30635 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
30636 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
30637 N->getOperand(0).getOpcode() == ISD::BITCAST &&
30638 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
30639 SDValue N0 = N->getOperand(0);
30640 SDValue N1 = N->getOperand(1);
30642 SDValue BC0 = N0.getOperand(0);
30643 EVT SVT = BC0.getValueType();
30644 unsigned Opcode = BC0.getOpcode();
30645 unsigned NumElts = VT.getVectorNumElements();
30647 if (BC0.hasOneUse() && SVT.isVector() &&
30648 SVT.getVectorNumElements() * 2 == NumElts &&
30649 TLI.isOperationLegal(Opcode, VT)) {
30650 bool CanFold = false;
30656 // isOperationLegal lies for integer ops on floating point types.
30657 CanFold = VT.isInteger();
30662 // isOperationLegal lies for floating point ops on integer types.
30663 CanFold = VT.isFloatingPoint();
30667 unsigned SVTNumElts = SVT.getVectorNumElements();
30668 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
30669 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
30670 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
30671 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
30672 CanFold = SVOp->getMaskElt(i) < 0;
30675 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
30676 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
30677 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
30678 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
30683 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
30684 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
30685 // consecutive, non-overlapping, and in the right order.
30686 SmallVector<SDValue, 16> Elts;
30687 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
30688 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
30689 Elts.push_back(Elt);
30696 if (Elts.size() == VT.getVectorNumElements())
30698 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
30701 // For AVX2, we sometimes want to combine
30702 // (vector_shuffle <mask> (concat_vectors t1, undef)
30703 // (concat_vectors t2, undef))
30705 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
30706 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
30707 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
30710 if (isTargetShuffle(N->getOpcode())) {
30712 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
30715 // Try recursively combining arbitrary sequences of x86 shuffle
30716 // instructions into higher-order shuffles. We do this after combining
30717 // specific PSHUF instruction sequences into their minimal form so that we
30718 // can evaluate how many specialized shuffle instructions are involved in
30719 // a particular chain.
30720 if (SDValue Res = combineX86ShufflesRecursively(
30721 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
30722 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
30723 DCI.CombineTo(N, Res);
30731 /// Check if a vector extract from a target-specific shuffle of a load can be
30732 /// folded into a single element load.
30733 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
30734 /// shuffles have been custom lowered so we need to handle those here.
30735 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
30736 TargetLowering::DAGCombinerInfo &DCI) {
30737 if (DCI.isBeforeLegalizeOps())
30740 SDValue InVec = N->getOperand(0);
30741 SDValue EltNo = N->getOperand(1);
30742 EVT EltVT = N->getValueType(0);
30744 if (!isa<ConstantSDNode>(EltNo))
30747 EVT OriginalVT = InVec.getValueType();
30749 // Peek through bitcasts, don't duplicate a load with other uses.
30750 InVec = peekThroughOneUseBitcasts(InVec);
30752 EVT CurrentVT = InVec.getValueType();
30753 if (!CurrentVT.isVector() ||
30754 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
30757 if (!isTargetShuffle(InVec.getOpcode()))
30760 // Don't duplicate a load with other uses.
30761 if (!InVec.hasOneUse())
30764 SmallVector<int, 16> ShuffleMask;
30765 SmallVector<SDValue, 2> ShuffleOps;
30767 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
30768 ShuffleOps, ShuffleMask, UnaryShuffle))
30771 // Select the input vector, guarding against out of range extract vector.
30772 unsigned NumElems = CurrentVT.getVectorNumElements();
30773 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
30774 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
30776 if (Idx == SM_SentinelZero)
30777 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
30778 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
30779 if (Idx == SM_SentinelUndef)
30780 return DAG.getUNDEF(EltVT);
30782 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
30783 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
30786 // If inputs to shuffle are the same for both ops, then allow 2 uses
30787 unsigned AllowedUses =
30788 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
30790 if (LdNode.getOpcode() == ISD::BITCAST) {
30791 // Don't duplicate a load with other uses.
30792 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
30795 AllowedUses = 1; // only allow 1 load use if we have a bitcast
30796 LdNode = LdNode.getOperand(0);
30799 if (!ISD::isNormalLoad(LdNode.getNode()))
30802 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
30804 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
30807 // If there's a bitcast before the shuffle, check if the load type and
30808 // alignment is valid.
30809 unsigned Align = LN0->getAlignment();
30810 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30811 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
30812 EltVT.getTypeForEVT(*DAG.getContext()));
30814 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
30817 // All checks match so transform back to vector_shuffle so that DAG combiner
30818 // can finish the job
30821 // Create a shuffle node, taking into account the case that it's a unary shuffle.
30822 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
30823 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
30825 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
30826 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
30830 // Try to match patterns such as
30831 // (i16 bitcast (v16i1 x))
30833 // (i16 movmsk (v16i8 sext (v16i1 x)))
30834 // before the illegal vector is scalarized on subtargets that don't have legal vXi1 types.
30836 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
30837 const X86Subtarget &Subtarget) {
30838 EVT VT = BitCast.getValueType();
30839 SDValue N0 = BitCast.getOperand(0);
30840 EVT VecVT = N0->getValueType(0);
30842 if (!VT.isScalarInteger() || !VecVT.isSimple())
30845 // With AVX512 vxi1 types are legal and we prefer using k-regs.
30846 // MOVMSK is supported in SSE2 or later.
30847 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
30850 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
30851 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
30852 // v8i16 and v16i16.
30853 // For these two cases, we can shuffle the upper element bytes to a
30854 // consecutive sequence at the start of the vector and treat the results as
30855 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
30856 // for v16i16 this is not the case, because the shuffle is expensive, so we
30857 // avoid sign-extending to this type entirely.
30858 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30859 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
30861 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30862 switch (VecVT.getSimpleVT().SimpleTy) {
30866 SExtVT = MVT::v2i64;
30867 FPCastVT = MVT::v2f64;
30870 SExtVT = MVT::v4i32;
30871 FPCastVT = MVT::v4f32;
30872 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30873 // sign-extend to a 256-bit operation to avoid truncation.
30874 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30875 N0->getOperand(0).getValueType().is256BitVector()) {
30876 SExtVT = MVT::v4i64;
30877 FPCastVT = MVT::v4f64;
30881 SExtVT = MVT::v8i16;
30882 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30883 // sign-extend to a 256-bit operation to match the compare.
30884 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30885 // 256-bit because the shuffle is cheaper than sign extending the result of
30887 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30888 (N0->getOperand(0).getValueType().is256BitVector() ||
30889 N0->getOperand(0).getValueType().is512BitVector())) {
30890 SExtVT = MVT::v8i32;
30891 FPCastVT = MVT::v8f32;
30895 SExtVT = MVT::v16i8;
30896 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30897 // it is not profitable to sign-extend to 256-bit because this will
30898 // require an extra cross-lane shuffle which is more expensive than
30899 // truncating the result of the compare to 128-bits.
30902 SExtVT = MVT::v32i8;
30907 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30909 if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8) {
30910 V = getPMOVMSKB(DL, V, DAG, Subtarget);
30911 return DAG.getZExtOrTrunc(V, DL, VT);
30914 if (SExtVT == MVT::v8i16) {
30915 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
30916 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30917 DAG.getUNDEF(MVT::v8i16));
30919 assert(SExtVT.getScalarType() != MVT::i16 &&
30920 "Vectors of i16 must be packed");
30921 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30922 V = DAG.getBitcast(FPCastVT, V);
30923 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30924 return DAG.getZExtOrTrunc(V, DL, VT);
30927 // Convert a vXi1 constant build vector to the same width scalar integer.
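// Illustrative example (inferred from the code below, not from the original
// source): (v4i1 <1, 0, 1, 1>) becomes (i4 0b1101); element i maps to bit i
// of the resulting scalar.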
30928 static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) {
30929 EVT SrcVT = Op.getValueType();
30930 assert(SrcVT.getVectorElementType() == MVT::i1 &&
30931 "Expected a vXi1 vector");
30932 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
30933 "Expected a constant build vector");
30935 APInt Imm(SrcVT.getVectorNumElements(), 0);
30936 for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) {
30937 SDValue In = Op.getOperand(Idx);
30938 if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1))
30941 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth());
30942 return DAG.getConstant(Imm, SDLoc(Op), IntVT);
30945 static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30946 TargetLowering::DAGCombinerInfo &DCI,
30947 const X86Subtarget &Subtarget) {
30948 assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast");
30950 if (!DCI.isBeforeLegalizeOps())
30953 // Only do this if we have k-registers.
30954 if (!Subtarget.hasAVX512())
30957 EVT DstVT = N->getValueType(0);
30958 SDValue Op = N->getOperand(0);
30959 EVT SrcVT = Op.getValueType();
30961 if (!Op.hasOneUse())
30964 // Look for logic ops.
30965 if (Op.getOpcode() != ISD::AND &&
30966 Op.getOpcode() != ISD::OR &&
30967 Op.getOpcode() != ISD::XOR)
30970 // Make sure we have a bitcast between mask registers and a scalar type.
30971 if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
30972 DstVT.isScalarInteger()) &&
30973 !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 &&
30974 SrcVT.isScalarInteger()))
30977 SDValue LHS = Op.getOperand(0);
30978 SDValue RHS = Op.getOperand(1);
30980 if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST &&
30981 LHS.getOperand(0).getValueType() == DstVT)
30982 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0),
30983 DAG.getBitcast(DstVT, RHS));
30985 if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST &&
30986 RHS.getOperand(0).getValueType() == DstVT)
30987 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
30988 DAG.getBitcast(DstVT, LHS), RHS.getOperand(0));
30990 // If the RHS is a vXi1 build vector, this is a good reason to flip too.
30991 // Most of these have to move a constant from the scalar domain anyway.
30992 if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) {
30993 RHS = combinevXi1ConstantToInteger(RHS, DAG);
30994 return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT,
30995 DAG.getBitcast(DstVT, LHS), RHS);
31001 static SDValue createMMXBuildVector(SDValue N, SelectionDAG &DAG,
31002 const X86Subtarget &Subtarget) {
31004 unsigned NumElts = N.getNumOperands();
31006 auto *BV = cast<BuildVectorSDNode>(N);
31007 SDValue Splat = BV->getSplatValue();
31009 // Build MMX element from integer GPR or SSE float values.
31010 auto CreateMMXElement = [&](SDValue V) {
31012 return DAG.getUNDEF(MVT::x86mmx);
31013 if (V.getValueType().isFloatingPoint()) {
31014 if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) {
31015 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V);
31016 V = DAG.getBitcast(MVT::v2i64, V);
31017 return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V);
31019 V = DAG.getBitcast(MVT::i32, V);
31021 V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32);
31023 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V);
31026 // Convert build vector ops to MMX data in the bottom elements.
31027 SmallVector<SDValue, 8> Ops;
31029 // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element.
31031 if (Splat.isUndef())
31032 return DAG.getUNDEF(MVT::x86mmx);
31034 Splat = CreateMMXElement(Splat);
31036 if (Subtarget.hasSSE1()) {
31037 // Unpack v8i8 to splat the i8 elements into the lowest 16 bits.
31039 Splat = DAG.getNode(
31040 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31041 DAG.getConstant(Intrinsic::x86_mmx_punpcklbw, DL, MVT::i32), Splat,
31044 // Use PSHUFW to repeat 16-bit elements.
31045 unsigned ShufMask = (NumElts > 2 ? 0 : 0x44);
31046 return DAG.getNode(
31047 ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx,
31048 DAG.getConstant(Intrinsic::x86_sse_pshuf_w, DL, MVT::i32), Splat,
31049 DAG.getConstant(ShufMask, DL, MVT::i8));
31051 Ops.append(NumElts, Splat);
31053 for (unsigned i = 0; i != NumElts; ++i)
31054 Ops.push_back(CreateMMXElement(N.getOperand(i)));
31057 // Use tree of PUNPCKLs to build up general MMX vector.
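// Illustrative example for v8i8 (not from the original source): the first
// pass pairs byte elements with punpcklbw, the second pairs the resulting
// 16-bit elements with punpcklwd, and the final pass merges the two 32-bit
// halves with punpckldq.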
31058 while (Ops.size() > 1) {
31059 unsigned NumOps = Ops.size();
31060 unsigned IntrinOp =
31061 (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq
31062 : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd
31063 : Intrinsic::x86_mmx_punpcklbw));
31064 SDValue Intrin = DAG.getConstant(IntrinOp, DL, MVT::i32);
31065 for (unsigned i = 0; i != NumOps; i += 2)
31066 Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin,
31067 Ops[i], Ops[i + 1]);
31068 Ops.resize(NumOps / 2);
31074 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
31075 TargetLowering::DAGCombinerInfo &DCI,
31076 const X86Subtarget &Subtarget) {
31077 SDValue N0 = N->getOperand(0);
31078 EVT VT = N->getValueType(0);
31079 EVT SrcVT = N0.getValueType();
31081 // Try to match patterns such as
31082 // (i16 bitcast (v16i1 x))
31084 // (i16 movmsk (v16i8 sext (v16i1 x)))
31085 // before the setcc result is scalarized on subtargets that don't have legal vXi1 types.
31087 if (DCI.isBeforeLegalize()) {
31088 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
31091 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31092 // type, widen both sides to avoid a trip through memory.
31093 if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
31094 Subtarget.hasAVX512()) {
31096 N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0);
31097 N0 = DAG.getBitcast(MVT::v8i1, N0);
31098 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0,
31099 DAG.getIntPtrConstant(0, dl));
31102 // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
31103 // type, widen both sides to avoid a trip through memory.
31104 if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() &&
31105 Subtarget.hasAVX512()) {
31107 unsigned NumConcats = 8 / SrcVT.getVectorNumElements();
31108 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT));
31110 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
31111 N0 = DAG.getBitcast(MVT::i8, N0);
31112 return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
31116 // Since MMX types are special and don't usually play with other vector types,
31117 // it's better to handle them early to be sure we emit efficient code by
31118 // avoiding store-load conversions.
31119 if (VT == MVT::x86mmx) {
31120 // Detect MMX constant vectors.
31122 SmallVector<APInt, 1> EltBits;
31123 if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) {
31125 // Handle zero-extension of i32 with MOVD.
31126 if (EltBits[0].countLeadingZeros() >= 32)
31127 return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT,
31128 DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32));
31129 // Else, bitcast to a double.
31130 // TODO - investigate supporting sext 32-bit immediates on x86_64.
31131 APFloat F64(APFloat::IEEEdouble(), EltBits[0]);
31132 return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64));
31135 // Detect bitcasts to x86mmx low word.
31136 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31137 (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) &&
31138 N0.getOperand(0).getValueType() == SrcVT.getScalarType()) {
31139 bool LowUndef = true, AllUndefOrZero = true;
31140 for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) {
31141 SDValue Op = N0.getOperand(i);
31142 LowUndef &= Op.isUndef() || (i >= e/2);
31143 AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op));
31145 if (AllUndefOrZero) {
31146 SDValue N00 = N0.getOperand(0);
31148 N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32)
31149 : DAG.getZExtOrTrunc(N00, dl, MVT::i32);
31150 return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00);
31154 // Detect bitcasts of 64-bit build vectors and convert to a
31155 // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the lowest element.
31157 if (N0.getOpcode() == ISD::BUILD_VECTOR &&
31158 (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 ||
31159 SrcVT == MVT::v8i8))
31160 return createMMXBuildVector(N0, DAG, Subtarget);
31162 // Detect bitcasts between element or subvector extraction to x86mmx.
31163 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
31164 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
31165 isNullConstant(N0.getOperand(1))) {
31166 SDValue N00 = N0.getOperand(0);
31167 if (N00.getValueType().is128BitVector())
31168 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
31169 DAG.getBitcast(MVT::v2i64, N00));
31172 // Detect bitcasts from FP_TO_SINT to x86mmx.
31173 if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) {
31175 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
31176 DAG.getUNDEF(MVT::v2i32));
31177 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
31178 DAG.getBitcast(MVT::v2i64, Res));
31182 // Try to remove a bitcast of constant vXi1 vector. We have to legalize
31183 // most of these to scalar anyway.
31184 if (Subtarget.hasAVX512() && VT.isScalarInteger() &&
31185 SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 &&
31186 ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
31187 return combinevXi1ConstantToInteger(N0, DAG);
31190 // Try to remove bitcasts from input and output of mask arithmetic to
31191 // remove GPR<->K-register crossings.
31192 if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
31195 // Convert a bitcasted integer logic operation that has one bitcasted
31196 // floating-point operand into a floating-point logic operation. This may
31197 // create a load of a constant, but that is cheaper than materializing the
31198 // constant in an integer register and transferring it to an SSE register or
31199 // transferring the SSE operand to integer register and back.
31201 switch (N0.getOpcode()) {
31202 case ISD::AND: FPOpcode = X86ISD::FAND; break;
31203 case ISD::OR: FPOpcode = X86ISD::FOR; break;
31204 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
31205 default: return SDValue();
31208 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
31209 (Subtarget.hasSSE2() && VT == MVT::f64)))
31212 SDValue LogicOp0 = N0.getOperand(0);
31213 SDValue LogicOp1 = N0.getOperand(1);
31216 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
31217 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
31218 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
31219 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
31220 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
31221 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
31223 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
31224 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
31225 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
31226 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
31227 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
31228 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
31234 // Match a binop + shuffle pyramid that represents a horizontal reduction over
31235 // the elements of a vector.
31236 // Returns the vector that is being reduced on, or SDValue() if a reduction
31237 // was not matched.
31238 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
31239 ArrayRef<ISD::NodeType> CandidateBinOps) {
31240 // The pattern must end in an extract from index 0.
31241 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
31242 !isNullConstant(Extract->getOperand(1)))
31245 SDValue Op = Extract->getOperand(0);
31246 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
31248 // Match against one of the candidate binary ops.
31249 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
31250 return Op.getOpcode() == unsigned(BinOp);
31254 // At each stage, we're looking for something that looks like:
31255 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
31256 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
31257 // i32 undef, i32 undef, i32 undef, i32 undef>
31258 // %a = binop <8 x i32> %op, %s
31259 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
31260 // we expect something like:
31261 // <4,5,6,7,u,u,u,u>
31262 // <2,3,u,u,u,u,u,u>
31263 // <1,u,u,u,u,u,u,u>
31264 unsigned CandidateBinOp = Op.getOpcode();
31265 for (unsigned i = 0; i < Stages; ++i) {
31266 if (Op.getOpcode() != CandidateBinOp)
31269 ShuffleVectorSDNode *Shuffle =
31270 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
31272 Op = Op.getOperand(1);
31274 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
31275 Op = Op.getOperand(0);
31278 // The first operand of the shuffle should be the same as the other operand of the binop.
31280 if (!Shuffle || Shuffle->getOperand(0) != Op)
31283 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
31284 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
31285 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
31289 BinOp = CandidateBinOp;
31293 // Given a select, detect the following pattern:
31294 // 1: %2 = zext <N x i8> %0 to <N x i32>
31295 // 2: %3 = zext <N x i8> %1 to <N x i32>
31296 // 3: %4 = sub nsw <N x i32> %2, %3
31297 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
31298 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
31299 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
31300 // This is useful as it is the input into a SAD pattern.
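// In other words, the select computes |%2 - %3| for each element, which is
// exactly the per-element absolute difference that feeds the PSADBW matching
// below.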
31301 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
31303 // Check the condition of the select instruction is greater-than.
31304 SDValue SetCC = Select->getOperand(0);
31305 if (SetCC.getOpcode() != ISD::SETCC)
31307 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
31308 if (CC != ISD::SETGT && CC != ISD::SETLT)
31311 SDValue SelectOp1 = Select->getOperand(1);
31312 SDValue SelectOp2 = Select->getOperand(2);
31314 // The following instructions assume SelectOp1 is the subtraction operand
31315 // and SelectOp2 is the negation operand.
31316 // In the case of SETLT this is the other way around.
31317 if (CC == ISD::SETLT)
31318 std::swap(SelectOp1, SelectOp2);
31320 // The second operand of the select should be the negation of the first
31321 // operand, which is implemented as 0 - SelectOp1.
31322 if (!(SelectOp2.getOpcode() == ISD::SUB &&
31323 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
31324 SelectOp2.getOperand(1) == SelectOp1))
31327 // The first operand of SetCC is the first operand of the select, which is the
31328 // difference between the two input vectors.
31329 if (SetCC.getOperand(0) != SelectOp1)
31332 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
31334 if ((CC == ISD::SETLT) &&
31335 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
31336 SplatVal.isOneValue()) ||
31337 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
31340 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
31341 if ((CC == ISD::SETGT) &&
31342 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
31343 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
31346 // The first operand of the select is the difference between the two input vectors.
31348 if (SelectOp1.getOpcode() != ISD::SUB)
31351 Op0 = SelectOp1.getOperand(0);
31352 Op1 = SelectOp1.getOperand(1);
31354 // Check if the operands of the sub are zero-extended from vectors of i8.
31355 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
31356 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
31357 Op1.getOpcode() != ISD::ZERO_EXTEND ||
31358 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
31364 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs to these zexts.
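// (As a reminder, PSADBW sums the absolute differences of each aligned group
// of 8 byte pairs into one 64-bit lane, e.g. two v16i8 inputs produce a v2i64
// result.)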
31366 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
31367 const SDValue &Zext1, const SDLoc &DL,
31368 const X86Subtarget &Subtarget) {
31369 // Find the appropriate width for the PSADBW.
31370 EVT InVT = Zext0.getOperand(0).getValueType();
31371 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
31373 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
31374 // fill in the missing vector elements with 0.
31375 unsigned NumConcat = RegSize / InVT.getSizeInBits();
31376 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
31377 Ops[0] = Zext0.getOperand(0);
31378 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
31379 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31380 Ops[0] = Zext1.getOperand(0);
31381 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
31383 // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW.
31384 auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
31385 ArrayRef<SDValue> Ops) {
31386 MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64);
31387 return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops);
31389 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
31390 return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 },
31394 // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with PHMINPOSUW.
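// UMIN of v8i16 maps directly onto PHMINPOSUW; SMIN/SMAX/UMAX are handled by
// XORing with a bias mask before and after the PHMINPOSUW, and v16i8 inputs
// are additionally UMIN-reduced in byte pairs (zeroing the odd bytes) so they
// can be treated as v8i16 (see the code below).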
31396 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
31397 const X86Subtarget &Subtarget) {
31398 // Bail without SSE41.
31399 if (!Subtarget.hasSSE41())
31402 EVT ExtractVT = Extract->getValueType(0);
31403 if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8)
31406 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
31408 SDValue Src = matchBinOpReduction(
31409 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
31413 EVT SrcVT = Src.getValueType();
31414 EVT SrcSVT = SrcVT.getScalarType();
31415 if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0)
31419 SDValue MinPos = Src;
31421 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
31422 while (SrcVT.getSizeInBits() > 128) {
31423 unsigned NumElts = SrcVT.getVectorNumElements();
31424 unsigned NumSubElts = NumElts / 2;
31425 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
31426 unsigned SubSizeInBits = SrcVT.getSizeInBits();
31427 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
31428 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
31429 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
31431 assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
31432 (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) &&
31433 "Unexpected value type");
31435 // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
31436 // to flip the value accordingly.
31438 unsigned MaskEltsBits = ExtractVT.getSizeInBits();
31439 if (BinOp == ISD::SMAX)
31440 Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT);
31441 else if (BinOp == ISD::SMIN)
31442 Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT);
31443 else if (BinOp == ISD::UMAX)
31444 Mask = DAG.getConstant(APInt::getAllOnesValue(MaskEltsBits), DL, SrcVT);
31447 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
31449 // For v16i8 cases we need to perform UMIN on pairs of byte elements,
31450 // shuffling each upper element down and inserting zeros. This means that the
31451 // v16i8 UMIN will leave the upper element as zero, performing the zero-extension
31452 // ready for the PHMINPOS.
31453 if (ExtractVT == MVT::i8) {
31454 SDValue Upper = DAG.getVectorShuffle(
31455 SrcVT, DL, MinPos, getZeroVector(MVT::v16i8, Subtarget, DAG, DL),
31456 {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16});
31457 MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper);
31460 // Perform the PHMINPOS on a v8i16 vector.
31461 MinPos = DAG.getBitcast(MVT::v8i16, MinPos);
31462 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos);
31463 MinPos = DAG.getBitcast(SrcVT, MinPos);
31466 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
31468 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
31469 DAG.getIntPtrConstant(0, DL));
31472 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
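// Illustrative example (not from the original source): for an any_of
// reduction of a v4i32 all-ones/all-zeros vector this produces
// (movmskps != 0); for all_of it produces (movmskps == 0xF), i.e. all four
// sign bits set.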
31473 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
31475 const X86Subtarget &Subtarget) {
31476 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
31477 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
31480 EVT ExtractVT = Extract->getValueType(0);
31481 unsigned BitWidth = ExtractVT.getSizeInBits();
31482 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
31483 ExtractVT != MVT::i8)
31486 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
31487 unsigned BinOp = 0;
31488 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
31492 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
31493 // which we can't support here for now.
31494 if (Match.getScalarValueSizeInBits() != BitWidth)
31497 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
31498 unsigned MatchSizeInBits = Match.getValueSizeInBits();
31499 if (!(MatchSizeInBits == 128 ||
31500 (MatchSizeInBits == 256 &&
31501 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
31504 // Don't bother performing this for 2-element vectors.
31505 if (Match.getValueType().getVectorNumElements() <= 2)
31508 // Check that we are extracting a reduction of all sign bits.
31509 if (DAG.ComputeNumSignBits(Match) != BitWidth)
31512 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
31514 if (64 == BitWidth || 32 == BitWidth)
31515 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
31516 MatchSizeInBits / BitWidth);
31518 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
31521 ISD::CondCode CondCode;
31522 if (BinOp == ISD::OR) {
31523 // any_of -> MOVMSK != 0
31524 CompareBits = APInt::getNullValue(32);
31525 CondCode = ISD::CondCode::SETNE;
31527 // all_of -> MOVMSK == ((1 << NumElts) - 1)
31528 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
31529 CondCode = ISD::CondCode::SETEQ;
31532 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
31534 unsigned ResWidth = std::max(BitWidth, 32u);
31535 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
31537 SDValue Zero = DAG.getConstant(0, DL, ResVT);
31538 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
31539 SDValue Res = DAG.getBitcast(MaskVT, Match);
31540 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
31541 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
31542 Ones, Zero, CondCode);
31543 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
31546 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
31547 const X86Subtarget &Subtarget) {
31548 // PSADBW is only supported on SSE2 and up.
31549 if (!Subtarget.hasSSE2())
31552 // Verify the type we're extracting from is any integer type above i16.
31553 EVT VT = Extract->getOperand(0).getValueType();
31554 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
31557 unsigned RegSize = 128;
31558 if (Subtarget.useBWIRegs())
31560 else if (Subtarget.hasAVX())
31563 // We handle up to v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
31564 // TODO: We should be able to handle larger vectors by splitting them before
31565 // feeding them into several SADs, and then reducing over those.
31566 if (RegSize / VT.getVectorNumElements() < 8)
31569 // Match shuffle + add pyramid.
31570 unsigned BinOp = 0;
31571 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
31573 // The operand is expected to be zero-extended from i8
31574 // (verified in detectZextAbsDiff).
31575 // In order to convert to i64 and above, an additional any/zero/sign
31576 // extend is expected.
31577 // The zero extend from 32 bits has no mathematical effect on the result.
31578 // Also, the sign extend is effectively a zero extend
31579 // (it extends the sign bit, which is zero).
31580 // So it is correct to skip the sign/zero extend instruction.
31581 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
31582 Root.getOpcode() == ISD::ZERO_EXTEND ||
31583 Root.getOpcode() == ISD::ANY_EXTEND))
31584 Root = Root.getOperand(0);
31586 // If there was a match, we want Root to be a select that is the root of an
31587 // abs-diff pattern.
31588 if (!Root || (Root.getOpcode() != ISD::VSELECT))
31591 // Check whether we have an abs-diff pattern feeding into the select.
31592 SDValue Zext0, Zext1;
31593 if (!detectZextAbsDiff(Root, Zext0, Zext1))
31596 // Create the SAD instruction.
31598 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget);
31600 // If the original vector was wider than 8 elements, sum over the results
31601 // in the SAD vector.
31602 unsigned Stages = Log2_32(VT.getVectorNumElements());
31603 MVT SadVT = SAD.getSimpleValueType();
31605 unsigned SadElems = SadVT.getVectorNumElements();
31607 for (unsigned i = Stages - 3; i > 0; --i) {
31608 SmallVector<int, 16> Mask(SadElems, -1);
31609 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
31610 Mask[j] = MaskEnd + j;
31613 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
31614 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
31618 MVT Type = Extract->getSimpleValueType(0);
31619 unsigned TypeSizeInBits = Type.getSizeInBits();
31620 // Return the lowest TypeSizeInBits bits.
31621 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
31622 SAD = DAG.getBitcast(ResVT, SAD);
31623 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
31624 Extract->getOperand(1));
31627 // Attempt to peek through a target shuffle and extract the scalar from the source.
31629 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
31630 TargetLowering::DAGCombinerInfo &DCI,
31631 const X86Subtarget &Subtarget) {
31632 if (DCI.isBeforeLegalizeOps())
31635 SDValue Src = N->getOperand(0);
31636 SDValue Idx = N->getOperand(1);
31638 EVT VT = N->getValueType(0);
31639 EVT SrcVT = Src.getValueType();
31640 EVT SrcSVT = SrcVT.getVectorElementType();
31641 unsigned NumSrcElts = SrcVT.getVectorNumElements();
31643 // Don't attempt this for boolean mask vectors or unknown extraction indices.
31644 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
31647 // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
31648 if (X86ISD::VBROADCAST == Src.getOpcode() &&
31649 Src.getOperand(0).getValueType() == VT)
31650 return Src.getOperand(0);
31652 // Resolve the target shuffle inputs and mask.
31653 SmallVector<int, 16> Mask;
31654 SmallVector<SDValue, 2> Ops;
31655 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
31658 // Attempt to narrow/widen the shuffle mask to the correct size.
31659 if (Mask.size() != NumSrcElts) {
31660 if ((NumSrcElts % Mask.size()) == 0) {
31661 SmallVector<int, 16> ScaledMask;
31662 int Scale = NumSrcElts / Mask.size();
31663 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
31664 Mask = std::move(ScaledMask);
31665 } else if ((Mask.size() % NumSrcElts) == 0) {
31666 SmallVector<int, 16> WidenedMask;
31667 while (Mask.size() > NumSrcElts &&
31668 canWidenShuffleElements(Mask, WidenedMask))
31669 Mask = std::move(WidenedMask);
31670 // TODO - investigate support for wider shuffle masks with known upper
31671 // undef/zero elements for implicit zero-extension.
31675 // Check if narrowing/widening failed.
31676 if (Mask.size() != NumSrcElts)
31679 int SrcIdx = Mask[N->getConstantOperandVal(1)];
31682 // If the shuffle source element is undef/zero then we can just accept it.
31683 if (SrcIdx == SM_SentinelUndef)
31684 return DAG.getUNDEF(VT);
31686 if (SrcIdx == SM_SentinelZero)
31687 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
31688 : DAG.getConstant(0, dl, VT);
31690 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
31691 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
31692 SrcIdx = SrcIdx % Mask.size();
31694 // We can only extract other elements from 128-bit vectors and in certain
31695 // circumstances, depending on SSE-level.
31696 // TODO: Investigate using extract_subvector for larger vectors.
31697 // TODO: Investigate float/double extraction if it will be just stored.
31698 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
31699 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
31700 assert(SrcSVT == VT && "Unexpected extraction type");
31701 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
31702 DAG.getIntPtrConstant(SrcIdx, dl));
31705 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
31706 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
31707 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
31708 "Unexpected extraction type");
31709 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
31710 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
31711 DAG.getIntPtrConstant(SrcIdx, dl));
31712 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
31718 /// Detect vector gather/scatter index generation and convert it from being a
31719 /// bunch of shuffles and extracts into a somewhat faster sequence.
31720 /// For i686, the best sequence is apparently storing the value and loading
31721 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
31722 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
31723 TargetLowering::DAGCombinerInfo &DCI,
31724 const X86Subtarget &Subtarget) {
31725 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
31728 // TODO - Remove this once we can handle the implicit zero-extension of
31729 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
31730 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
31731 // combineBasicSADPattern.
31732 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
31735 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
31738 SDValue InputVector = N->getOperand(0);
31739 SDValue EltIdx = N->getOperand(1);
31741 EVT SrcVT = InputVector.getValueType();
31742 EVT VT = N->getValueType(0);
31743 SDLoc dl(InputVector);
31745 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
31746 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31747 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
31748 SDValue MMXSrc = InputVector.getOperand(0);
31750 // The bitcast source is a direct mmx result.
31751 if (MMXSrc.getValueType() == MVT::x86mmx)
31752 return DAG.getBitcast(VT, InputVector);
31755 // Detect mmx to i32 conversion through a v2i32 elt extract.
31756 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
31757 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
31758 SDValue MMXSrc = InputVector.getOperand(0);
31760 // The bitcast source is a direct mmx result.
31761 if (MMXSrc.getValueType() == MVT::x86mmx)
31762 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
31765 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
31766 isa<ConstantSDNode>(EltIdx) &&
31767 isa<ConstantSDNode>(InputVector.getOperand(0))) {
31768 uint64_t ExtractedElt = N->getConstantOperandVal(1);
31769 auto *InputC = cast<ConstantSDNode>(InputVector.getOperand(0));
31770 const APInt &InputValue = InputC->getAPIntValue();
31771 uint64_t Res = InputValue[ExtractedElt];
31772 return DAG.getConstant(Res, dl, MVT::i1);
31775 // Check whether this extract is the root of a sum of absolute differences
31776 // pattern. This has to be done here because we really want it to happen
31777 // pre-legalization.
31778 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
31781 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
31782 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
31785 // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW.
31786 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
31792 /// If a vector select has an operand that is -1 or 0, try to simplify the
31793 /// select to a bitwise logic operation.
31794 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
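/// Illustrative examples (not from the original source), valid once Cond is
/// known to be a sign-splat mask:
///   vselect Cond, -1, X  --> or Cond, X
///   vselect Cond, X, 0   --> and Cond, X
///   vselect Cond, 0, X   --> andn Cond, X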
31796 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
31797 TargetLowering::DAGCombinerInfo &DCI,
31798 const X86Subtarget &Subtarget) {
31799 SDValue Cond = N->getOperand(0);
31800 SDValue LHS = N->getOperand(1);
31801 SDValue RHS = N->getOperand(2);
31802 EVT VT = LHS.getValueType();
31803 EVT CondVT = Cond.getValueType();
31805 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31807 if (N->getOpcode() != ISD::VSELECT)
31810 assert(CondVT.isVector() && "Vector select expects a vector selector!");
31812 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
31813 // Check if the first operand is all zeros and Cond type is vXi1.
31814 // This situation only applies to avx512.
31815 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
31816 CondVT.getVectorElementType() == MVT::i1) {
31817 // Invert the cond to not(cond) : xor(op,allones)=not(op)
31818 SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
31819 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
31820 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
31823 // To use the condition operand as a bitwise mask, it must have elements that
31824 // are the same size as the select elements. I.e., the condition operand must
31825 // have already been promoted from the IR select condition type <N x i1>.
31826 // Don't check if the types themselves are equal because that excludes
31827 // vector floating-point selects.
31828 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
31831 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
31832 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
31834 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
31836 if (!TValIsAllOnes && !FValIsAllZeros &&
31837 // Check if the selector will be produced by CMPP*/PCMP*.
31838 Cond.getOpcode() == ISD::SETCC &&
31839 // Check if SETCC has already been promoted.
31840 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
31842 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
31844 if (TValIsAllZeros || FValIsAllOnes) {
31845 SDValue CC = Cond.getOperand(2);
31846 ISD::CondCode NewCC =
31847 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
31848 Cond.getOperand(0).getValueType().isInteger());
31849 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
31851 std::swap(LHS, RHS);
31852 TValIsAllOnes = FValIsAllOnes;
31853 FValIsAllZeros = TValIsAllZeros;
31857 // Cond value must be 'sign splat' to be converted to a logical op.
31858 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
31861 // vselect Cond, 111..., 000... -> Cond
31862 if (TValIsAllOnes && FValIsAllZeros)
31863 return DAG.getBitcast(VT, Cond);
31865 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
31868 // vselect Cond, 111..., X -> or Cond, X
31869 if (TValIsAllOnes) {
31870 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
31871 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
31872 return DAG.getBitcast(VT, Or);
31875 // vselect Cond, X, 000... -> and Cond, X
31876 if (FValIsAllZeros) {
31877 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
31878 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
31879 return DAG.getBitcast(VT, And);
31882 // vselect Cond, 000..., X -> andn Cond, X
31883 if (TValIsAllZeros) {
31884 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
31885 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
31886 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
31887 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
31888 return DAG.getBitcast(VT, AndN);
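/// Fold a scalar select between two constants into arithmetic on the i1
/// condition. Illustrative example (not from the original source):
/// select Cond, 7, 3 becomes add (mul (zext Cond), 4), 3.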
31894 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
31895 SDValue Cond = N->getOperand(0);
31896 SDValue LHS = N->getOperand(1);
31897 SDValue RHS = N->getOperand(2);
31900 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
31901 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
31902 if (!TrueC || !FalseC)
31905 // Don't do this for crazy integer types.
31906 EVT VT = N->getValueType(0);
31907 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
31910 // We're going to use the condition bit in math or logic ops. We could allow
31911 // this with a wider condition value (post-legalization it becomes an i8),
31912 // but if nothing is creating selects that late, it doesn't matter.
31913 if (Cond.getValueType() != MVT::i1)
31916 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
31917 // 3, 5, or 9 with i32/i64, so those get transformed too.
31918 // TODO: For constants that overflow or do not differ by power-of-2 or small
31919 // multiplier, convert to 'and' + 'add'.
31920 const APInt &TrueVal = TrueC->getAPIntValue();
31921 const APInt &FalseVal = FalseC->getAPIntValue();
31923 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
31927 APInt AbsDiff = Diff.abs();
31928 if (AbsDiff.isPowerOf2() ||
31929 ((VT == MVT::i32 || VT == MVT::i64) &&
31930 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
31932 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
31933 // of the condition can usually be folded into a compare predicate, but even
31934 // without that, the sequence should be cheaper than a CMOV alternative.
31935 if (TrueVal.slt(FalseVal)) {
31936 Cond = DAG.getNOT(DL, Cond, MVT::i1);
31937 std::swap(TrueC, FalseC);
31940 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
31941 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
31943 // Multiply condition by the difference if non-one.
31944 if (!AbsDiff.isOneValue())
31945 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
31947 // Add the base if non-zero.
31948 if (!FalseC->isNullValue())
31949 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
31957 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31958 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31959 TargetLowering::DAGCombinerInfo &DCI,
31960 const X86Subtarget &Subtarget) {
31962 SDValue Cond = N->getOperand(0);
31963 // Get the LHS/RHS of the select.
31964 SDValue LHS = N->getOperand(1);
31965 SDValue RHS = N->getOperand(2);
31966 EVT VT = LHS.getValueType();
31967 EVT CondVT = Cond.getValueType();
31968 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31970 // Convert vselects with a constant condition into shuffles.
31971 if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
31972 DCI.isBeforeLegalizeOps()) {
31973 SmallVector<int, 64> Mask(VT.getVectorNumElements(), -1);
31974 for (int i = 0, Size = Mask.size(); i != Size; ++i) {
31975 SDValue CondElt = Cond->getOperand(i);
31977 // Arbitrarily choose from the 2nd operand if the select condition element is undef or zero.
31979 // TODO: Can we do better by matching patterns such as even/odd?
31980 if (CondElt.isUndef() || isNullConstant(CondElt))
31984 return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
31987 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31988 // instructions match the semantics of the common C idiom x<y?x:y but not
31989 // x<=y?x:y, because of how they handle negative zero (which can be
31990 // ignored in unsafe-math mode).
31991 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31992 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31993 VT != MVT::f80 && VT != MVT::f128 &&
31994 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31995 (Subtarget.hasSSE2() ||
31996 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31997 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31999 unsigned Opcode = 0;
32000 // Check for x CC y ? x : y.
32001 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32002 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32006 // Converting this to a min would handle NaNs incorrectly, and swapping
32007 // the operands would cause it to handle comparisons between positive
32008 // and negative zero incorrectly.
32009 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32010 if (!DAG.getTarget().Options.UnsafeFPMath &&
32011 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
32013 std::swap(LHS, RHS);
32015 Opcode = X86ISD::FMIN;
32018 // Converting this to a min would handle comparisons between positive
32019 // and negative zero incorrectly.
32020 if (!DAG.getTarget().Options.UnsafeFPMath &&
32021 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
32023 Opcode = X86ISD::FMIN;
32026 // Converting this to a min would handle both negative zeros and NaNs
32027 // incorrectly, but we can swap the operands to fix both.
32028 std::swap(LHS, RHS);
32033 Opcode = X86ISD::FMIN;
32037 // Converting this to a max would handle comparisons between positive
32038 // and negative zero incorrectly.
32039 if (!DAG.getTarget().Options.UnsafeFPMath &&
32040 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
32042 Opcode = X86ISD::FMAX;
32045 // Converting this to a max would handle NaNs incorrectly, and swapping
32046 // the operands would cause it to handle comparisons between positive
32047 // and negative zero incorrectly.
32048 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
32049 if (!DAG.getTarget().Options.UnsafeFPMath &&
32050 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
32052 std::swap(LHS, RHS);
32054 Opcode = X86ISD::FMAX;
32057 // Converting this to a max would handle both negative zeros and NaNs
32058 // incorrectly, but we can swap the operands to fix both.
32059 std::swap(LHS, RHS);
32064 Opcode = X86ISD::FMAX;
32067 // Check for x CC y ? y : x -- a min/max with reversed arms.
32068 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
32069 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
32073 // Converting this to a min would handle comparisons between positive
32074 // and negative zero incorrectly, and swapping the operands would
32075 // cause it to handle NaNs incorrectly.
32076 if (!DAG.getTarget().Options.UnsafeFPMath &&
32077 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
32078 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32080 std::swap(LHS, RHS);
32082 Opcode = X86ISD::FMIN;
32085 // Converting this to a min would handle NaNs incorrectly.
32086 if (!DAG.getTarget().Options.UnsafeFPMath &&
32087 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
32089 Opcode = X86ISD::FMIN;
32092 // Converting this to a min would handle both negative zeros and NaNs
32093 // incorrectly, but we can swap the operands to fix both.
32094 std::swap(LHS, RHS);
32099 Opcode = X86ISD::FMIN;
32103 // Converting this to a max would handle NaNs incorrectly.
32104 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32106 Opcode = X86ISD::FMAX;
32109 // Converting this to a max would handle comparisons between positive
32110 // and negative zero incorrectly, and swapping the operands would
32111 // cause it to handle NaNs incorrectly.
32112 if (!DAG.getTarget().Options.UnsafeFPMath &&
32113 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
32114 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
32116 std::swap(LHS, RHS);
32118 Opcode = X86ISD::FMAX;
32121 // Converting this to a max would handle both negative zeros and NaNs
32122 // incorrectly, but we can swap the operands to fix both.
32123 std::swap(LHS, RHS);
32128 Opcode = X86ISD::FMAX;
32134 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
32137 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
32138 // lowering on KNL. In this case we convert it to
32139 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
32140 // The same applies to all vectors of i8 and i16 without BWI.
32141 // Make sure we extend these even before type legalization gets a chance to
32142 // split wide vectors.
32143 // On SKX and later these selects have a proper lowering.
32144 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() &&
32145 CondVT.getVectorElementType() == MVT::i1 &&
32146 VT.getVectorNumElements() > 4 &&
32147 (VT.getVectorElementType() == MVT::i8 ||
32148 VT.getVectorElementType() == MVT::i16)) {
32149 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
32150 DCI.AddToWorklist(Cond.getNode());
32151 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
32154 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
32157 // Canonicalize max and min:
32158 // (x > y) ? x : y -> (x >= y) ? x : y
32159 // (x < y) ? x : y -> (x <= y) ? x : y
32160 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
32161 // the need for an extra compare against zero, e.g.:
32163 // ((x - y) > 0) ? (x - y) : 0 -> ((x - y) >= 0) ? (x - y) : 0
32165 // testl %edi, %edi
32167 // cmovgl %edi, %eax
32171 // cmovsl %eax, %edi
32172 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
32173 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
32174 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
32175 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32180 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
32181 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
32182 Cond.getOperand(0), Cond.getOperand(1), NewCC);
32183 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
32188 // Early exit check
32189 if (!TLI.isTypeLegal(VT))
32192 // Match VSELECTs into subs with unsigned saturation.
32193 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
32194 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
32195 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
32196 (Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
32197 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
32199 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
32200 // left side invert the predicate to simplify logic below.
32202 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
32204 CC = ISD::getSetCCInverse(CC, true);
32205 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
32209 if (Other.getNode() && Other->getNumOperands() == 2 &&
32210 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
32211 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
32212 SDValue CondRHS = Cond->getOperand(1);
32214 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
32215 ArrayRef<SDValue> Ops) {
32216 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
32219 // Look for a general sub with unsigned saturation first.
32220 // x >= y ? x-y : 0 --> subus x, y
32221 // x > y ? x-y : 0 --> subus x, y
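// Illustrative check for <8 x i16>: with x = 10 and y = 60 the select picks
// 0, and PSUBUSW likewise saturates 10 - 60 to 0; with x = 60 and y = 10 both
// forms produce 50.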
32222 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
32223 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
32224 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32227 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
32228 if (isa<BuildVectorSDNode>(CondRHS)) {
32229 // If the RHS is a constant we have to reverse the const
32230 // canonicalization.
32231 // x > C-1 ? x+-C : 0 --> subus x, C
32232 auto MatchSUBUS = [](ConstantSDNode *Op, ConstantSDNode *Cond) {
32233 return Cond->getAPIntValue() == (-Op->getAPIntValue() - 1);
32235 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
32236 ISD::matchBinaryPredicate(OpRHS, CondRHS, MatchSUBUS)) {
32237 OpRHS = DAG.getNode(ISD::SUB, DL, VT,
32238 DAG.getConstant(0, DL, VT), OpRHS);
32239 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32243 // Another special case: If C was a sign bit, the sub has been
32244 // canonicalized into a xor.
32245 // FIXME: Would it be better to use computeKnownBits to determine
32246 // whether it's safe to decanonicalize the xor?
32247 // x s< 0 ? x^C : 0 --> subus x, C
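// Illustrative check for i16 elements with C = 0x8000: if the sign bit of x
// is set (x s< 0, i.e. x >= 0x8000 unsigned), then x ^ 0x8000 == x - 0x8000,
// which is exactly what PSUBUSW computes; otherwise x < 0x8000 unsigned and
// the unsigned-saturating subtract yields 0, matching the select's false arm.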
32248 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode())
32249 if (CC == ISD::SETLT && Other.getOpcode() == ISD::XOR &&
32250 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
32251 OpRHSConst->getAPIntValue().isSignMask()) {
32252 OpRHS = DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT);
32253 // Note that we have to rebuild the RHS constant here to ensure we
32254 // don't rely on particular values of undef lanes.
32255 return SplitOpsAndApply(DAG, Subtarget, DL, VT, { OpLHS, OpRHS },
32262 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
32265 // If this is a *dynamic* select (non-constant condition) and we can match
32266 // this node with one of the variable blend instructions, restructure the
32267 // condition so that blends can use the high (sign) bit of each element and
32268 // use SimplifyDemandedBits to simplify the condition operand.
32269 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
32270 !DCI.isBeforeLegalize() &&
32271 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
32272 unsigned BitWidth = Cond.getScalarValueSizeInBits();
32274 // Don't optimize vector selects that map to mask-registers.
32278 // We can only handle the cases where VSELECT is directly legal on the
32279 // subtarget. We custom lower VSELECT nodes with constant conditions and
32280 // this makes it hard to see whether a dynamic VSELECT will correctly
32281 // lower, so we both check the operation's status and explicitly handle the
32282 // cases where a *dynamic* blend will fail even though a constant-condition
32283 // blend could be custom lowered.
32284 // FIXME: We should find a better way to handle this class of problems.
32285 // Potentially, we should combine constant-condition vselect nodes
32286 // pre-legalization into shuffles and not mark as many types as custom
32288 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
32290 // FIXME: We don't support i16-element blends currently. We could and
32291 // should support them by making *all* the bits in the condition be set
32292 // rather than just the high bit and using an i8-element blend.
32293 if (VT.getVectorElementType() == MVT::i16)
32295 // Dynamic blending was only available from SSE4.1 onward.
32296 if (VT.is128BitVector() && !Subtarget.hasSSE41())
32298 // Byte blends are only available in AVX2
32299 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
32301 // There are no 512-bit blend instructions that use sign bits.
32302 if (VT.is512BitVector())
32305 bool CanShrinkCond = true;
32306 for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
32308 // TODO: Add other opcodes eventually lowered into BLEND.
32309 if (UI->getOpcode() != ISD::VSELECT || UI.getOperandNo() != 0) {
32310 CanShrinkCond = false;
32315 if (CanShrinkCond) {
32316 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
32317 APInt DemandedMask(APInt::getSignMask(BitWidth));
32319 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32320 !DCI.isBeforeLegalizeOps());
32321 if (TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0,
32322 /*AssumeSingleUse*/true)) {
32323 // If we changed the computation somewhere in the DAG, this change will
32324 // affect all users of Cond. Update all the nodes so that we do not use
32325 // the generic VSELECT anymore. Otherwise, we may perform wrong
32326 // optimizations as we messed with the actual expectation for the vector
32328 for (SDNode *U : Cond->uses()) {
32329 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
32330 U->getValueType(0), Cond, U->getOperand(1),
32332 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
32334 DCI.CommitTargetLoweringOpt(TLO);
32335 return SDValue(N, 0);
32340 // Custom action for SELECT MMX
32341 if (VT == MVT::x86mmx) {
32342 LHS = DAG.getBitcast(MVT::i64, LHS);
32343 RHS = DAG.getBitcast(MVT::i64, RHS);
32344 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
32345 return DAG.getBitcast(VT, newSelect);
32352 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
32354 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
32355 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
32356 /// Note that this is only legal for some op/cc combinations.
32357 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
32359 const X86Subtarget &Subtarget) {
32360 // This combine only operates on CMP-like nodes.
32361 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32362 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32365 // Can't replace the cmp if it has more uses than the one we're looking at.
32366 // FIXME: We would like to be able to handle this, but would need to make sure
32367 // all uses were updated.
32368 if (!Cmp.hasOneUse())
32371 // This only applies to variations of the common case:
32372 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
32373 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
32374 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
32375 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
32376 // Using the proper condcodes (see below), overflow is checked for.
32378 // FIXME: We can generalize both constraints:
32379 // - XOR/OR/AND (if they were made to survive AtomicExpand)
32381 // if the result is compared.
32383 SDValue CmpLHS = Cmp.getOperand(0);
32384 SDValue CmpRHS = Cmp.getOperand(1);
32386 if (!CmpLHS.hasOneUse())
32389 unsigned Opc = CmpLHS.getOpcode();
32390 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
32393 SDValue OpRHS = CmpLHS.getOperand(2);
32394 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
32398 APInt Addend = OpRHSC->getAPIntValue();
32399 if (Opc == ISD::ATOMIC_LOAD_SUB)
32402 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
32406 APInt Comparison = CmpRHSC->getAPIntValue();
32408 // If the addend is the negation of the comparison value, then we can do
32409 // a full comparison by emitting the atomic arithmetic as a locked sub.
32410 if (Comparison == -Addend) {
32411 // The CC is fine, but we need to rewrite the LHS of the comparison as an
32413 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
32414 auto AtomicSub = DAG.getAtomic(
32415 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
32416 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
32417 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
32418 AN->getMemOperand());
32419 // If the comparison uses the CF flag we can't use INC/DEC instructions.
32420 bool NeedCF = false;
32423 case X86::COND_A: case X86::COND_AE:
32424 case X86::COND_B: case X86::COND_BE:
32428 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
32429 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32430 DAG.getUNDEF(CmpLHS.getValueType()));
32431 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32435 // We can handle comparisons with zero in a number of cases by manipulating the CC used.
32437 if (!Comparison.isNullValue())
32440 if (CC == X86::COND_S && Addend == 1)
32442 else if (CC == X86::COND_NS && Addend == 1)
32444 else if (CC == X86::COND_G && Addend == -1)
32446 else if (CC == X86::COND_LE && Addend == -1)
32451 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
32452 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
32453 DAG.getUNDEF(CmpLHS.getValueType()));
32454 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
32458 // Check whether a boolean test is testing a boolean value generated by
32459 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition flag.
32462 // Simplify the following patterns:
32463 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
32464 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
32465 // to (Op EFLAGS Cond)
32467 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
32468 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
32469 // to (Op EFLAGS !Cond)
32471 // where Op could be BRCOND or CMOV.
32473 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
32474 // This combine only operates on CMP-like nodes.
32475 if (!(Cmp.getOpcode() == X86ISD::CMP ||
32476 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
32479 // Quit if not used as a boolean value.
32480 if (CC != X86::COND_E && CC != X86::COND_NE)
32483 // Check CMP operands. One of them should be 0 or 1 and the other should be
32484 // a SetCC or a value extended from it.
32485 SDValue Op1 = Cmp.getOperand(0);
32486 SDValue Op2 = Cmp.getOperand(1);
32489 const ConstantSDNode* C = nullptr;
32490 bool needOppositeCond = (CC == X86::COND_E);
32491 bool checkAgainstTrue = false; // Is it a comparison against 1?
32493 if ((C = dyn_cast<ConstantSDNode>(Op1)))
32495 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
32497 else // Quit if neither operand is a constant.
32500 if (C->getZExtValue() == 1) {
32501 needOppositeCond = !needOppositeCond;
32502 checkAgainstTrue = true;
32503 } else if (C->getZExtValue() != 0)
32504 // Quit if the constant is neither 0 nor 1.
32507 bool truncatedToBoolWithAnd = false;
32508 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
32509 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
32510 SetCC.getOpcode() == ISD::TRUNCATE ||
32511 SetCC.getOpcode() == ISD::AND) {
32512 if (SetCC.getOpcode() == ISD::AND) {
32514 if (isOneConstant(SetCC.getOperand(0)))
32516 if (isOneConstant(SetCC.getOperand(1)))
32520 SetCC = SetCC.getOperand(OpIdx);
32521 truncatedToBoolWithAnd = true;
32523 SetCC = SetCC.getOperand(0);
32526 switch (SetCC.getOpcode()) {
32527 case X86ISD::SETCC_CARRY:
32528 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
32529 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
32530 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
32531 // truncated to i1 using 'and'.
32532 if (checkAgainstTrue && !truncatedToBoolWithAnd)
32534 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
32535 "Invalid use of SETCC_CARRY!");
32537 case X86ISD::SETCC:
32538 // Set the condition code or opposite one if necessary.
32539 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
32540 if (needOppositeCond)
32541 CC = X86::GetOppositeBranchCondition(CC);
32542 return SetCC.getOperand(1);
32543 case X86ISD::CMOV: {
32544 // Check whether false/true value has canonical one, i.e. 0 or 1.
32545 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
32546 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
32547 // Quit if true value is not a constant.
32550 // Quit if false value is not a constant.
32552 SDValue Op = SetCC.getOperand(0);
32553 // Skip 'zext' or 'trunc' node.
32554 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
32555 Op.getOpcode() == ISD::TRUNCATE)
32556 Op = Op.getOperand(0);
32557 // A special case for rdrand/rdseed, where 0 is set if false cond is found.
32559 if ((Op.getOpcode() != X86ISD::RDRAND &&
32560 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
32563 // Quit if false value is not the constant 0 or 1.
32564 bool FValIsFalse = true;
32565 if (FVal && FVal->getZExtValue() != 0) {
32566 if (FVal->getZExtValue() != 1)
32568 // If FVal is 1, opposite cond is needed.
32569 needOppositeCond = !needOppositeCond;
32570 FValIsFalse = false;
32572 // Quit if TVal is not the constant opposite of FVal.
32573 if (FValIsFalse && TVal->getZExtValue() != 1)
32575 if (!FValIsFalse && TVal->getZExtValue() != 0)
32577 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
32578 if (needOppositeCond)
32579 CC = X86::GetOppositeBranchCondition(CC);
32580 return SetCC.getOperand(3);
32587 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
32589 /// (X86or (X86setcc) (X86setcc))
32590 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
32591 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
32592 X86::CondCode &CC1, SDValue &Flags,
32594 if (Cond->getOpcode() == X86ISD::CMP) {
32595 if (!isNullConstant(Cond->getOperand(1)))
32598 Cond = Cond->getOperand(0);
32603 SDValue SetCC0, SetCC1;
32604 switch (Cond->getOpcode()) {
32605 default: return false;
32612 SetCC0 = Cond->getOperand(0);
32613 SetCC1 = Cond->getOperand(1);
32617 // Make sure we have SETCC nodes, using the same flags value.
32618 if (SetCC0.getOpcode() != X86ISD::SETCC ||
32619 SetCC1.getOpcode() != X86ISD::SETCC ||
32620 SetCC0->getOperand(1) != SetCC1->getOperand(1))
32623 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
32624 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
32625 Flags = SetCC0->getOperand(1);
32629 // When legalizing carry, we create carries via add X, -1
32630 // If that comes from an actual carry, via setcc, we use the
32632 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
32633 if (EFLAGS.getOpcode() == X86ISD::ADD) {
32634 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
32635 SDValue Carry = EFLAGS.getOperand(0);
32636 while (Carry.getOpcode() == ISD::TRUNCATE ||
32637 Carry.getOpcode() == ISD::ZERO_EXTEND ||
32638 Carry.getOpcode() == ISD::SIGN_EXTEND ||
32639 Carry.getOpcode() == ISD::ANY_EXTEND ||
32640 (Carry.getOpcode() == ISD::AND &&
32641 isOneConstant(Carry.getOperand(1))))
32642 Carry = Carry.getOperand(0);
32643 if (Carry.getOpcode() == X86ISD::SETCC ||
32644 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
32645 if (Carry.getConstantOperandVal(0) == X86::COND_B)
32646 return Carry.getOperand(1);
32654 /// Optimize an EFLAGS definition used according to the condition code \p CC
32655 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
32656 /// uses of chain values.
32657 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
32659 const X86Subtarget &Subtarget) {
32660 if (CC == X86::COND_B)
32661 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
32664 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
32666 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
32669 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
32670 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
32671 TargetLowering::DAGCombinerInfo &DCI,
32672 const X86Subtarget &Subtarget) {
32675 SDValue FalseOp = N->getOperand(0);
32676 SDValue TrueOp = N->getOperand(1);
32677 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
32678 SDValue Cond = N->getOperand(3);
32680 // Try to simplify the EFLAGS and condition code operands.
32681 // We can't always do this as FCMOV only supports a subset of X86 cond.
32682 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
32683 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
32684 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
32686 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32690 // If this is a select between two integer constants, try to do some
32691 // optimizations. Note that the operands are ordered the opposite of SELECT
32693 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
32694 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
32695 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
32696 // larger than FalseC (the false value).
32697 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
32698 CC = X86::GetOppositeBranchCondition(CC);
32699 std::swap(TrueC, FalseC);
32700 std::swap(TrueOp, FalseOp);
32703 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
32704 // This is efficient for any integer data type (including i8/i16) and
32706 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
32707 Cond = getSETCC(CC, Cond, DL, DAG);
32709 // Zero extend the condition if needed.
32710 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
32712 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
32713 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
32714 DAG.getConstant(ShAmt, DL, MVT::i8));
32718 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
32719 // for any integer data type, including i8/i16.
32720 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
32721 Cond = getSETCC(CC, Cond, DL, DAG);
32723 // Zero extend the condition if needed.
32724 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
32725 FalseC->getValueType(0), Cond);
32726 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32727 SDValue(FalseC, 0));
32731 // Optimize cases that will turn into an LEA instruction. This requires
32732 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
32733 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
32734 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
32735 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
32737 bool isFastMultiplier = false;
32739 switch ((unsigned char)Diff) {
32741 case 1: // result = add base, cond
32742 case 2: // result = lea base( , cond*2)
32743 case 3: // result = lea base(cond, cond*2)
32744 case 4: // result = lea base( , cond*4)
32745 case 5: // result = lea base(cond, cond*4)
32746 case 8: // result = lea base( , cond*8)
32747 case 9: // result = lea base(cond, cond*8)
32748 isFastMultiplier = true;
32753 if (isFastMultiplier) {
32754 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
32755 Cond = getSETCC(CC, Cond, DL ,DAG);
32756 // Zero extend the condition if needed.
32757 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
32759 // Scale the condition by the difference.
32761 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
32762 DAG.getConstant(Diff, DL, Cond.getValueType()));
32764 // Add the base if non-zero.
32765 if (FalseC->getAPIntValue() != 0)
32766 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
32767 SDValue(FalseC, 0));
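// Worked example (illustrative): Cond ? 13 : 4 has Diff == 9, so the setcc
// result is scaled by 9 (an LEA-friendly multiplier) and 4 is added, giving
// 13 when the condition is true and 4 otherwise.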
32774 // Handle these cases:
32775 // (select (x != c), e, c) -> select (x != c), e, x),
32776 // (select (x == c), c, e) -> select (x == c), x, e)
32777 // where the c is an integer constant, and the "select" is the combination
32778 // of CMOV and CMP.
32780 // The rationale for this change is that the conditional-move from a constant
32781 // needs two instructions, however, conditional-move from a register needs
32782 // only one instruction.
32784 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
32785 // some instruction-combining opportunities. This opt needs to be
32786 // postponed as late as possible.
32788 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
32789 // the DCI.xxxx conditions are provided to postpone the optimization as
32790 // late as possible.
32792 ConstantSDNode *CmpAgainst = nullptr;
32793 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
32794 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
32795 !isa<ConstantSDNode>(Cond.getOperand(0))) {
32797 if (CC == X86::COND_NE &&
32798 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
32799 CC = X86::GetOppositeBranchCondition(CC);
32800 std::swap(TrueOp, FalseOp);
32803 if (CC == X86::COND_E &&
32804 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
32805 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
32806 DAG.getConstant(CC, DL, MVT::i8), Cond };
32807 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32812 // Fold and/or of setcc's to double CMOV:
32813 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
32814 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
32816 // This combine lets us generate:
32817 // cmovcc1 (jcc1 if we don't have CMOV)
32823 // cmovne (jne if we don't have CMOV)
32824 // When we can't use the CMOV instruction, it might increase branch
32826 // When we can use CMOV, or when there is no mispredict, this improves
32827 // throughput and reduces register pressure.
32829 if (CC == X86::COND_NE) {
32831 X86::CondCode CC0, CC1;
32833 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
32835 std::swap(FalseOp, TrueOp);
32836 CC0 = X86::GetOppositeBranchCondition(CC0);
32837 CC1 = X86::GetOppositeBranchCondition(CC1);
32840 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
32842 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
32843 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
32844 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
32852 /// Different mul shrinking modes.
32853 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
32855 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
32856 EVT VT = N->getOperand(0).getValueType();
32857 if (VT.getScalarSizeInBits() != 32)
32860 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
32861 unsigned SignBits[2] = {1, 1};
32862 bool IsPositive[2] = {false, false};
32863 for (unsigned i = 0; i < 2; i++) {
32864 SDValue Opd = N->getOperand(i);
32866 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
32867 // compute signbits for it separately.
32868 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
32869 // For anyextend, it is safe to assume an appropriate number of leading sign bits.
32871 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
32873 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
32878 IsPositive[i] = true;
32879 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
32880 // All the operands of BUILD_VECTOR need to be integer constants.
32881 // Find the smallest value range which all the operands belong to.
32883 IsPositive[i] = true;
32884 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
32885 if (SubOp.isUndef())
32887 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
32890 APInt IntVal = CN->getAPIntValue();
32891 if (IntVal.isNegative())
32892 IsPositive[i] = false;
32893 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
32896 SignBits[i] = DAG.ComputeNumSignBits(Opd);
32897 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
32898 IsPositive[i] = true;
32902 bool AllPositive = IsPositive[0] && IsPositive[1];
32903 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
32904 // When ranges are from -128 ~ 127, use MULS8 mode.
32905 if (MinSignBits >= 25)
32907 // When ranges are from 0 ~ 255, use MULU8 mode.
32908 else if (AllPositive && MinSignBits >= 24)
32910 // When ranges are from -32768 ~ 32767, use MULS16 mode.
32911 else if (MinSignBits >= 17)
32913 // When ranges are from 0 ~ 65535, use MULU16 mode.
32914 else if (AllPositive && MinSignBits >= 16)
32921 /// When the operands of vector mul are extended from smaller size values,
32922 /// like i8 and i16, the type of mul may be shrunk to generate more
32923 /// efficient code. Two typical patterns are handled:
32925 /// %2 = sext/zext <N x i8> %1 to <N x i32>
32926 /// %4 = sext/zext <N x i8> %3 to <N x i32>
32927 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32928 /// %5 = mul <N x i32> %2, %4
32931 /// %2 = zext/sext <N x i16> %1 to <N x i32>
32932 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32933 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32934 /// %5 = mul <N x i32> %2, %4
32936 /// There are four mul shrinking modes:
32937 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32938 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
32939 /// generate pmullw+sext32 for it (MULS8 mode).
32940 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32941 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32942 /// generate pmullw+zext32 for it (MULU8 mode).
32943 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32944 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32945 /// generate pmullw+pmulhw for it (MULS16 mode).
32946 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32947 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32948 /// generate pmullw+pmulhuw for it (MULU16 mode).
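/// A minimal sketch of the MULU8 case (illustrative): for
///   %5 = mul <8 x i32> %2, %4
/// with both operands known to lie in [0, 255], the operands are truncated to
/// <8 x i16>, multiplied with a single pmullw, and the result is zero-extended
/// back to <8 x i32>. The MULS16/MULU16 cases additionally compute the high
/// halves with pmulhw/pmulhuw and interleave them with the low halves
/// (punpcklwd/punpckhwd) to rebuild the full i32 products.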
32949 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32950 const X86Subtarget &Subtarget) {
32951 // Check for legality
32952 // pmullw/pmulhw are not supported by SSE.
32953 if (!Subtarget.hasSSE2())
32956 // Check for profitability
32957 // pmulld is supported since SSE41. It is better to use pmulld
32958 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
32960 bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
32961 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32965 if (!canReduceVMulWidth(N, DAG, Mode))
32969 SDValue N0 = N->getOperand(0);
32970 SDValue N1 = N->getOperand(1);
32971 EVT VT = N->getOperand(0).getValueType();
32972 unsigned NumElts = VT.getVectorNumElements();
32973 if ((NumElts % 2) != 0)
32976 unsigned RegSize = 128;
32977 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32978 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32980 // Shrink the operands of mul.
32981 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32982 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
32984 if (NumElts >= OpsVT.getVectorNumElements()) {
32985 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32986 // lower part is needed.
32987 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32988 if (Mode == MULU8 || Mode == MULS8) {
32989 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
32992 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32993 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32994 // the higher part is also needed.
32995 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32996 ReducedVT, NewN0, NewN1);
32998 // Repack the lower part and higher part result of mul into a wider
33000 // Generate shuffle functioning as punpcklwd.
33001 SmallVector<int, 16> ShuffleMask(NumElts);
33002 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33003 ShuffleMask[2 * i] = i;
33004 ShuffleMask[2 * i + 1] = i + NumElts;
33007 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33008 ResLo = DAG.getBitcast(ResVT, ResLo);
33009 // Generate shuffle functioning as punpckhwd.
33010 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
33011 ShuffleMask[2 * i] = i + NumElts / 2;
33012 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
33015 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
33016 ResHi = DAG.getBitcast(ResVT, ResHi);
33017 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
33020 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
33021 // to legalize the mul explicitly because implicit legalization for type
33022 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
33023 // instructions which will not exist when we explicitly legalize it by
33024 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
33025 // <4 x i16> undef).
33027 // Legalize the operands of mul.
33028 // FIXME: We may be able to handle non-concatenated vectors by insertion.
33029 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
33030 if ((RegSize % ReducedSizeInBits) != 0)
33033 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
33034 DAG.getUNDEF(ReducedVT));
33036 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33038 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
33040 if (Mode == MULU8 || Mode == MULS8) {
33041 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
33043 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33045 // convert the type of mul result to VT.
33046 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33047 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
33048 : ISD::SIGN_EXTEND_VECTOR_INREG,
33050 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33051 DAG.getIntPtrConstant(0, DL));
33053 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
33054 // MULU16/MULS16, both parts are needed.
33055 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
33056 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
33057 OpsVT, NewN0, NewN1);
33059 // Repack the lower part and higher part result of mul into a wider
33060 // result. Make sure the type of mul result is VT.
33061 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
33062 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
33063 Res = DAG.getBitcast(ResVT, Res);
33064 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
33065 DAG.getIntPtrConstant(0, DL));
33070 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
33071 EVT VT, const SDLoc &DL) {
33073 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
33074 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33075 DAG.getConstant(Mult, DL, VT));
33076 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
33077 DAG.getConstant(Shift, DL, MVT::i8));
33078 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33083 auto combineMulMulAddOrSub = [&](bool isAdd) {
33084 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33085 DAG.getConstant(9, DL, VT));
33086 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
33087 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
33096 // mul x, 11 => add ((shl (mul x, 5), 1), x)
33097 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
33099 // mul x, 21 => add ((shl (mul x, 5), 2), x)
33100 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
33102 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
33103 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33104 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
33106 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
33107 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
33109 // mul x, 13 => add ((shl (mul x, 3), 2), x)
33110 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
33112 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
33113 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
33115 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
33116 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33117 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
33119 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
33120 return combineMulMulAddOrSub(/*isAdd*/ false);
33122 // mul x, 28 => add ((mul (mul x, 9), 3), x)
33123 return combineMulMulAddOrSub(/*isAdd*/ true);
33125 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
33126 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
33127 combineMulMulAddOrSub(/*isAdd*/ true));
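// Illustrative check: combineMulMulAddOrSub builds (x * 9) * 3 == 27 * x, so
// the subtract and add forms above yield 26 * x and 28 * x, and adding x once
// more gives 29 * x.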
33129 // mul x, 30 => sub (sub ((shl x, 5), x), x)
33130 return DAG.getNode(
33132 DAG.getNode(ISD::SUB, DL, VT,
33133 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33134 DAG.getConstant(5, DL, MVT::i8)),
33141 // If the upper 17 bits of each element are zero then we can use PMADDWD,
33142 // which is always at least as quick as PMULLD, except on KNL.
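// (With the upper 17 bits clear, each 32-bit element fits in 15 bits, so the
// high i16 half of every element is zero and the low i16 half is non-negative;
// PMADDWD's pairwise multiply-add therefore reduces to lo0*lo1 + 0, which is
// the exact 32-bit product.)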
33143 static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
33144 const X86Subtarget &Subtarget) {
33145 if (!Subtarget.hasSSE2())
33148 if (Subtarget.getProcFamily() == X86Subtarget::IntelKNL)
33151 EVT VT = N->getValueType(0);
33153 // Only support vXi32 vectors.
33154 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
33157 // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
33158 MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
33159 if (!DAG.getTargetLoweringInfo().isTypeLegal(WVT))
33162 SDValue N0 = N->getOperand(0);
33163 SDValue N1 = N->getOperand(1);
33164 APInt Mask17 = APInt::getHighBitsSet(32, 17);
33165 if (!DAG.MaskedValueIsZero(N1, Mask17) ||
33166 !DAG.MaskedValueIsZero(N0, Mask17))
33169 // Use SplitOpsAndApply to handle AVX splitting.
33170 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
33171 ArrayRef<SDValue> Ops) {
33172 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
33173 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
33175 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
33176 { DAG.getBitcast(WVT, N0), DAG.getBitcast(WVT, N1) },
33180 /// Optimize a single multiply with constant into two operations in order to
33181 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
33182 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
33183 TargetLowering::DAGCombinerInfo &DCI,
33184 const X86Subtarget &Subtarget) {
33185 EVT VT = N->getValueType(0);
33187 if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget))
33190 if (DCI.isBeforeLegalize() && VT.isVector())
33191 return reduceVMULWidth(N, DAG, Subtarget);
33193 if (!MulConstantOptimization)
33195 // An imul is usually smaller than the alternative sequence.
33196 if (DAG.getMachineFunction().getFunction().optForMinSize())
33199 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
33202 if (VT != MVT::i64 && VT != MVT::i32)
33205 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
33208 uint64_t MulAmt = C->getZExtValue();
33209 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
33212 uint64_t MulAmt1 = 0;
33213 uint64_t MulAmt2 = 0;
33214 if ((MulAmt % 9) == 0) {
33216 MulAmt2 = MulAmt / 9;
33217 } else if ((MulAmt % 5) == 0) {
33219 MulAmt2 = MulAmt / 5;
33220 } else if ((MulAmt % 3) == 0) {
33222 MulAmt2 = MulAmt / 3;
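// For example (illustrative): MulAmt == 45 decomposes as MulAmt1 == 9 and
// MulAmt2 == 5, so the multiply is rebuilt below from two LEA-friendly
// multiplies: (x * 9) * 5.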
33228 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
33230 if (isPowerOf2_64(MulAmt2) &&
33231 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
33232 // If the second multiplier is pow2, issue it first. We want the multiply by
33233 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
33235 std::swap(MulAmt1, MulAmt2);
33237 if (isPowerOf2_64(MulAmt1))
33238 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33239 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
33241 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
33242 DAG.getConstant(MulAmt1, DL, VT));
33244 if (isPowerOf2_64(MulAmt2))
33245 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
33246 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
33248 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
33249 DAG.getConstant(MulAmt2, DL, VT));
33250 } else if (!Subtarget.slowLEA())
33251 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
33254 assert(MulAmt != 0 &&
33255 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
33256 "Both cases that could cause potential overflows should have "
33257 "already been handled.");
33258 int64_t SignMulAmt = C->getSExtValue();
33259 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
33260 (SignMulAmt != -INT64_MAX)) {
33261 int NumSign = SignMulAmt > 0 ? 1 : -1;
33262 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
33263 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
33264 if (IsPowerOf2_64PlusOne) {
33265 // (mul x, 2^N + 1) => (add (shl x, N), x)
33266 NewMul = DAG.getNode(
33267 ISD::ADD, DL, VT, N->getOperand(0),
33268 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33269 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
33271 } else if (IsPowerOf2_64MinusOne) {
33272 // (mul x, 2^N - 1) => (sub (shl x, N), x)
33273 NewMul = DAG.getNode(
33275 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
33276 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
33280 // To negate, subtract the number from zero
33281 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
33283 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
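// Illustrative checks: mul x, 17 becomes (add (shl x, 4), x), and mul x, -15
// becomes (sub 0, (sub (shl x, 4), x)), i.e. the 15 * x pattern negated.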
33288 // Do not add new nodes to DAG combiner worklist.
33289 DCI.CombineTo(N, NewMul, false);
33294 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
33295 SDValue N0 = N->getOperand(0);
33296 SDValue N1 = N->getOperand(1);
33297 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
33298 EVT VT = N0.getValueType();
33300 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
33301 // since the result of setcc_c is all zeros or all ones.
33302 if (VT.isInteger() && !VT.isVector() &&
33303 N1C && N0.getOpcode() == ISD::AND &&
33304 N0.getOperand(1).getOpcode() == ISD::Constant) {
33305 SDValue N00 = N0.getOperand(0);
33306 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
33307 Mask <<= N1C->getAPIntValue();
33308 bool MaskOK = false;
33309 // We can handle cases concerning bit-widening nodes containing setcc_c if
33310 // we carefully interrogate the mask to make sure we are semantics preserving.
33312 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
33313 // of the underlying setcc_c operation if the setcc_c was zero extended.
33314 // Consider the following example:
33315 // zext(setcc_c) -> i32 0x0000FFFF
33316 // c1 -> i32 0x0000FFFF
33317 // c2 -> i32 0x00000001
33318 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
33319 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
33320 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33322 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
33323 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33325 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
33326 N00.getOpcode() == ISD::ANY_EXTEND) &&
33327 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
33328 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
33330 if (MaskOK && Mask != 0) {
33332 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
33336 // Hardware support for vector shifts is sparse which makes us scalarize the
33337 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL:
33339 // (shl V, 1) -> add V,V
33340 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
33341 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
33342 assert(N0.getValueType().isVector() && "Invalid vector shift type");
33343 // We shift all of the values by one. In many cases we do not have
33344 // hardware support for this operation. This is better expressed as an ADD
33346 if (N1SplatC->getAPIntValue() == 1)
33347 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
33353 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
33354 SDValue N0 = N->getOperand(0);
33355 SDValue N1 = N->getOperand(1);
33356 EVT VT = N0.getValueType();
33357 unsigned Size = VT.getSizeInBits();
33359 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
33360 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
33361 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
33362 // depending on sign of (SarConst - [56,48,32,24,16])
33364 // sexts in X86 are MOVs. The MOVs have the same code size
33365 // as the above SHIFTs (only a SHIFT by 1 has a shorter encoding).
33366 // However the MOVs have 2 advantages over a SHIFT:
33367 // 1. MOVs can write to a register that differs from the source
33368 // 2. MOVs accept memory operands
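// For example (illustrative), for i32: (sra (shl x, 24), 26) becomes
// (sra (sext_inreg x, i8), 2); both extract the sign-extended low byte of x
// and then shift it right arithmetically by 2.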
33370 if (VT.isVector() || N1.getOpcode() != ISD::Constant ||
33371 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
33372 N0.getOperand(1).getOpcode() != ISD::Constant)
33375 SDValue N00 = N0.getOperand(0);
33376 SDValue N01 = N0.getOperand(1);
33377 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
33378 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
33379 EVT CVT = N1.getValueType();
33381 if (SarConst.isNegative())
33384 for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) {
33385 unsigned ShiftSize = SVT.getSizeInBits();
33386 // Skip types without a corresponding sext/zext and ShlConst values
33387 // that are not one of [56,48,32,24,16].
33388 if (ShiftSize >= Size || ShlConst != Size - ShiftSize)
33392 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
33393 SarConst = SarConst - (Size - ShiftSize);
33396 else if (SarConst.isNegative())
33397 return DAG.getNode(ISD::SHL, DL, VT, NN,
33398 DAG.getConstant(-SarConst, DL, CVT));
33400 return DAG.getNode(ISD::SRA, DL, VT, NN,
33401 DAG.getConstant(SarConst, DL, CVT));
33406 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
33407 TargetLowering::DAGCombinerInfo &DCI) {
33408 SDValue N0 = N->getOperand(0);
33409 SDValue N1 = N->getOperand(1);
33410 EVT VT = N0.getValueType();
33412 // Only do this on the last DAG combine as it can interfere with other
33414 if (!DCI.isAfterLegalizeDAG())
33417 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
33418 // TODO: This is a generic DAG combine that became an x86-only combine to
33419 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
33420 // and-not ('andn').
33421 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
33424 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
33425 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
33426 if (!ShiftC || !AndC)
33429 // If we can shrink the constant mask below 8-bits or 32-bits, then this
33430 // transform should reduce code size. It may also enable secondary transforms
33431 // from improved known-bits analysis or instruction selection.
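// For example (illustrative): srl (and X, 0x7F0), 4 becomes
// and (srl X, 4), 0x7F, where the shrunken mask now fits in a sign-extended
// 8-bit immediate.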
33432 APInt MaskVal = AndC->getAPIntValue();
33434 // If this can be matched by a zero extend, don't optimize.
33435 if (MaskVal.isMask()) {
33436 unsigned TO = MaskVal.countTrailingOnes();
33437 if (TO >= 8 && isPowerOf2_32(TO))
33441 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
33442 unsigned OldMaskSize = MaskVal.getMinSignedBits();
33443 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
33444 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
33445 (OldMaskSize > 32 && NewMaskSize <= 32)) {
33446 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
33448 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
33449 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
33450 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
33455 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
33456 TargetLowering::DAGCombinerInfo &DCI,
33457 const X86Subtarget &Subtarget) {
33458 if (N->getOpcode() == ISD::SHL)
33459 if (SDValue V = combineShiftLeft(N, DAG))
33462 if (N->getOpcode() == ISD::SRA)
33463 if (SDValue V = combineShiftRightArithmetic(N, DAG))
33466 if (N->getOpcode() == ISD::SRL)
33467 if (SDValue V = combineShiftRightLogical(N, DAG, DCI))
33473 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
33474 TargetLowering::DAGCombinerInfo &DCI,
33475 const X86Subtarget &Subtarget) {
33476 unsigned Opcode = N->getOpcode();
33477 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
33478 "Unexpected shift opcode");
33480 EVT VT = N->getValueType(0);
33481 SDValue N0 = N->getOperand(0);
33482 SDValue N1 = N->getOperand(1);
33483 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
33484 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
33485 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
33486 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
33487 "Unexpected PACKSS/PACKUS input type");
33489 // Constant Folding.
33490 APInt UndefElts0, UndefElts1;
33491 SmallVector<APInt, 32> EltBits0, EltBits1;
33492 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
33493 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
33494 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
33495 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
33496 unsigned NumLanes = VT.getSizeInBits() / 128;
33497 unsigned NumDstElts = VT.getVectorNumElements();
33498 unsigned NumSrcElts = NumDstElts / 2;
33499 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
33500 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
33501 bool IsSigned = (X86ISD::PACKSS == Opcode);
33503 APInt Undefs(NumDstElts, 0);
33504 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
33505 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
33506 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
33507 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
33508 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
33509 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
33511 if (UndefElts[SrcIdx]) {
33512 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
33516 APInt &Val = EltBits[SrcIdx];
33518 // PACKSS: Truncate signed value with signed saturation.
33519 // Source values less than dst minint are saturated to minint.
33520 // Source values greater than dst maxint are saturated to maxint.
33521 if (Val.isSignedIntN(DstBitsPerElt))
33522 Val = Val.trunc(DstBitsPerElt);
33523 else if (Val.isNegative())
33524 Val = APInt::getSignedMinValue(DstBitsPerElt);
33526 Val = APInt::getSignedMaxValue(DstBitsPerElt);
33528 // PACKUS: Truncate signed value with unsigned saturation.
33529 // Source values less than zero are saturated to zero.
33530 // Source values greater than dst maxuint are saturated to maxuint.
33531 if (Val.isIntN(DstBitsPerElt))
33532 Val = Val.trunc(DstBitsPerElt);
33533 else if (Val.isNegative())
33534 Val = APInt::getNullValue(DstBitsPerElt);
33536 Val = APInt::getAllOnesValue(DstBitsPerElt);
33538 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
33542 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
33545 // Attempt to combine as shuffle.
33547 if (SDValue Res = combineX86ShufflesRecursively(
33548 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33549 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33550 DCI.CombineTo(N, Res);
33557 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
33558 TargetLowering::DAGCombinerInfo &DCI,
33559 const X86Subtarget &Subtarget) {
33560 unsigned Opcode = N->getOpcode();
33561 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
33562 X86ISD::VSRLI == Opcode) &&
33563 "Unexpected shift opcode");
33564 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
33565 EVT VT = N->getValueType(0);
33566 SDValue N0 = N->getOperand(0);
33567 SDValue N1 = N->getOperand(1);
33568 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
33569 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
33570 "Unexpected value type");
33572 // Out of range logical bit shifts are guaranteed to be zero.
33573 // Out of range arithmetic bit shifts splat the sign bit.
33574 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
33575 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
33577 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
33579 ShiftVal = NumBitsPerElt - 1;
33582 // Shift N0 by zero -> N0.
33586 // Shift zero -> zero.
33587 if (ISD::isBuildVectorAllZeros(N0.getNode()))
33588 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
33590 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
33591 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
33592 // TODO - support other sra opcodes as needed.
33593 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
33594 N0.getOpcode() == X86ISD::VSRAI)
33595 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
33597 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
33598 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
33599 N1 == N0.getOperand(1)) {
33600 SDValue N00 = N0.getOperand(0);
33601 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
33602 if (ShiftVal.ult(NumSignBits))
33606 // We can decode 'whole byte' logical bit shifts as shuffles.
33607 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
33609 if (SDValue Res = combineX86ShufflesRecursively(
33610 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33611 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33612 DCI.CombineTo(N, Res);
33617 // Constant Folding.
33619 SmallVector<APInt, 32> EltBits;
33620 if (N->isOnlyUserOf(N0.getNode()) &&
33621 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
33622 assert(EltBits.size() == VT.getVectorNumElements() &&
33623 "Unexpected shift value type");
33624 unsigned ShiftImm = ShiftVal.getZExtValue();
33625 for (APInt &Elt : EltBits) {
33626 if (X86ISD::VSHLI == Opcode)
33628 else if (X86ISD::VSRAI == Opcode)
33629 Elt.ashrInPlace(ShiftImm);
33631 Elt.lshrInPlace(ShiftImm);
33633 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
33639 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
33640 TargetLowering::DAGCombinerInfo &DCI,
33641 const X86Subtarget &Subtarget) {
33643 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
33644 (N->getOpcode() == X86ISD::PINSRW &&
33645 N->getValueType(0) == MVT::v8i16)) &&
33646 "Unexpected vector insertion");
33648 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
33650 if (SDValue Res = combineX86ShufflesRecursively(
33651 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33652 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33653 DCI.CombineTo(N, Res);
33660 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
33661 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
33662 /// OR -> CMPNEQSS.
33663 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
33664 TargetLowering::DAGCombinerInfo &DCI,
33665 const X86Subtarget &Subtarget) {
33668 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
33669 // we're requiring SSE2 for both.
33670 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
33671 SDValue N0 = N->getOperand(0);
33672 SDValue N1 = N->getOperand(1);
33673 SDValue CMP0 = N0->getOperand(1);
33674 SDValue CMP1 = N1->getOperand(1);
33677 // The SETCCs should both refer to the same CMP.
33678 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
33681 SDValue CMP00 = CMP0->getOperand(0);
33682 SDValue CMP01 = CMP0->getOperand(1);
33683 EVT VT = CMP00.getValueType();
33685 if (VT == MVT::f32 || VT == MVT::f64) {
33686 bool ExpectingFlags = false;
33687 // Check for any users that want flags:
33688 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
33689 !ExpectingFlags && UI != UE; ++UI)
33690 switch (UI->getOpcode()) {
33691 default:
33692 case ISD::BR_CC:
33693 case ISD::BRCOND:
33694 case ISD::SELECT:
33695 ExpectingFlags = true;
33696 break;
33697 case ISD::CopyToReg:
33698 case ISD::SIGN_EXTEND:
33699 case ISD::ZERO_EXTEND:
33700 case ISD::ANY_EXTEND:
33701 break;
33702 }
33704 if (!ExpectingFlags) {
33705 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
33706 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
33708 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
33709 X86::CondCode tmp = cc0;
33710 cc0 = cc1;
33711 cc1 = tmp;
33712 }
33714 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
33715 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
33716 // FIXME: need symbolic constants for these magic numbers.
33717 // See X86ATTInstPrinter.cpp:printSSECC().
33718 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
33719 if (Subtarget.hasAVX512()) {
33720 SDValue FSetCC =
33721 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
33722 DAG.getConstant(x86cc, DL, MVT::i8));
33723 // Need to fill with zeros to ensure the bitcast will produce zeroes
33724 // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that.
33725 SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1,
33726 DAG.getConstant(0, DL, MVT::v16i1),
33727 FSetCC, DAG.getIntPtrConstant(0, DL));
33728 return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL,
33729 N->getSimpleValueType(0));
33730 }
33731 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
33732 CMP00.getValueType(), CMP00, CMP01,
33733 DAG.getConstant(x86cc, DL,
33734 MVT::i8));
33736 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
33737 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
33739 if (is64BitFP && !Subtarget.is64Bit()) {
33740 // On a 32-bit target, we cannot bitcast the 64-bit float to a
33741 // 64-bit integer, since that's not a legal type. Since
33742 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
33743 // bits, but can do this little dance to extract the lowest 32 bits
33744 // and work with those going forward.
33745 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
33746 OnesOrZeroesF);
33747 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
33748 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
33749 Vector32, DAG.getIntPtrConstant(0, DL));
33750 IntVT = MVT::i32;
33751 }
33753 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
33754 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
33755 DAG.getConstant(1, DL, IntVT));
33756 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
33757 ANDed);
33758 return OneBitOfTruth;
33759 }
33760 }
33761 }
33762 }
33763 return SDValue();
33764 }
33766 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
33767 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
33768 assert(N->getOpcode() == ISD::AND);
33770 EVT VT = N->getValueType(0);
33771 SDValue N0 = N->getOperand(0);
33772 SDValue N1 = N->getOperand(1);
33773 SDLoc DL(N);
33775 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
33776 return SDValue();
33778 if (N0.getOpcode() == ISD::XOR &&
33779 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
33780 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
33782 if (N1.getOpcode() == ISD::XOR &&
33783 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
33784 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
33786 return SDValue();
33787 }
33789 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
33790 // register. In most cases we actually compare or select YMM-sized registers
33791 // and mixing the two types creates horrible code. This method optimizes
33792 // some of the transition sequences.
33793 // Even with AVX-512 this is still useful for removing casts around logical
33794 // operations on vXi1 mask types.
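// Illustrative shape of what gets widened here (types chosen for exposition
// only, not from the original source):
//   sext (and (trunc A:v8i32 to v8i16), (trunc B:v8i32 to v8i16)) to v8i32
// can instead be computed as
//   sign_extend_inreg (and A, B), v8i16
// so the logic operation stays in the wide type rather than bouncing through
// the narrow one.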
33795 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
33796 const X86Subtarget &Subtarget) {
33797 EVT VT = N->getValueType(0);
33798 assert(VT.isVector() && "Expected vector type");
33800 assert((N->getOpcode() == ISD::ANY_EXTEND ||
33801 N->getOpcode() == ISD::ZERO_EXTEND ||
33802 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
33804 SDValue Narrow = N->getOperand(0);
33805 EVT NarrowVT = Narrow.getValueType();
33807 if (Narrow->getOpcode() != ISD::XOR &&
33808 Narrow->getOpcode() != ISD::AND &&
33809 Narrow->getOpcode() != ISD::OR)
33810 return SDValue();
33812 SDValue N0 = Narrow->getOperand(0);
33813 SDValue N1 = Narrow->getOperand(1);
33814 SDLoc DL(Narrow);
33816 // The left side has to be a trunc.
33817 if (N0.getOpcode() != ISD::TRUNCATE)
33818 return SDValue();
33820 // The type of the truncated inputs.
33821 if (N0->getOperand(0).getValueType() != VT)
33822 return SDValue();
33824 // The right side has to be a 'trunc' or a constant vector.
33825 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
33826 N1.getOperand(0).getValueType() == VT;
33827 if (!RHSTrunc &&
33828 !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
33829 return SDValue();
33831 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33833 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
33834 return SDValue();
33836 // Set N0 and N1 to hold the inputs to the new wide operation.
33837 N0 = N0->getOperand(0);
33838 if (RHSTrunc)
33839 N1 = N1->getOperand(0);
33840 else
33841 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
33843 // Generate the wide operation.
33844 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
33845 unsigned Opcode = N->getOpcode();
33846 switch (Opcode) {
33847 default: llvm_unreachable("Unexpected opcode");
33848 case ISD::ANY_EXTEND:
33849 return Op;
33850 case ISD::ZERO_EXTEND:
33851 return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
33852 case ISD::SIGN_EXTEND:
33853 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
33854 Op, DAG.getValueType(NarrowVT));
33858 /// If both input operands of a logic op are being cast from floating point
33859 /// types, try to convert this into a floating point logic node to avoid
33860 /// unnecessary moves from SSE to integer registers.
33861 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
33862 const X86Subtarget &Subtarget) {
33863 unsigned FPOpcode = ISD::DELETED_NODE;
33864 if (N->getOpcode() == ISD::AND)
33865 FPOpcode = X86ISD::FAND;
33866 else if (N->getOpcode() == ISD::OR)
33867 FPOpcode = X86ISD::FOR;
33868 else if (N->getOpcode() == ISD::XOR)
33869 FPOpcode = X86ISD::FXOR;
33871 assert(FPOpcode != ISD::DELETED_NODE &&
33872 "Unexpected input node for FP logic conversion");
33874 EVT VT = N->getValueType(0);
33875 SDValue N0 = N->getOperand(0);
33876 SDValue N1 = N->getOperand(1);
33877 SDLoc DL(N);
33878 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
33879 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
33880 (Subtarget.hasSSE2() && VT == MVT::i64))) {
33881 SDValue N00 = N0.getOperand(0);
33882 SDValue N10 = N1.getOperand(0);
33883 EVT N00Type = N00.getValueType();
33884 EVT N10Type = N10.getValueType();
33885 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
33886 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
33887 return DAG.getBitcast(VT, FPLogic);
33893 /// If this is a zero/all-bits result that is bitwise-anded with a low-bits
33894 /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
33895 /// with a shift-right to eliminate loading the vector constant mask value.
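/// Illustrative example (constants assumed for exposition): if every i32
/// element of Op0 is known to be 0 or -1 (all sign bits), then
///   and X, <1,1,1,1>  ==>  X86ISD::VSRLI X, 31
/// produces the same 0/1 per-element result without materializing the
/// constant mask in memory.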
33896 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
33897 const X86Subtarget &Subtarget) {
33898 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
33899 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
33900 EVT VT0 = Op0.getValueType();
33901 EVT VT1 = Op1.getValueType();
33903 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
33904 return SDValue();
33906 APInt SplatVal;
33907 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
33908 !SplatVal.isMask())
33909 return SDValue();
33911 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
33912 return SDValue();
33914 unsigned EltBitWidth = VT0.getScalarSizeInBits();
33915 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
33916 return SDValue();
33918 SDLoc DL(N);
33919 unsigned ShiftVal = SplatVal.countTrailingOnes();
33920 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
33921 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
33922 return DAG.getBitcast(N->getValueType(0), Shift);
33925 // Get the index node from the lowered DAG of a GEP IR instruction with one
33926 // indexing dimension.
33927 static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
33928 if (Ld->isIndexed())
33931 SDValue Base = Ld->getBasePtr();
33933 if (Base.getOpcode() != ISD::ADD)
33936 SDValue ShiftedIndex = Base.getOperand(0);
33938 if (ShiftedIndex.getOpcode() != ISD::SHL)
33941 return ShiftedIndex.getOperand(0);
33945 static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
33946 if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
33947 switch (VT.getSizeInBits()) {
33948 default: return false;
33949 case 64: return Subtarget.is64Bit() ? true : false;
33950 case 32: return true;
33956 // This function recognizes cases where the X86 bzhi instruction can replace an
33957 // 'and + load' sequence.
33958 // In case of loading an integer value from an array of constants which is defined
33959 // as follows:
33961 // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1}
33963 // and then applying a bitwise and on the result with another input.
33964 // It's equivalent to performing bzhi (zero high bits) on the input, with the
33965 // same index of the load.
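// Illustrative arithmetic (values assumed): with the array above,
// array[8] == 0xFF, so
//   load array[idx]; and input, loaded
// computes input & ((1 << idx) - 1), which is exactly what
//   bzhi input, idx
// produces: all bits of input at position idx and above are cleared.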
33966 static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
33967 const X86Subtarget &Subtarget) {
33968 MVT VT = Node->getSimpleValueType(0);
33969 SDLoc dl(Node);
33971 // Check if subtarget has BZHI instruction for the node's type
33972 if (!hasBZHI(Subtarget, VT))
33973 return SDValue();
33975 // Try matching the pattern for both operands.
33976 for (unsigned i = 0; i < 2; i++) {
33977 SDValue N = Node->getOperand(i);
33978 LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
33980 // continue if the operand is not a load instruction
33981 if (!Ld)
33982 return SDValue();
33984 const Value *MemOp = Ld->getMemOperand()->getValue();
33986 if (!MemOp)
33987 return SDValue();
33989 if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
33990 if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
33991 if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
33993 Constant *Init = GV->getInitializer();
33994 Type *Ty = Init->getType();
33995 if (!isa<ConstantDataArray>(Init) ||
33996 !Ty->getArrayElementType()->isIntegerTy() ||
33997 Ty->getArrayElementType()->getScalarSizeInBits() !=
33998 VT.getSizeInBits() ||
33999 Ty->getArrayNumElements() >
34000 Ty->getArrayElementType()->getScalarSizeInBits())
34003 // Check if the array's constant elements are suitable to our case.
34004 uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
34005 bool ConstantsMatch = true;
34006 for (uint64_t j = 0; j < ArrayElementCount; j++) {
34007 ConstantInt *Elem =
34008 dyn_cast<ConstantInt>(Init->getAggregateElement(j));
34009 if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
34010 ConstantsMatch = false;
34011 break;
34012 }
34013 }
34014 if (!ConstantsMatch)
34015 continue;
34017 // Do the transformation (For 32-bit type):
34018 // -> (and (load arr[idx]), inp)
34019 // <- (and (srl 0xFFFFFFFF, (sub 32, idx)))
34020 // that will be replaced with one bzhi instruction.
34021 SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
34022 SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
34024 // Get the Node which indexes into the array.
34025 SDValue Index = getIndexFromUnindexedLoad(Ld);
34026 if (!Index)
34027 return SDValue();
34028 Index = DAG.getZExtOrTrunc(Index, dl, VT);
34030 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
34032 SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
34033 SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
34035 return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
34043 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
34044 TargetLowering::DAGCombinerInfo &DCI,
34045 const X86Subtarget &Subtarget) {
34046 EVT VT = N->getValueType(0);
34048 // If this is SSE1 only convert to FAND to avoid scalarization.
34049 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34050 return DAG.getBitcast(
34051 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
34052 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34053 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34056 // Use a 32-bit and+zext if upper bits known zero.
34057 if (VT == MVT::i64 && Subtarget.is64Bit() &&
34058 !isa<ConstantSDNode>(N->getOperand(1))) {
34059 APInt HiMask = APInt::getHighBitsSet(64, 32);
34060 if (DAG.MaskedValueIsZero(N->getOperand(1), HiMask) ||
34061 DAG.MaskedValueIsZero(N->getOperand(0), HiMask)) {
34062 SDLoc dl(N);
34063 SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(0));
34064 SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N->getOperand(1));
34065 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64,
34066 DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS));
34067 }
34068 }
34070 if (DCI.isBeforeLegalizeOps())
34071 return SDValue();
34073 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34074 return R;
34076 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34077 return FPLogic;
34079 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
34080 return R;
34082 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
34083 return ShiftRight;
34085 if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
34086 return R;
34089 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34090 SDValue Op(N, 0);
34091 if (SDValue Res = combineX86ShufflesRecursively(
34092 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
34093 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
34094 DCI.CombineTo(N, Res);
34095 return SDValue(N, 0);
34096 }
34097 }
34099 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
34100 if ((VT.getScalarSizeInBits() % 8) == 0 &&
34101 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
34102 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
34103 SDValue BitMask = N->getOperand(1);
34104 SDValue SrcVec = N->getOperand(0).getOperand(0);
34105 EVT SrcVecVT = SrcVec.getValueType();
34107 // Check that the constant bitmask masks whole bytes.
34108 APInt UndefElts;
34109 SmallVector<APInt, 64> EltBits;
34110 if (VT == SrcVecVT.getScalarType() &&
34111 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
34112 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
34113 llvm::all_of(EltBits, [](APInt M) {
34114 return M.isNullValue() || M.isAllOnesValue();
34115 })) {
34116 unsigned NumElts = SrcVecVT.getVectorNumElements();
34117 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
34118 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
34120 // Create a root shuffle mask from the byte mask and the extracted index.
34121 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
34122 for (unsigned i = 0; i != Scale; ++i) {
34123 if (UndefElts[i])
34124 continue;
34125 int VecIdx = Scale * Idx + i;
34126 ShuffleMask[VecIdx] =
34127 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
34128 }
34130 if (SDValue Shuffle = combineX86ShufflesRecursively(
34131 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
34132 /*HasVarMask*/ false, DAG, DCI, Subtarget))
34133 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
34134 N->getOperand(0).getOperand(1));
34135 }
34136 }
34138 return SDValue();
34139 }
34141 // Try to fold:
34142 // (or (and (m, y), (pandn m, x)))
34143 // into:
34144 // (vselect m, x, y)
34145 // As a special case, try to fold:
34146 // (or (and (m, (sub 0, x)), (pandn m, x)))
34147 // into:
34148 // (sub (xor X, M), M)
34149 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
34150 const X86Subtarget &Subtarget) {
34151 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
34153 SDValue N0 = N->getOperand(0);
34154 SDValue N1 = N->getOperand(1);
34155 EVT VT = N->getValueType(0);
34157 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
34158 (VT.is256BitVector() && Subtarget.hasInt256())))
34159 return SDValue();
34161 // Canonicalize AND to LHS.
34162 if (N1.getOpcode() == ISD::AND)
34163 std::swap(N0, N1);
34165 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
34166 // ANDNP combine allows other combines to happen that prevent matching.
34167 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
34168 return SDValue();
34170 SDValue Mask = N1.getOperand(0);
34171 SDValue X = N1.getOperand(1);
34172 SDValue Y;
34173 if (N0.getOperand(0) == Mask)
34174 Y = N0.getOperand(1);
34175 if (N0.getOperand(1) == Mask)
34176 Y = N0.getOperand(0);
34178 // Check to see if the mask appeared in both the AND and ANDNP.
34179 if (!Y)
34180 return SDValue();
34182 // Validate that X, Y, and Mask are bitcasts, and see through them.
34183 Mask = peekThroughBitcasts(Mask);
34184 X = peekThroughBitcasts(X);
34185 Y = peekThroughBitcasts(Y);
34187 EVT MaskVT = Mask.getValueType();
34188 unsigned EltBits = MaskVT.getScalarSizeInBits();
34190 // TODO: Attempt to handle floating point cases as well?
34191 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
34192 return SDValue();
34194 SDLoc DL(N);
34196 // Try to match:
34197 // (or (and (M, (sub 0, X)), (pandn M, X)))
34198 // which is a special case of vselect:
34199 // (vselect M, (sub 0, X), X)
34200 // Per:
34201 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
34202 // We know that, if fNegate is 0 or 1:
34203 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
34205 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
34206 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
34207 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
34208 // This lets us transform our vselect to:
34209 // (add (xor X, M), (and M, 1))
34211 // (sub (xor X, M), M)
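// Illustrative check with concrete values (assumed for exposition): take
// X = 5 and M = -1 (all ones, i.e. "negate"):
//   (xor 5, -1) - (-1) = -6 + 1 = -5
// and with M = 0 (i.e. "don't negate"):
//   (xor 5, 0) - 0 = 5
// matching (vselect M, (sub 0, X), X) in both cases.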
34212 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
34213 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
34214 auto IsNegV = [](SDNode *N, SDValue V) {
34215 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
34216 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
34217 };
34218 SDValue V;
34219 if (IsNegV(Y.getNode(), X))
34220 V = X;
34221 else if (IsNegV(X.getNode(), Y))
34222 V = Y;
34224 if (V) {
34225 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
34226 SDValue SubOp2 = Mask;
34228 // If the negate was on the false side of the select, then
34229 // the operands of the SUB need to be swapped. PR 27251.
34230 // This is because the pattern being matched above is
34231 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
34232 // but if the pattern matched was
34233 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
34234 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
34235 // pattern also needs to be a negation of the replacement pattern above.
34236 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
34237 // sub accomplishes the negation of the replacement pattern.
34238 if (IsNegV(Y.getNode(), X))
34239 std::swap(SubOp1, SubOp2);
34241 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
34242 return DAG.getBitcast(VT, Res);
34243 }
34244 }
34246 // PBLENDVB is only available on SSE 4.1.
34247 if (!Subtarget.hasSSE41())
34248 return SDValue();
34250 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
34252 X = DAG.getBitcast(BlendVT, X);
34253 Y = DAG.getBitcast(BlendVT, Y);
34254 Mask = DAG.getBitcast(BlendVT, Mask);
34255 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
34256 return DAG.getBitcast(VT, Mask);
34257 }
34259 // Helper function for combineOrCmpEqZeroToCtlzSrl
34260 // Transforms:
34261 // seteq(cmp x, 0)
34262 // into:
34263 // srl(ctlz x), log2(bitsize(x))
34264 // Input pattern is checked by caller.
34265 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
34266 SelectionDAG &DAG) {
34267 SDValue Cmp = Op.getOperand(1);
34268 EVT VT = Cmp.getOperand(0).getValueType();
34269 unsigned Log2b = Log2_32(VT.getSizeInBits());
34270 SDLoc dl(Op);
34271 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
34272 // The result of the shift is true or false, and on X86, the 32-bit
34273 // encoding of shr and lzcnt is more desirable.
34274 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
34275 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
34276 DAG.getConstant(Log2b, dl, VT));
34277 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
34278 }
34280 // Try to transform:
34281 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
34282 // into:
34283 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
34284 // Will also attempt to match more generic cases, eg:
34285 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
34286 // Only applies if the target supports the FastLZCNT feature.
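// Illustrative arithmetic (i32 case, lzcnt semantics assumed): ctlz(0) == 32
// and ctlz(x) <= 31 for any nonzero x, so
//   srl(ctlz(x), 5)
// is 1 exactly when x == 0 and 0 otherwise -- the same value as
// zext(setcc(eq, x, 0)).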
34287 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
34288 TargetLowering::DAGCombinerInfo &DCI,
34289 const X86Subtarget &Subtarget) {
34290 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
34291 return SDValue();
34293 auto isORCandidate = [](SDValue N) {
34294 return (N->getOpcode() == ISD::OR && N->hasOneUse());
34295 };
34297 // Check the zero extend is extending to 32-bit or more. The code generated by
34298 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
34299 // instructions to clear the upper bits.
34300 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
34301 !isORCandidate(N->getOperand(0)))
34304 // Check the node matches: setcc(eq, cmp 0)
34305 auto isSetCCCandidate = [](SDValue N) {
34306 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
34307 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
34308 N->getOperand(1).getOpcode() == X86ISD::CMP &&
34309 isNullConstant(N->getOperand(1).getOperand(1)) &&
34310 N->getOperand(1).getValueType().bitsGE(MVT::i32);
34313 SDNode *OR = N->getOperand(0).getNode();
34314 SDValue LHS = OR->getOperand(0);
34315 SDValue RHS = OR->getOperand(1);
34317 // Save nodes matching or(or, setcc(eq, cmp 0)).
34318 SmallVector<SDNode *, 2> ORNodes;
34319 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
34320 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
34321 ORNodes.push_back(OR);
34322 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
34323 LHS = OR->getOperand(0);
34324 RHS = OR->getOperand(1);
34325 }
34327 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
34328 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
34329 !isORCandidate(SDValue(OR, 0)))
34330 return SDValue();
34332 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
34333 // to:
34334 // or(srl(ctlz),srl(ctlz)).
34335 // The dag combiner can then fold it into:
34336 // srl(or(ctlz, ctlz)).
34337 EVT VT = OR->getValueType(0);
34338 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
34339 SDValue Ret, NewRHS;
34340 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
34341 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
34343 if (!Ret)
34344 return SDValue();
34346 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
34347 while (ORNodes.size() > 0) {
34348 OR = ORNodes.pop_back_val();
34349 LHS = OR->getOperand(0);
34350 RHS = OR->getOperand(1);
34351 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
34352 if (RHS->getOpcode() == ISD::OR)
34353 std::swap(LHS, RHS);
34354 EVT VT = OR->getValueType(0);
34355 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
34356 if (!NewRHS)
34357 return SDValue();
34358 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
34359 }
34361 if (Ret)
34362 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
34364 return Ret;
34365 }
34367 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
34368 TargetLowering::DAGCombinerInfo &DCI,
34369 const X86Subtarget &Subtarget) {
34370 SDValue N0 = N->getOperand(0);
34371 SDValue N1 = N->getOperand(1);
34372 EVT VT = N->getValueType(0);
34374 // If this is SSE1 only convert to FOR to avoid scalarization.
34375 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
34376 return DAG.getBitcast(MVT::v4i32,
34377 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
34378 DAG.getBitcast(MVT::v4f32, N0),
34379 DAG.getBitcast(MVT::v4f32, N1)));
34380 }
34382 if (DCI.isBeforeLegalizeOps())
34383 return SDValue();
34385 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
34386 return R;
34388 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34389 return FPLogic;
34391 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
34392 return R;
34394 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
34395 return SDValue();
34397 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
34398 bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
34400 // SHLD/SHRD instructions have lower register pressure, but on some
34401 // platforms they have higher latency than the equivalent
34402 // series of shifts/or that would otherwise be generated.
34403 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
34404 // have higher latencies and we are not optimizing for size.
34405 if (!OptForSize && Subtarget.isSHLDSlow())
34406 return SDValue();
34408 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
34409 std::swap(N0, N1);
34410 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
34411 return SDValue();
34412 if (!N0.hasOneUse() || !N1.hasOneUse())
34413 return SDValue();
34415 SDValue ShAmt0 = N0.getOperand(1);
34416 if (ShAmt0.getValueType() != MVT::i8)
34417 return SDValue();
34418 SDValue ShAmt1 = N1.getOperand(1);
34419 if (ShAmt1.getValueType() != MVT::i8)
34420 return SDValue();
34421 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
34422 ShAmt0 = ShAmt0.getOperand(0);
34423 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
34424 ShAmt1 = ShAmt1.getOperand(0);
34426 SDLoc DL(N);
34427 unsigned Opc = X86ISD::SHLD;
34428 SDValue Op0 = N0.getOperand(0);
34429 SDValue Op1 = N1.getOperand(0);
34430 if (ShAmt0.getOpcode() == ISD::SUB ||
34431 ShAmt0.getOpcode() == ISD::XOR) {
34432 Opc = X86ISD::SHRD;
34433 std::swap(Op0, Op1);
34434 std::swap(ShAmt0, ShAmt1);
34437 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
34438 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
34439 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
34440 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
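// Illustrative instance (C = 8, i32 operands assumed):
//   or (shl x, 8), (srl y, 24)  -->  shld x, y, 8
// i.e. x shifted up by 8 with the top 8 bits of y shifted into the vacated
// low bits, which is exactly the double-shift semantics of SHLD.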
34441 unsigned Bits = VT.getSizeInBits();
34442 if (ShAmt1.getOpcode() == ISD::SUB) {
34443 SDValue Sum = ShAmt1.getOperand(0);
34444 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
34445 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
34446 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
34447 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
34448 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
34449 return DAG.getNode(Opc, DL, VT,
34450 N0.getOperand(0), N1.getOperand(0),
34451 DAG.getNode(ISD::TRUNCATE, DL,
34452 MVT::i8, ShAmt0));
34453 }
34454 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
34455 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
34456 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
34457 return DAG.getNode(Opc, DL, VT,
34458 N0.getOperand(0), N1.getOperand(0),
34459 DAG.getNode(ISD::TRUNCATE, DL,
34461 } else if (ShAmt1.getOpcode() == ISD::XOR) {
34462 SDValue Mask = ShAmt1.getOperand(1);
34463 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
34464 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
34465 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
34466 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
34467 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
34468 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
34469 if (Op1.getOpcode() == InnerShift &&
34470 isa<ConstantSDNode>(Op1.getOperand(1)) &&
34471 Op1.getConstantOperandVal(1) == 1) {
34472 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
34473 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
34475 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
34476 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
34477 Op1.getOperand(0) == Op1.getOperand(1)) {
34478 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
34479 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
34480 }
34481 }
34482 }
34483 }
34485 return SDValue();
34486 }
34488 /// Try to turn tests against the signbit in the form of:
34489 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
34490 /// into:
34491 /// SETGT(X, -1)
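/// Illustrative example (i32 source assumed):
///   xor (trunc (srl X, 31) to i8), 1
/// is 1 exactly when the sign bit of X is clear, i.e. when X > -1, so it can
/// be emitted as the canonical setgt(X, -1) comparison instead.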
34492 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
34493 // This is only worth doing if the output type is i8 or i1.
34494 EVT ResultType = N->getValueType(0);
34495 if (ResultType != MVT::i8 && ResultType != MVT::i1)
34498 SDValue N0 = N->getOperand(0);
34499 SDValue N1 = N->getOperand(1);
34501 // We should be performing an xor against a truncated shift.
34502 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
34505 // Make sure we are performing an xor against one.
34506 if (!isOneConstant(N1))
34509 // SetCC on x86 zero extends so only act on this if it's a logical shift.
34510 SDValue Shift = N0.getOperand(0);
34511 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
34514 // Make sure we are truncating from one of i16, i32 or i64.
34515 EVT ShiftTy = Shift.getValueType();
34516 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
34519 // Make sure the shift amount extracts the sign bit.
34520 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
34521 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
34522 return SDValue();
34524 // Create a greater-than comparison against -1.
34525 // N.B. Using SETGE against 0 works but we want a canonical looking
34526 // comparison, using SETGT matches up with what TranslateX86CC does.
34527 SDLoc DL(N);
34528 SDValue ShiftOp = Shift.getOperand(0);
34529 EVT ShiftOpTy = ShiftOp.getValueType();
34530 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34531 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
34532 *DAG.getContext(), ResultType);
34533 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
34534 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
34535 if (SetCCResultType != ResultType)
34536 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
34538 return Cond;
34539 }
34540 /// Turn vector tests of the signbit in the form of:
34541 /// xor (sra X, elt_size(X)-1), -1
34542 /// into:
34543 /// pcmpgt X, -1
34544 ///
34545 /// This should be called before type legalization because the pattern may not
34546 /// persist after that.
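/// Illustrative example (v4i32 assumed): for each element,
///   xor (sra X, 31), -1
/// yields all-ones when X >= 0 and all-zeroes when X < 0, which is the same
/// element-wise result as pcmpgt X, -1 (X > -1).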
34547 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
34548 const X86Subtarget &Subtarget) {
34549 EVT VT = N->getValueType(0);
34550 if (!VT.isSimple())
34551 return SDValue();
34553 switch (VT.getSimpleVT().SimpleTy) {
34554 default: return SDValue();
34555 case MVT::v16i8:
34556 case MVT::v8i16:
34557 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
34558 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
34559 case MVT::v32i8:
34560 case MVT::v16i16:
34561 case MVT::v8i32:
34562 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
34563 }
34565 // There must be a shift right algebraic before the xor, and the xor must be a
34566 // 'not' operation.
34567 SDValue Shift = N->getOperand(0);
34568 SDValue Ones = N->getOperand(1);
34569 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
34570 !ISD::isBuildVectorAllOnes(Ones.getNode()))
34573 // The shift should be smearing the sign bit across each vector element.
34574 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
34578 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
34579 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
34580 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
34583 // Create a greater-than comparison against -1. We don't use the more obvious
34584 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
34585 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
34588 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
34589 /// is valid for the given \p Subtarget.
34590 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
34591 const X86Subtarget &Subtarget) {
34592 if (!Subtarget.hasAVX512())
34595 // FIXME: Scalar type may be supported if we move it to vector register.
34596 if (!SrcVT.isVector())
34599 EVT SrcElVT = SrcVT.getScalarType();
34600 EVT DstElVT = DstVT.getScalarType();
34601 if (DstElVT != MVT::i8 && DstElVT != MVT::i16 && DstElVT != MVT::i32)
34603 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
34604 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
34608 /// Detect a pattern of truncation with unsigned saturation:
34609 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
34610 /// Return the source value to be truncated or SDValue() if the pattern was not
34611 /// matched.
34612 static SDValue detectUSatPattern(SDValue In, EVT VT) {
34613 if (In.getOpcode() != ISD::UMIN)
34614 return SDValue();
34616 // Saturation with truncation. We truncate from InVT to VT.
34617 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
34618 "Unexpected types for truncate operation");
34620 APInt C;
34621 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
34622 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
34623 // the element size of the destination type.
34624 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
34625 }
34626 return SDValue();
34627 }
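// Illustrative example of the pattern above (types assumed): truncating
// v16i32 to v16i8 through
//   trunc (umin X, splat(255))
// clamps every lane to [0, 255] first, so the truncation can be emitted as
// an unsigned-saturating truncate (VPMOVUSDB on AVX512, or a PACKUS-based
// lowering elsewhere) instead of a plain truncate of the umin.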
34629 /// Detect patterns of truncation with signed saturation:
34630 /// (truncate (smin ((smax (x, signed_min_of_dest_type)),
34631 /// signed_max_of_dest_type)) to dest_type)
34633 /// (truncate (smax ((smin (x, signed_max_of_dest_type)),
34634 /// signed_min_of_dest_type)) to dest_type).
34635 /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type].
34636 /// Return the source value to be truncated or SDValue() if the pattern was not
34638 static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) {
34639 unsigned NumDstBits = VT.getScalarSizeInBits();
34640 unsigned NumSrcBits = In.getScalarValueSizeInBits();
34641 assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation");
34643 auto MatchMinMax = [](SDValue V, unsigned Opcode,
34644 const APInt &Limit) -> SDValue {
34645 APInt C;
34646 if (V.getOpcode() == Opcode &&
34647 ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit)
34648 return V.getOperand(0);
34649 return SDValue();
34650 };
34652 APInt SignedMax, SignedMin;
34653 if (MatchPackUS) {
34654 SignedMax = APInt::getAllOnesValue(NumDstBits).zext(NumSrcBits);
34655 SignedMin = APInt(NumSrcBits, 0);
34656 } else {
34657 SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits);
34658 SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits);
34659 }
34661 if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax))
34662 if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin))
34663 return SMax;
34665 if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin))
34666 if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax))
34667 return SMin;
34669 return SDValue();
34670 }
34672 /// Detect a pattern of truncation with signed saturation.
34673 /// The types should allow to use VPMOVSS* instruction on AVX512.
34674 /// Return the source value to be truncated or SDValue() if the pattern was not
34676 static SDValue detectAVX512SSatPattern(SDValue In, EVT VT,
34677 const X86Subtarget &Subtarget,
34678 const TargetLowering &TLI) {
34679 if (!TLI.isTypeLegal(In.getValueType()))
34681 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
34683 return detectSSatPattern(In, VT);
34686 /// Detect a pattern of truncation with saturation:
34687 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
34688 /// The types should allow to use VPMOVUS* instruction on AVX512.
34689 /// Return the source value to be truncated or SDValue() if the pattern was not
34691 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
34692 const X86Subtarget &Subtarget,
34693 const TargetLowering &TLI) {
34694 if (!TLI.isTypeLegal(In.getValueType()))
34696 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
34698 return detectUSatPattern(In, VT);
34701 static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
34702 SelectionDAG &DAG,
34703 const X86Subtarget &Subtarget) {
34704 EVT SVT = VT.getScalarType();
34705 EVT InVT = In.getValueType();
34706 EVT InSVT = InVT.getScalarType();
34707 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34708 if (TLI.isTypeLegal(InVT) && TLI.isTypeLegal(VT) &&
34709 isSATValidOnAVX512Subtarget(InVT, VT, Subtarget)) {
34710 if (auto SSatVal = detectSSatPattern(In, VT))
34711 return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal);
34712 if (auto USatVal = detectUSatPattern(In, VT))
34713 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
34715 if (VT.isVector() && isPowerOf2_32(VT.getVectorNumElements()) &&
34716 (SVT == MVT::i8 || SVT == MVT::i16) &&
34717 (InSVT == MVT::i16 || InSVT == MVT::i32)) {
34718 if (auto SSatVal = detectSSatPattern(In, VT))
34719 return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG,
34720 Subtarget);
34721 if (auto USatVal = detectSSatPattern(In, VT, true)) {
34722 // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW).
34723 if (SVT == MVT::i8 && InSVT == MVT::i32) {
34724 EVT MidVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
34725 VT.getVectorNumElements());
34726 SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL,
34727 DAG, Subtarget);
34728 if (Mid)
34729 return truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG,
34730 Subtarget);
34731 } else if (SVT == MVT::i8 || Subtarget.hasSSE41())
34732 return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG,
34733 Subtarget);
34734 }
34735 }
34737 return SDValue();
34738 }
34739 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
34740 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
34741 /// X86ISD::AVG instruction.
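/// Illustrative arithmetic (i8 lanes, values assumed): for a = 250 and
/// b = 240 the pattern computes (250 + 240 + 1) >> 1 = 245 in a wider type;
/// X86ISD::AVG (PAVGB) produces the same rounded-up average directly without
/// widening each lane.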
34742 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
34743 const X86Subtarget &Subtarget,
34744 const SDLoc &DL) {
34745 if (!VT.isVector())
34746 return SDValue();
34747 EVT InVT = In.getValueType();
34748 unsigned NumElems = VT.getVectorNumElements();
34750 EVT ScalarVT = VT.getVectorElementType();
34751 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
34752 isPowerOf2_32(NumElems)))
34755 // InScalarVT is the intermediate type in AVG pattern and it should be greater
34756 // than the original input type (i8/i16).
34757 EVT InScalarVT = InVT.getVectorElementType();
34758 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
34761 if (!Subtarget.hasSSE2())
34764 // Detect the following pattern:
34766 // %1 = zext <N x i8> %a to <N x i32>
34767 // %2 = zext <N x i8> %b to <N x i32>
34768 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
34769 // %4 = add nuw nsw <N x i32> %3, %2
34770 // %5 = lshr <N x i32> %N, <i32 1 x N>
34771 // %6 = trunc <N x i32> %5 to <N x i8>
34773 // In AVX512, the last instruction can also be a trunc store.
34774 if (In.getOpcode() != ISD::SRL)
34777 // A lambda checking the given SDValue is a constant vector and each element
34778 // is in the range [Min, Max].
34779 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
34780 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
34781 if (!BV || !BV->isConstant())
34782 return false;
34783 for (SDValue Op : V->ops()) {
34784 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
34785 if (!C)
34786 return false;
34787 const APInt &Val = C->getAPIntValue();
34788 if (Val.ult(Min) || Val.ugt(Max))
34789 return false;
34790 }
34791 return true;
34792 };
34794 // Check if each element of the vector is left-shifted by one.
34795 auto LHS = In.getOperand(0);
34796 auto RHS = In.getOperand(1);
34797 if (!IsConstVectorInRange(RHS, 1, 1))
34799 if (LHS.getOpcode() != ISD::ADD)
34802 // Detect a pattern of a + b + 1 where the order doesn't matter.
34803 SDValue Operands[3];
34804 Operands[0] = LHS.getOperand(0);
34805 Operands[1] = LHS.getOperand(1);
34807 auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
34808 ArrayRef<SDValue> Ops) {
34809 return DAG.getNode(X86ISD::AVG, DL, Ops[0].getValueType(), Ops);
34812 // Take care of the case when one of the operands is a constant vector whose
34813 // element is in the range [1, 256].
34814 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
34815 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
34816 Operands[0].getOperand(0).getValueType() == VT) {
34817 // The pattern is detected. Subtract one from the constant vector, then
34818 // demote it and emit X86ISD::AVG instruction.
34819 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
34820 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
34821 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
34822 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
34823 { Operands[0].getOperand(0), Operands[1] },
34824 AVGBuilder);
34825 }
34827 if (Operands[0].getOpcode() == ISD::ADD)
34828 std::swap(Operands[0], Operands[1]);
34829 else if (Operands[1].getOpcode() != ISD::ADD)
34830 return SDValue();
34831 Operands[2] = Operands[1].getOperand(0);
34832 Operands[1] = Operands[1].getOperand(1);
34834 // Now we have three operands of two additions. Check that one of them is a
34835 // constant vector with ones, and the other two are promoted from i8/i16.
34836 for (int i = 0; i < 3; ++i) {
34837 if (!IsConstVectorInRange(Operands[i], 1, 1))
34838 continue;
34839 std::swap(Operands[i], Operands[2]);
34841 // Check if Operands[0] and Operands[1] are results of type promotion.
34842 for (int j = 0; j < 2; ++j)
34843 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
34844 Operands[j].getOperand(0).getValueType() != VT)
34847 // The pattern is detected, emit X86ISD::AVG instruction(s).
34848 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
34849 { Operands[0].getOperand(0),
34850 Operands[1].getOperand(0) }, AVGBuilder);
34856 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
34857 TargetLowering::DAGCombinerInfo &DCI,
34858 const X86Subtarget &Subtarget) {
34859 LoadSDNode *Ld = cast<LoadSDNode>(N);
34860 EVT RegVT = Ld->getValueType(0);
34861 EVT MemVT = Ld->getMemoryVT();
34862 SDLoc dl(Ld);
34863 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34865 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
34866 // into two 16-byte operations. Also split non-temporal aligned loads on
34867 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
34868 ISD::LoadExtType Ext = Ld->getExtensionType();
34869 bool Fast;
34870 unsigned AddressSpace = Ld->getAddressSpace();
34871 unsigned Alignment = Ld->getAlignment();
34872 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
34873 Ext == ISD::NON_EXTLOAD &&
34874 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
34875 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
34876 AddressSpace, Alignment, &Fast) && !Fast))) {
34877 unsigned NumElems = RegVT.getVectorNumElements();
34878 if (NumElems < 2)
34879 return SDValue();
34881 SDValue Ptr = Ld->getBasePtr();
34883 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
34884 NumElems / 2);
34885 SDValue Load1 =
34886 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
34887 Alignment, Ld->getMemOperand()->getFlags());
34889 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
34890 SDValue Load2 =
34891 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
34892 Ld->getPointerInfo().getWithOffset(16),
34893 MinAlign(Alignment, 16U), Ld->getMemOperand()->getFlags());
34894 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
34895 Load1.getValue(1),
34896 Load2.getValue(1));
34898 SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2);
34899 return DCI.CombineTo(N, NewVec, TF, true);
34905 /// If V is a build vector of boolean constants and exactly one of those
34906 /// constants is true, return the operand index of that true element.
34907 /// Otherwise, return -1.
34908 static int getOneTrueElt(SDValue V) {
34909 // This needs to be a build vector of booleans.
34910 // TODO: Checking for the i1 type matches the IR definition for the mask,
34911 // but the mask check could be loosened to i8 or other types. That might
34912 // also require checking more than 'allOnesValue'; eg, the x86 HW
34913 // instructions only require that the MSB is set for each mask element.
34914 // The ISD::MSTORE comments/definition do not specify how the mask operand
34916 auto *BV = dyn_cast<BuildVectorSDNode>(V);
34917 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
34920 int TrueIndex = -1;
34921 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
34922 for (unsigned i = 0; i < NumElts; ++i) {
34923 const SDValue &Op = BV->getOperand(i);
34924 if (Op.isUndef())
34925 continue;
34926 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
34927 if (!ConstNode)
34928 return -1;
34929 if (ConstNode->getAPIntValue().isAllOnesValue()) {
34930 // If we already found a one, this is too many.
34931 if (TrueIndex >= 0)
34932 return -1;
34933 TrueIndex = i;
34934 }
34935 }
34936 return TrueIndex;
34937 }
34939 /// Given a masked memory load/store operation, return true if it has one mask
34940 /// bit set. If it has one mask bit set, then also return the memory address of
34941 /// the scalar element to load/store, the vector index to insert/extract that
34942 /// scalar element, and the alignment for the scalar memory access.
34943 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
34944 SelectionDAG &DAG, SDValue &Addr,
34945 SDValue &Index, unsigned &Alignment) {
34946 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
34947 if (TrueMaskElt < 0)
34950 // Get the address of the one scalar element that is specified by the mask
34951 // using the appropriate offset from the base pointer.
34952 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
34953 Addr = MaskedOp->getBasePtr();
34954 if (TrueMaskElt != 0) {
34955 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
34956 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
34959 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
34960 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
34964 /// If exactly one element of the mask is set for a non-extending masked load,
34965 /// it is a scalar load and vector insert.
34966 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
34967 /// mask have already been optimized in IR, so we don't bother with those here.
34968 static SDValue
34969 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34970 TargetLowering::DAGCombinerInfo &DCI) {
34971 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
34972 // However, some target hooks may need to be added to know when the transform
34973 // is profitable. Endianness would also have to be considered.
34975 SDValue Addr, VecIndex;
34976 unsigned Alignment;
34977 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
34978 return SDValue();
34980 // Load the one scalar element that is specified by the mask using the
34981 // appropriate offset from the base pointer.
34982 SDLoc DL(ML);
34983 EVT VT = ML->getValueType(0);
34984 EVT EltVT = VT.getVectorElementType();
34985 SDValue Load =
34986 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
34987 Alignment, ML->getMemOperand()->getFlags());
34989 // Insert the loaded element into the appropriate place in the vector.
34990 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
34991 Load, VecIndex);
34992 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
34993 }
34995 static SDValue
34996 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
34997 TargetLowering::DAGCombinerInfo &DCI) {
34998 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
35002 EVT VT = ML->getValueType(0);
35004 // If we are loading the first and last elements of a vector, it is safe and
35005 // always faster to load the whole vector. Replace the masked load with a
35006 // vector load and select.
35007 unsigned NumElts = VT.getVectorNumElements();
35008 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
35009 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
35010 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
35011 if (LoadFirstElt && LoadLastElt) {
35012 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35013 ML->getMemOperand());
35014 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
35015 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
35018 // Convert a masked load with a constant mask into a masked load and a select.
35019 // This allows the select operation to use a faster kind of select instruction
35020 // (for example, vblendvps -> vblendps).
35022 // Don't try this if the pass-through operand is already undefined. That would
35023 // cause an infinite loop because that's what we're about to create.
35024 if (ML->getSrc0().isUndef())
35027 // The new masked load has an undef pass-through operand. The select uses the
35028 // original pass-through operand.
35029 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
35030 ML->getMask(), DAG.getUNDEF(VT),
35031 ML->getMemoryVT(), ML->getMemOperand(),
35032 ML->getExtensionType());
35033 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
35035 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
35038 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
35039 TargetLowering::DAGCombinerInfo &DCI,
35040 const X86Subtarget &Subtarget) {
35041 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
35043 // TODO: Expanding load with constant mask may be optimized as well.
35044 if (Mld->isExpandingLoad())
35045 return SDValue();
35047 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
35048 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
35049 return ScalarLoad;
35050 // TODO: Do some AVX512 subsets benefit from this transform?
35051 if (!Subtarget.hasAVX512())
35052 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
35053 return Blend;
35054 }
35056 if (Mld->getExtensionType() != ISD::SEXTLOAD)
35057 return SDValue();
35059 // Resolve extending loads.
35060 EVT VT = Mld->getValueType(0);
35061 unsigned NumElems = VT.getVectorNumElements();
35062 EVT LdVT = Mld->getMemoryVT();
35063 SDLoc dl(Mld);
35065 assert(LdVT != VT && "Cannot extend to the same type");
35066 unsigned ToSz = VT.getScalarSizeInBits();
35067 unsigned FromSz = LdVT.getScalarSizeInBits();
35068 // From/To sizes and ElemCount must be pow of two.
35069 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35070 "Unexpected size for extending masked load");
35072 unsigned SizeRatio = ToSz / FromSz;
35073 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
35075 // Create a type on which we perform the shuffle.
35076 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35077 LdVT.getScalarType(), NumElems*SizeRatio);
35078 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35080 // Convert Src0 value.
35081 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
35082 if (!Mld->getSrc0().isUndef()) {
35083 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35084 for (unsigned i = 0; i != NumElems; ++i)
35085 ShuffleVec[i] = i * SizeRatio;
35087 // Can't shuffle using an illegal type.
35088 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35089 "WideVecVT should be legal");
35090 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
35091 DAG.getUNDEF(WideVecVT), ShuffleVec);
35094 // Prepare the new mask.
35096 SDValue Mask = Mld->getMask();
35097 if (Mask.getValueType() == VT) {
35098 // Mask and original value have the same type.
35099 NewMask = DAG.getBitcast(WideVecVT, Mask);
35100 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35101 for (unsigned i = 0; i != NumElems; ++i)
35102 ShuffleVec[i] = i * SizeRatio;
35103 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
35104 ShuffleVec[i] = NumElems * SizeRatio;
35105 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35106 DAG.getConstant(0, dl, WideVecVT),
35107 ShuffleVec);
35108 } else {
35109 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35110 unsigned WidenNumElts = NumElems*SizeRatio;
35111 unsigned MaskNumElts = VT.getVectorNumElements();
35112 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35115 unsigned NumConcat = WidenNumElts / MaskNumElts;
35116 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35117 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35118 Ops[0] = Mask;
35119 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35120 }
35122 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
35123 Mld->getBasePtr(), NewMask, WideSrc0,
35124 Mld->getMemoryVT(), Mld->getMemOperand(),
35126 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
35127 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
35130 /// If exactly one element of the mask is set for a non-truncating masked store,
35131 /// it is a vector extract and scalar store.
35132 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
35133 /// mask have already been optimized in IR, so we don't bother with those here.
35134 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
35135 SelectionDAG &DAG) {
35136 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
35137 // However, some target hooks may need to be added to know when the transform
35138 // is profitable. Endianness would also have to be considered.
35140 SDValue Addr, VecIndex;
35141 unsigned Alignment;
35142 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
35145 // Extract the one scalar element that is actually being stored.
35147 EVT VT = MS->getValue().getValueType();
35148 EVT EltVT = VT.getVectorElementType();
35149 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
35150 MS->getValue(), VecIndex);
35152 // Store that element at the appropriate offset from the base pointer.
35153 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
35154 Alignment, MS->getMemOperand()->getFlags());
35157 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
35158 const X86Subtarget &Subtarget) {
35159 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
35161 if (Mst->isCompressingStore())
35164 if (!Mst->isTruncatingStore()) {
35165 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
35166 return ScalarStore;
35168 // If the mask is checking (0 > X), we're creating a vector with all-zeros
35169 // or all-ones elements based on the sign bits of X. AVX1 masked store only
35170 // cares about the sign bit of each mask element, so eliminate the compare:
35171 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
35172 // Note that by waiting to match an x86-specific PCMPGT node, we're
35173 // eliminating potentially more complex matching of a setcc node which has
35174 // a full range of predicates.
35175 SDValue Mask = Mst->getMask();
35176 if (Mask.getOpcode() == X86ISD::PCMPGT &&
35177 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
35178 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
35179 "Unexpected type for PCMPGT");
35180 return DAG.getMaskedStore(
35181 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
35182 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
35185 // TODO: AVX512 targets should also be able to simplify something like the
35186 // pattern above, but that pattern will be different. It will either need to
35187 // match setcc more generally or match PCMPGTM later (in tablegen?).
35192 // Resolve truncating stores.
35193 EVT VT = Mst->getValue().getValueType();
35194 unsigned NumElems = VT.getVectorNumElements();
35195 EVT StVT = Mst->getMemoryVT();
35196 SDLoc dl(Mst);
35198 assert(StVT != VT && "Cannot truncate to the same type");
35199 unsigned FromSz = VT.getScalarSizeInBits();
35200 unsigned ToSz = StVT.getScalarSizeInBits();
35202 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35204 // The truncating store is legal in some cases. For example
35205 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
35206 // are designated for truncate store.
35207 // In this case we don't need any further transformations.
35208 if (TLI.isTruncStoreLegal(VT, StVT))
35211 // From/To sizes and ElemCount must be pow of two.
35212 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
35213 "Unexpected size for truncating masked store");
35214 // We are going to use the original vector elt for storing.
35215 // Accumulated smaller vector elements must be a multiple of the store size.
35216 assert (((NumElems * FromSz) % ToSz) == 0 &&
35217 "Unexpected ratio for truncating masked store");
35219 unsigned SizeRatio = FromSz / ToSz;
35220 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
35222 // Create a type on which we perform the shuffle.
35223 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35224 StVT.getScalarType(), NumElems*SizeRatio);
35226 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35228 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
35229 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
35230 for (unsigned i = 0; i != NumElems; ++i)
35231 ShuffleVec[i] = i * SizeRatio;
35233 // Can't shuffle using an illegal type.
35234 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
35235 "WideVecVT should be legal");
35237 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
35238 DAG.getUNDEF(WideVecVT),
35239 ShuffleVec);
35241 SDValue NewMask;
35242 SDValue Mask = Mst->getMask();
35243 if (Mask.getValueType() == VT) {
35244 // Mask and original value have the same type.
35245 NewMask = DAG.getBitcast(WideVecVT, Mask);
35246 for (unsigned i = 0; i != NumElems; ++i)
35247 ShuffleVec[i] = i * SizeRatio;
35248 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
35249 ShuffleVec[i] = NumElems*SizeRatio;
35250 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
35251 DAG.getConstant(0, dl, WideVecVT),
35252 ShuffleVec);
35253 } else {
35254 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
35255 unsigned WidenNumElts = NumElems*SizeRatio;
35256 unsigned MaskNumElts = VT.getVectorNumElements();
35257 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
35260 unsigned NumConcat = WidenNumElts / MaskNumElts;
35261 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
35262 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
35264 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
35267 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
35268 Mst->getBasePtr(), NewMask, StVT,
35269 Mst->getMemOperand(), false);
35272 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
35273 const X86Subtarget &Subtarget) {
35274 StoreSDNode *St = cast<StoreSDNode>(N);
35275 EVT VT = St->getValue().getValueType();
35276 EVT StVT = St->getMemoryVT();
35277 SDLoc dl(St);
35278 SDValue StoredVal = St->getOperand(1);
35279 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35281 // If this is a store of a scalar_to_vector to v1i1, just use a scalar store.
35282 // This will avoid a copy to k-register.
35283 if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() &&
35284 StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35285 StoredVal.getOperand(0).getValueType() == MVT::i8) {
35286 return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
35287 St->getBasePtr(), St->getPointerInfo(),
35288 St->getAlignment(), St->getMemOperand()->getFlags());
35291 // Widen v2i1/v4i1 stores to v8i1.
35292 if ((VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT &&
35293 Subtarget.hasAVX512()) {
35294 unsigned NumConcats = 8 / VT.getVectorNumElements();
35295 SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(VT));
35296 Ops[0] = StoredVal;
35297 StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
35298 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
35299 St->getPointerInfo(), St->getAlignment(),
35300 St->getMemOperand()->getFlags());
35303 // Turn vXi1 stores of constants into a scalar store.
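// e.g. a store of a constant v16i1 vector becomes a single i16 store of the
// equivalent bitmask value.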
35304 if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 ||
35305 VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
35306 ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
35307 // If it's a v64i1 store without 64-bit support, we need two stores.
35308 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
35309 SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
35310 StoredVal->ops().slice(0, 32));
35311 Lo = combinevXi1ConstantToInteger(Lo, DAG);
35312 SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl,
35313 StoredVal->ops().slice(32, 32));
35314 Hi = combinevXi1ConstantToInteger(Hi, DAG);
35316 unsigned Alignment = St->getAlignment();
35318 SDValue Ptr0 = St->getBasePtr();
35319 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 4, dl);
35322 DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
35323 Alignment, St->getMemOperand()->getFlags());
35325 DAG.getStore(St->getChain(), dl, Hi, Ptr1,
35326 St->getPointerInfo().getWithOffset(4),
35327 MinAlign(Alignment, 4U),
35328 St->getMemOperand()->getFlags());
35329 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
35332 StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
35333 return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
35334 St->getPointerInfo(), St->getAlignment(),
35335 St->getMemOperand()->getFlags());
35338 // If we are saving a concatenation of two XMM registers and 32-byte stores
35339 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
35341 unsigned AddressSpace = St->getAddressSpace();
35342 unsigned Alignment = St->getAlignment();
35343 if (VT.is256BitVector() && StVT == VT &&
35344 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
35345 AddressSpace, Alignment, &Fast) &&
35347 unsigned NumElems = VT.getVectorNumElements();
35351 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
35352 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
35354 SDValue Ptr0 = St->getBasePtr();
35355 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
35358 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
35359 Alignment, St->getMemOperand()->getFlags());
35361 DAG.getStore(St->getChain(), dl, Value1, Ptr1,
35362 St->getPointerInfo().getWithOffset(16),
35363 MinAlign(Alignment, 16U), St->getMemOperand()->getFlags());
35364 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
35367 // Optimize trunc store (of multiple scalars) to shuffle and store.
35368 // First, pack all of the elements in one place. Next, store to memory
35369 // in fewer chunks.
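// e.g. a v8i32 -> v8i8 truncating store can be rewritten as a shuffle that
// packs the low byte of each element into the bottom 8 bytes of the register,
// followed by a single 64-bit store.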
35370 if (St->isTruncatingStore() && VT.isVector()) {
35371 // Check if we can detect an AVG pattern from the truncation. If yes,
35372 // replace the trunc store with a normal store of the X86ISD::AVG result.
35374 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
35376 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
35377 St->getPointerInfo(), St->getAlignment(),
35378 St->getMemOperand()->getFlags());
35380 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35382 detectAVX512SSatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
35384 return EmitTruncSStore(true /* Signed saturation */, St->getChain(),
35385 dl, Val, St->getBasePtr(),
35386 St->getMemoryVT(), St->getMemOperand(), DAG);
35388 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget,
35390 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
35391 dl, Val, St->getBasePtr(),
35392 St->getMemoryVT(), St->getMemOperand(), DAG);
35394 unsigned NumElems = VT.getVectorNumElements();
35395 assert(StVT != VT && "Cannot truncate to the same type");
35396 unsigned FromSz = VT.getScalarSizeInBits();
35397 unsigned ToSz = StVT.getScalarSizeInBits();
35399 // The truncating store is legal in some cases. For example
35400 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
35401 // are designated for truncate store.
35402 // In this case we don't need any further transformations.
35403 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
35406 // From/To sizes and ElemCount must be powers of two.
35407 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
35408 // We are going to use the original vector elt for storing.
35409 // Accumulated smaller vector elements must be a multiple of the store size.
35410 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
35412 unsigned SizeRatio = FromSz / ToSz;
35414 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
35416 // Create a type on which we perform the shuffle
35417 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
35418 StVT.getScalarType(), NumElems*SizeRatio);
35420 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
35422 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
35423 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
35424 for (unsigned i = 0; i != NumElems; ++i)
35425 ShuffleVec[i] = i * SizeRatio;
35427 // Can't shuffle using an illegal type.
35428 if (!TLI.isTypeLegal(WideVecVT))
35431 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
35432 DAG.getUNDEF(WideVecVT),
35434 // At this point all of the data is stored at the bottom of the
35435 // register. We now need to store it to memory.
35437 // Find the largest store unit
35438 MVT StoreType = MVT::i8;
35439 for (MVT Tp : MVT::integer_valuetypes()) {
35440 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
35444 // On 32-bit systems, we can't store 64-bit integers directly. Try bitcasting to f64.
35445 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
35446 (64 <= NumElems * ToSz))
35447 StoreType = MVT::f64;
35449 // Bitcast the original vector into a vector of store-size units
35450 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
35451 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
35452 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
35453 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
35454 SmallVector<SDValue, 8> Chains;
35455 SDValue Ptr = St->getBasePtr();
35457 // Perform one or more big stores into memory.
35458 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
35459 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
35460 StoreType, ShuffWide,
35461 DAG.getIntPtrConstant(i, dl));
35463 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
35464 St->getAlignment(), St->getMemOperand()->getFlags());
35465 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
35466 Chains.push_back(Ch);
35469 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
35472 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
35473 // the FP state in cases where an emms may be missing.
35474 // A preferable solution to the general problem is to figure out the right
35475 // places to insert EMMS. This qualifies as a quick hack.
35477 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
35478 if (VT.getSizeInBits() != 64)
35481 const Function &F = DAG.getMachineFunction().getFunction();
35482 bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
35484 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
35485 if ((VT.isVector() ||
35486 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
35487 isa<LoadSDNode>(St->getValue()) &&
35488 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
35489 St->getChain().hasOneUse() && !St->isVolatile()) {
35490 LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
35491 SmallVector<SDValue, 8> Ops;
35493 if (!ISD::isNormalLoad(Ld))
35496 // If this is not the MMX case, i.e. we are just turning i64 load/store
35497 // into f64 load/store, avoid the transformation if there are multiple
35498 // uses of the loaded value.
35499 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
35504 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
35505 // Otherwise, if it's legal to use f64 SSE instructions, use an f64 load/store pair.
35507 if (Subtarget.is64Bit() || F64IsLegal) {
35508 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
35509 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
35510 Ld->getMemOperand());
35512 // Make sure new load is placed in same chain order.
35513 DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
35514 return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
35515 St->getMemOperand());
35518 // Otherwise, lower to two pairs of 32-bit loads / stores.
35519 SDValue LoAddr = Ld->getBasePtr();
35520 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
35522 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
35523 Ld->getPointerInfo(), Ld->getAlignment(),
35524 Ld->getMemOperand()->getFlags());
35525 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
35526 Ld->getPointerInfo().getWithOffset(4),
35527 MinAlign(Ld->getAlignment(), 4),
35528 Ld->getMemOperand()->getFlags());
35529 // Make sure new loads are placed in same chain order.
35530 DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
35531 DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
35533 LoAddr = St->getBasePtr();
35534 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
35537 DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
35538 St->getAlignment(), St->getMemOperand()->getFlags());
35539 SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
35540 St->getPointerInfo().getWithOffset(4),
35541 MinAlign(St->getAlignment(), 4),
35542 St->getMemOperand()->getFlags());
35543 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
35546 // This is similar to the above case, but here we handle a scalar 64-bit
35547 // integer store that is extracted from a vector on a 32-bit target.
35548 // If we have SSE2, then we can treat it like a floating-point double
35549 // to get past legalization. The execution dependencies fixup pass will
35550 // choose the optimal machine instruction for the store if this really is
35551 // an integer or v2f32 rather than an f64.
35552 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
35553 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
35554 SDValue OldExtract = St->getOperand(1);
35555 SDValue ExtOp0 = OldExtract.getOperand(0);
35556 unsigned VecSize = ExtOp0.getValueSizeInBits();
35557 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
35558 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
35559 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
35560 BitCast, OldExtract.getOperand(1));
35561 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
35562 St->getPointerInfo(), St->getAlignment(),
35563 St->getMemOperand()->getFlags());
35569 /// Return 'true' if this vector operation is "horizontal"
35570 /// and return the operands for the horizontal operation in LHS and RHS. A
35571 /// horizontal operation performs the binary operation on successive elements
35572 /// of its first operand, then on successive elements of its second operand,
35573 /// returning the resulting values in a vector. For example, if
35574 /// A = < float a0, float a1, float a2, float a3 >
35576 /// B = < float b0, float b1, float b2, float b3 >
35577 /// then the result of doing a horizontal operation on A and B is
35578 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
35579 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
35580 /// A horizontal-op B, for some already available A and B, and if so then LHS is
35581 /// set to A, RHS to B, and the routine returns 'true'.
35582 /// Note that the binary operation should have the property that if one of the
35583 /// operands is UNDEF then the result is UNDEF.
35584 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
35585 // Look for the following pattern: if
35586 // A = < float a0, float a1, float a2, float a3 >
35587 // B = < float b0, float b1, float b2, float b3 >
35589 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
35590 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
35591 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
35592 // which is A horizontal-op B.
35594 // At least one of the operands should be a vector shuffle.
35595 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
35596 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
35599 MVT VT = LHS.getSimpleValueType();
35601 assert((VT.is128BitVector() || VT.is256BitVector()) &&
35602 "Unsupported vector type for horizontal add/sub");
35604 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
35605 // operate independently on 128-bit lanes.
35606 unsigned NumElts = VT.getVectorNumElements();
35607 unsigned NumLanes = VT.getSizeInBits()/128;
35608 unsigned NumLaneElts = NumElts / NumLanes;
35609 assert((NumLaneElts % 2 == 0) &&
35610 "Vector type should have an even number of elements in each lane");
35611 unsigned HalfLaneElts = NumLaneElts/2;
35613 // View LHS in the form
35614 // LHS = VECTOR_SHUFFLE A, B, LMask
35615 // If LHS is not a shuffle then pretend it is the shuffle
35616 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
35617 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
35620 SmallVector<int, 16> LMask(NumElts);
35621 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35622 if (!LHS.getOperand(0).isUndef())
35623 A = LHS.getOperand(0);
35624 if (!LHS.getOperand(1).isUndef())
35625 B = LHS.getOperand(1);
35626 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
35627 std::copy(Mask.begin(), Mask.end(), LMask.begin());
35629 if (!LHS.isUndef())
35631 for (unsigned i = 0; i != NumElts; ++i)
35635 // Likewise, view RHS in the form
35636 // RHS = VECTOR_SHUFFLE C, D, RMask
35638 SmallVector<int, 16> RMask(NumElts);
35639 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
35640 if (!RHS.getOperand(0).isUndef())
35641 C = RHS.getOperand(0);
35642 if (!RHS.getOperand(1).isUndef())
35643 D = RHS.getOperand(1);
35644 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
35645 std::copy(Mask.begin(), Mask.end(), RMask.begin());
35647 if (!RHS.isUndef())
35649 for (unsigned i = 0; i != NumElts; ++i)
35653 // Check that the shuffles are both shuffling the same vectors.
35654 if (!(A == C && B == D) && !(A == D && B == C))
35657 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
35658 if (!A.getNode() && !B.getNode())
35661 // If A and B occur in reverse order in RHS, then "swap" them (which means
35662 // rewriting the mask).
35664 ShuffleVectorSDNode::commuteMask(RMask);
35666 // At this point LHS and RHS are equivalent to
35667 // LHS = VECTOR_SHUFFLE A, B, LMask
35668 // RHS = VECTOR_SHUFFLE A, B, RMask
35669 // Check that the masks correspond to performing a horizontal operation.
35670 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
35671 for (unsigned i = 0; i != NumLaneElts; ++i) {
35672 int LIdx = LMask[i+l], RIdx = RMask[i+l];
35674 // Ignore any UNDEF components.
35675 if (LIdx < 0 || RIdx < 0 ||
35676 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
35677 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
35680 // Check that successive elements are being operated on. If not, this is
35681 // not a horizontal operation.
35682 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
35683 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
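// Src selects which source the pair must come from: the low half of each lane
// reads from A (mask indices < NumElts), the high half from B (indices offset
// by NumElts).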
35684 if (!(LIdx == Index && RIdx == Index + 1) &&
35685 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
35690 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
35691 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
35695 /// Do target-specific dag combines on floating-point adds/subs.
35696 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
35697 const X86Subtarget &Subtarget) {
35698 EVT VT = N->getValueType(0);
35699 SDValue LHS = N->getOperand(0);
35700 SDValue RHS = N->getOperand(1);
35701 bool IsFadd = N->getOpcode() == ISD::FADD;
35702 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
35704 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
35705 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
35706 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
35707 isHorizontalBinOp(LHS, RHS, IsFadd)) {
35708 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
35709 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
35714 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify things later,
35716 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
35717 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
35718 const X86Subtarget &Subtarget,
35720 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
35721 SDValue Src = N->getOperand(0);
35722 unsigned Opcode = Src.getOpcode();
35723 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
35725 EVT VT = N->getValueType(0);
35726 EVT SrcVT = Src.getValueType();
35728 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
35729 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
35731 // Repeated operand, so we are only trading one output truncation for
35732 // one input truncation.
35736 // See if either operand has been extended from a smaller/equal size to
35737 // the truncation size, allowing a truncation to combine with the extend.
35738 unsigned Opcode0 = Op0.getOpcode();
35739 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
35740 Opcode0 == ISD::ZERO_EXTEND) &&
35741 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35744 unsigned Opcode1 = Op1.getOpcode();
35745 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
35746 Opcode1 == ISD::ZERO_EXTEND) &&
35747 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
35750 // See if either operand is a single use constant which can be constant folded.
35752 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
35753 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
35754 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
35755 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
35758 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
35759 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
35760 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
35761 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
35764 // Don't combine if the operation has other uses.
35765 if (!N->isOnlyUserOf(Src.getNode()))
35768 // Only support vector truncation for now.
35769 // TODO: i64 scalar math would benefit as well.
35770 if (!VT.isVector())
35773 // In most cases it's only worth pre-truncating if we're only facing the cost
35774 // of one truncation.
35775 // i.e. if one of the inputs will constant fold or the input is repeated.
35780 SDValue Op0 = Src.getOperand(0);
35781 SDValue Op1 = Src.getOperand(1);
35782 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
35783 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35784 return TruncateArithmetic(Op0, Op1);
35789 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
35790 // better to truncate if we have the chance.
35791 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
35792 !Subtarget.hasDQI())
35793 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
35796 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
35797 SDValue Op0 = Src.getOperand(0);
35798 SDValue Op1 = Src.getOperand(1);
35799 if (TLI.isOperationLegal(Opcode, VT) &&
35800 IsRepeatedOpOrFreeTruncation(Op0, Op1))
35801 return TruncateArithmetic(Op0, Op1);
35809 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
35811 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
35812 SmallVector<SDValue, 8> &Regs) {
35813 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
35814 Regs[0].getValueType() == MVT::v2i64));
35815 EVT OutVT = N->getValueType(0);
35816 EVT OutSVT = OutVT.getVectorElementType();
35817 EVT InVT = Regs[0].getValueType();
35818 EVT InSVT = InVT.getVectorElementType();
35821 // First, use mask to unset all bits that won't appear in the result.
35822 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
35823 "OutSVT can only be either i8 or i16.");
35825 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
35826 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
35827 for (auto &Reg : Regs)
35828 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
35830 MVT UnpackedVT, PackedVT;
35831 if (OutSVT == MVT::i8) {
35832 UnpackedVT = MVT::v8i16;
35833 PackedVT = MVT::v16i8;
35835 UnpackedVT = MVT::v4i32;
35836 PackedVT = MVT::v8i16;
35839 // In each iteration, truncate the element type to half its size.
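// e.g. truncating two masked v4i32 registers to v8i16 takes a single PACKUS
// step that merges each pair of registers into one.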
35840 auto RegNum = Regs.size();
35841 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
35842 j < e; j *= 2, RegNum /= 2) {
35843 for (unsigned i = 0; i < RegNum; i++)
35844 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
35845 for (unsigned i = 0; i < RegNum / 2; i++)
35846 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
35850 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
35851 // then extract a subvector as the result since v8i8 is not a legal type.
35852 if (OutVT == MVT::v8i8) {
35853 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
35854 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
35855 DAG.getIntPtrConstant(0, DL));
35857 } else if (RegNum > 1) {
35858 Regs.resize(RegNum);
35859 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
35864 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
35866 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
35868 SmallVector<SDValue, 8> &Regs) {
35869 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
35870 EVT OutVT = N->getValueType(0);
35873 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
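// This sign-fills the upper 16 bits of each i32 element from bit 15, so the
// PACKSS below (signed saturation) reproduces the low 16 bits exactly.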
35874 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
35875 for (auto &Reg : Regs) {
35876 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
35878 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
35882 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
35883 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
35886 if (Regs.size() > 2) {
35887 Regs.resize(Regs.size() / 2);
35888 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
35893 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
35894 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
35895 /// legalization the truncation will be translated into a BUILD_VECTOR with each
35896 /// element that is extracted from a vector and then truncated, and it is
35897 /// difficult to do this optimization based on them.
35898 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
35899 const X86Subtarget &Subtarget) {
35900 EVT OutVT = N->getValueType(0);
35901 if (!OutVT.isVector())
35904 SDValue In = N->getOperand(0);
35905 if (!In.getValueType().isSimple())
35908 EVT InVT = In.getValueType();
35909 unsigned NumElems = OutVT.getVectorNumElements();
35911 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
35912 // SSE2, and we need to take care of it specially.
35913 // AVX512 provides vpmovdb.
35914 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
35917 EVT OutSVT = OutVT.getVectorElementType();
35918 EVT InSVT = InVT.getVectorElementType();
35919 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
35920 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
35925 // SSSE3's pshufb results in fewer instructions in the cases below.
35925 if (Subtarget.hasSSSE3() && NumElems == 8 &&
35926 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
35927 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
35932 // Split a long vector into vectors of legal type.
35933 unsigned RegNum = InVT.getSizeInBits() / 128;
35934 SmallVector<SDValue, 8> SubVec(RegNum);
35935 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
35936 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
35938 for (unsigned i = 0; i < RegNum; i++)
35939 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
35940 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
35942 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
35943 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
35944 // truncate 2 x v4i32 to v8i16.
35945 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
35946 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
35947 if (InSVT == MVT::i32)
35948 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
35953 /// This function transforms vector truncation of 'extended sign-bits' or
35954 /// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
35955 /// X86ISD::PACKSS/PACKUS operations.
35956 static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
35958 const X86Subtarget &Subtarget) {
35959 // Requires SSE2 but AVX512 has fast truncate.
35960 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
35963 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
35966 SDValue In = N->getOperand(0);
35967 if (!In.getValueType().isSimple())
35970 MVT VT = N->getValueType(0).getSimpleVT();
35971 MVT SVT = VT.getScalarType();
35973 MVT InVT = In.getValueType().getSimpleVT();
35974 MVT InSVT = InVT.getScalarType();
35976 // Check we have a truncation suited for PACKSS.
35977 if (!VT.is128BitVector() && !VT.is256BitVector())
35979 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
35981 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
35984 // Use PACKSS if the input has sign-bits that extend all the way to the
35985 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
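// e.g. a vector compare result is all-ones or all-zeros per element, so every
// bit is a copy of the sign bit and PACKSS can narrow it without losing
// information.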
35986 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
35987 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
35988 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
35989 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
35991 // Use PACKUS if the input has zero-bits that extend all the way to the
35992 // packed/truncated value. e.g. masks, zext_in_reg, etc.
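// e.g. a v8i32 value whose upper 24 bits are known zero can be packed down to
// i8 elements with PACKUS without changing any element's value.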
35994 DAG.computeKnownBits(In, Known);
35995 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
35996 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
35997 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
35998 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
36003 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
36004 const X86Subtarget &Subtarget) {
36005 EVT VT = N->getValueType(0);
36006 SDValue Src = N->getOperand(0);
36009 // Attempt to pre-truncate inputs to arithmetic ops instead.
36010 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
36013 // Try to detect AVG pattern first.
36014 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
36017 // Try to combine truncation with signed/unsigned saturation.
36018 if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget))
36021 // The bitcast source is a direct mmx result.
36022 // Detect bitcasts between i32 to x86mmx
36023 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
36024 SDValue BCSrc = Src.getOperand(0);
36025 if (BCSrc.getValueType() == MVT::x86mmx)
36026 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
36029 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
36030 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
36033 return combineVectorTruncation(N, DAG, Subtarget);
36036 /// Returns the negated value if the node \p N flips sign of FP value.
36038 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
36039 /// AVX512F does not have FXOR, so FNEG is lowered as
36040 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
36041 /// In this case we look through all bitcasts.
36042 static SDValue isFNEG(SDNode *N) {
36043 if (N->getOpcode() == ISD::FNEG)
36044 return N->getOperand(0);
36046 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
36047 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
36050 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
36051 if (!Op1.getValueType().isFloatingPoint())
36054 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
36056 unsigned EltBits = Op1.getScalarValueSizeInBits();
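// The FP sign mask has only the top bit of each element set, e.g. 0x80000000
// for f32 elements and 0x8000000000000000 for f64 elements.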
36057 auto isSignMask = [&](const ConstantFP *C) {
36058 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
36061 // There is more than one way to represent the same constant on
36062 // the different X86 targets. The type of the node may also depend on size.
36063 // - load scalar value and broadcast
36064 // - BUILD_VECTOR node
36065 // - load from a constant pool.
36066 // We check all variants here.
36067 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
36068 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
36069 if (isSignMask(cast<ConstantFP>(C)))
36072 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
36073 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
36074 if (isSignMask(CN->getConstantFPValue()))
36077 } else if (auto *C = getTargetConstantFromNode(Op1)) {
36078 if (C->getType()->isVectorTy()) {
36079 if (auto *SplatV = C->getSplatValue())
36080 if (isSignMask(cast<ConstantFP>(SplatV)))
36082 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
36083 if (isSignMask(FPConst))
36089 /// Do target-specific dag combines on floating point negations.
36090 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
36091 const X86Subtarget &Subtarget) {
36092 EVT OrigVT = N->getValueType(0);
36093 SDValue Arg = isFNEG(N);
36094 assert(Arg.getNode() && "N is expected to be an FNEG node");
36096 EVT VT = Arg.getValueType();
36097 EVT SVT = VT.getScalarType();
36100 // Let legalize expand this if it isn't a legal type yet.
36101 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36104 // If we're negating a FMUL node on a target with FMA, then we can avoid the
36105 // use of a constant by performing (-0 - A*B) instead.
36106 // FIXME: Check rounding control flags as well once it becomes available.
36107 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
36108 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
36109 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
36110 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
36111 Arg.getOperand(1), Zero);
36112 return DAG.getBitcast(OrigVT, NewNode);
36115 // If we're negating an FMA node, then we can adjust the
36116 // instruction to include the extra negation.
36117 unsigned NewOpcode = 0;
36118 if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
36119 switch (Arg.getOpcode()) {
36120 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
36121 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
36122 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
36123 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
36124 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
36125 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
36126 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
36127 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
36128 // We can't handle a scalar intrinsic node here because it would only
36129 // invert one element and not the whole vector. But we could try to handle
36130 // a negation of the lower element only.
36134 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
36135 Arg.getNode()->ops()));
36140 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
36141 const X86Subtarget &Subtarget) {
36142 MVT VT = N->getSimpleValueType(0);
36143 // If we have integer vector types available, use the integer opcodes.
36144 if (VT.isVector() && Subtarget.hasSSE2()) {
36147 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
36149 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
36150 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
36151 unsigned IntOpcode;
36152 switch (N->getOpcode()) {
36153 default: llvm_unreachable("Unexpected FP logic op");
36154 case X86ISD::FOR: IntOpcode = ISD::OR; break;
36155 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
36156 case X86ISD::FAND: IntOpcode = ISD::AND; break;
36157 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
36159 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
36160 return DAG.getBitcast(VT, IntOp);
36166 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
36167 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
36168 if (N->getOpcode() != ISD::XOR)
36171 SDValue LHS = N->getOperand(0);
36172 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
36173 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
36176 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
36177 X86::CondCode(LHS->getConstantOperandVal(0)));
36179 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
36182 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
36183 TargetLowering::DAGCombinerInfo &DCI,
36184 const X86Subtarget &Subtarget) {
36185 // If this is SSE1 only convert to FXOR to avoid scalarization.
36186 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
36187 N->getValueType(0) == MVT::v4i32) {
36188 return DAG.getBitcast(
36189 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
36190 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
36191 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
36194 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
36197 if (DCI.isBeforeLegalizeOps())
36200 if (SDValue SetCC = foldXor1SetCC(N, DAG))
36203 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
36206 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
36210 return combineFneg(N, DAG, Subtarget);
36215 static bool isNullFPScalarOrVectorConst(SDValue V) {
36216 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
36219 /// If a value is a scalar FP zero or a vector FP zero (potentially including
36220 /// undefined elements), return a zero constant that may be used to fold away
36221 /// that value. In the case of a vector, the returned constant will not contain
36222 /// undefined elements even if the input parameter does. This makes it suitable
36223 /// to be used as a replacement operand with operations (eg, bitwise-and) where
36224 /// an undef should not propagate.
36225 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
36226 const X86Subtarget &Subtarget) {
36227 if (!isNullFPScalarOrVectorConst(V))
36230 if (V.getValueType().isVector())
36231 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
36236 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
36237 const X86Subtarget &Subtarget) {
36238 SDValue N0 = N->getOperand(0);
36239 SDValue N1 = N->getOperand(1);
36240 EVT VT = N->getValueType(0);
36243 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
36244 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
36245 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
36246 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
36249 auto isAllOnesConstantFP = [](SDValue V) {
36250 if (V.getSimpleValueType().isVector())
36251 return ISD::isBuildVectorAllOnes(V.getNode());
36252 auto *C = dyn_cast<ConstantFPSDNode>(V);
36253 return C && C->getConstantFPValue()->isAllOnesValue();
36256 // fand (fxor X, -1), Y --> fandn X, Y
36257 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
36258 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
36260 // fand X, (fxor Y, -1) --> fandn Y, X
36261 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
36262 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
36267 /// Do target-specific dag combines on X86ISD::FAND nodes.
36268 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
36269 const X86Subtarget &Subtarget) {
36270 // FAND(0.0, x) -> 0.0
36271 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
36274 // FAND(x, 0.0) -> 0.0
36275 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
36278 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
36281 return lowerX86FPLogicOp(N, DAG, Subtarget);
36284 /// Do target-specific dag combines on X86ISD::FANDN nodes.
36285 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
36286 const X86Subtarget &Subtarget) {
36287 // FANDN(0.0, x) -> x
36288 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
36289 return N->getOperand(1);
36291 // FANDN(x, 0.0) -> 0.0
36292 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
36295 return lowerX86FPLogicOp(N, DAG, Subtarget);
36298 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
36299 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
36300 const X86Subtarget &Subtarget) {
36301 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
36303 // F[X]OR(0.0, x) -> x
36304 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
36305 return N->getOperand(1);
36307 // F[X]OR(x, 0.0) -> x
36308 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
36309 return N->getOperand(0);
36312 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
36315 return lowerX86FPLogicOp(N, DAG, Subtarget);
36318 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
36319 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
36320 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
36322 // Only perform optimizations if UnsafeMath is used.
36323 if (!DAG.getTarget().Options.UnsafeFPMath)
36326 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
36327 // into FMAXC and FMINC, which are commutative operations.
36328 unsigned NewOp = 0;
36329 switch (N->getOpcode()) {
36330 default: llvm_unreachable("unknown opcode");
36331 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
36332 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
36335 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
36336 N->getOperand(0), N->getOperand(1));
36339 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
36340 const X86Subtarget &Subtarget) {
36341 if (Subtarget.useSoftFloat())
36344 // TODO: Check for global or instruction-level "nnan". In that case, we
36345 // should be able to lower to FMAX/FMIN alone.
36346 // TODO: If an operand is already known to be a NaN or not a NaN, this
36347 // should be an optional swap and FMAX/FMIN.
36349 EVT VT = N->getValueType(0);
36350 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
36351 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
36352 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
36355 // This takes at least 3 instructions, so favor a library call when operating
36356 // on a scalar and minimizing code size.
36357 if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
36360 SDValue Op0 = N->getOperand(0);
36361 SDValue Op1 = N->getOperand(1);
36363 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
36364 DAG.getDataLayout(), *DAG.getContext(), VT);
36366 // There are 4 possibilities involving NaN inputs, and these are the required
36367 // outputs:
36368 //                     Op1
36369 //                 Num     NaN
36370 //              ----------------
36371 //      Num     |  Max  |  Op0 |
36372 // Op0          ----------------
36373 //      NaN     |  Op1  |  NaN |
36374 //              ----------------
36376 // The SSE FP max/min instructions were not designed for this case, but rather for these semantics:
36378 // Min = Op1 < Op0 ? Op1 : Op0
36379 // Max = Op1 > Op0 ? Op1 : Op0
36381 // So they always return Op0 if either input is a NaN. However, we can still
36382 // use those instructions for fmaxnum by selecting away a NaN input.
36384 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
36385 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
36386 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
36387 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
36389 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
36390 // are NaN, the NaN value of Op1 is the result.
36391 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
36394 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
36395 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
36396 TargetLowering::DAGCombinerInfo &DCI,
36397 const X86Subtarget &Subtarget) {
36398 // ANDNP(0, x) -> x
36399 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
36400 return N->getOperand(1);
36402 // ANDNP(x, 0) -> 0
36403 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
36404 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
36406 EVT VT = N->getValueType(0);
36408 // Attempt to recursively combine a bitmask ANDNP with shuffles.
36409 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
36411 if (SDValue Res = combineX86ShufflesRecursively(
36412 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
36413 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
36414 DCI.CombineTo(N, Res);
36422 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
36423 TargetLowering::DAGCombinerInfo &DCI) {
36424 SDValue N0 = N->getOperand(0);
36425 SDValue N1 = N->getOperand(1);
36427 // BT ignores high bits in the bit index operand.
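// e.g. for a 64-bit BT only the low 6 bits of the index matter, so an explicit
// (and x, 63) masking of the index operand can be dropped.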
36428 unsigned BitWidth = N1.getValueSizeInBits();
36429 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
36430 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
36431 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
36436 // Try to combine sext_in_reg of a cmov of constants by extending the constants.
36437 static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) {
36438 EVT VT = N->getValueType(0);
36440 SDValue N0 = N->getOperand(0);
36441 SDValue N1 = N->getOperand(1);
36442 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
36444 if (ExtraVT != MVT::i16)
36447 // Look through single use any_extends.
36448 if (N0.getOpcode() == ISD::ANY_EXTEND && N0.hasOneUse())
36449 N0 = N0.getOperand(0);
36451 // See if we have a single use cmov.
36452 if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse())
36455 SDValue CMovOp0 = N0.getOperand(0);
36456 SDValue CMovOp1 = N0.getOperand(1);
36458 // Make sure both operands are constants.
36459 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
36460 !isa<ConstantSDNode>(CMovOp1.getNode()))
36465 // If we looked through an any_extend above, add one to the constants.
36466 if (N0.getValueType() != VT) {
36467 CMovOp0 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp0);
36468 CMovOp1 = DAG.getNode(ISD::ANY_EXTEND, DL, VT, CMovOp1);
36471 CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp0, N1);
36472 CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, CMovOp1, N1);
36474 return DAG.getNode(X86ISD::CMOV, DL, VT, CMovOp0, CMovOp1,
36475 N0.getOperand(2), N0.getOperand(3));
36478 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
36479 const X86Subtarget &Subtarget) {
36480 if (SDValue V = combineSextInRegCmov(N, DAG))
36483 EVT VT = N->getValueType(0);
36484 SDValue N0 = N->getOperand(0);
36485 SDValue N1 = N->getOperand(1);
36486 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
36489 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
36490 // both SSE and AVX2 since there is no sign-extended shift right
36491 // operation on a vector with 64-bit elements.
36492 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
36493 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
36494 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
36495 N0.getOpcode() == ISD::SIGN_EXTEND)) {
36496 SDValue N00 = N0.getOperand(0);
36498 // EXTLOAD has a better solution on AVX2:
36499 // it may be replaced with an X86ISD::VSEXT node.
36500 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
36501 if (!ISD::isNormalLoad(N00.getNode()))
36504 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
36505 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
36507 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
36513 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
36514 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
36515 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
36516 /// opportunities to combine math ops, use an LEA, or use a complex addressing
36517 /// mode. This can eliminate extend, add, and shift instructions.
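/// e.g. (i64 sext (i32 add nsw (x, 5))) becomes (i64 add nsw (i64 sext x), 5),
/// and the wider add can then fold into an LEA with displacement 5.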
36518 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
36519 const X86Subtarget &Subtarget) {
36520 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
36521 Ext->getOpcode() != ISD::ZERO_EXTEND)
36524 // TODO: This should be valid for other integer types.
36525 EVT VT = Ext->getValueType(0);
36526 if (VT != MVT::i64)
36529 SDValue Add = Ext->getOperand(0);
36530 if (Add.getOpcode() != ISD::ADD)
36533 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
36534 bool NSW = Add->getFlags().hasNoSignedWrap();
36535 bool NUW = Add->getFlags().hasNoUnsignedWrap();
36537 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding into the 'zext'.
36539 if ((Sext && !NSW) || (!Sext && !NUW))
36542 // Having a constant operand to the 'add' ensures that we are not increasing
36543 // the instruction count because the constant is extended for free below.
36544 // A constant operand can also become the displacement field of an LEA.
36545 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
36549 // Don't make the 'add' bigger if there's no hope of combining it with some
36550 // other 'add' or 'shl' instruction.
36551 // TODO: It may be profitable to generate simpler LEA instructions in place
36552 // of single 'add' instructions, but the cost model for selecting an LEA
36553 // currently has a high threshold.
36554 bool HasLEAPotential = false;
36555 for (auto *User : Ext->uses()) {
36556 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
36557 HasLEAPotential = true;
36561 if (!HasLEAPotential)
36564 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
36565 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
36566 SDValue AddOp0 = Add.getOperand(0);
36567 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
36568 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
36570 // The wider add is guaranteed to not wrap because both operands are extended, so their sum fits in the wider type.
36573 Flags.setNoSignedWrap(NSW);
36574 Flags.setNoUnsignedWrap(NUW);
36575 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
36578 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
36579 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
36580 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
36581 /// extends from AH (which we otherwise need to do contortions to access).
36582 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
36583 SDValue N0 = N->getOperand(0);
36584 auto OpcodeN = N->getOpcode();
36585 auto OpcodeN0 = N0.getOpcode();
36586 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
36587 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
36590 EVT VT = N->getValueType(0);
36591 EVT InVT = N0.getValueType();
36592 if (N0.getResNo() != 1 || InVT != MVT::i8 ||
36593 !(VT == MVT::i32 || VT == MVT::i64))
36596 SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
36597 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
36598 : X86ISD::UDIVREM8_ZEXT_HREG;
36599 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
36601 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
36602 // If this was a 64-bit extend, complete it.
36603 if (VT == MVT::i64)
36604 return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
36605 return R.getValue(1);
36608 // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
36609 // operands and the result of CMOV is not used anywhere else - promote CMOV
36610 // itself instead of promoting its result. This could be beneficial, because:
36611 // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
36612 // (or more) pseudo-CMOVs only when they go one-after-another and
36613 // getting rid of result extension code after CMOV will help that.
36614 // 2) Promotion of constant CMOV arguments is free, hence the
36615 // {ANY,SIGN,ZERO}_EXTEND will just be deleted.
36616 // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
36617 // promotion is also good in terms of code size.
36618 // (64-bit CMOV is 4 bytes, which is why we don't do a 32-bit => 64-bit promotion.)
36620 static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
36621 SDValue CMovN = Extend->getOperand(0);
36622 if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse())
36625 EVT TargetVT = Extend->getValueType(0);
36626 unsigned ExtendOpcode = Extend->getOpcode();
36629 EVT VT = CMovN.getValueType();
36630 SDValue CMovOp0 = CMovN.getOperand(0);
36631 SDValue CMovOp1 = CMovN.getOperand(1);
36633 if (!isa<ConstantSDNode>(CMovOp0.getNode()) ||
36634 !isa<ConstantSDNode>(CMovOp1.getNode()))
36637 // Only extend to i32 or i64.
36638 if (TargetVT != MVT::i32 && TargetVT != MVT::i64)
36641 // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32
36643 if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32))
36646 // If this is a zero extend to i64, we should only extend to i32 and use a free
36647 // zero extend to finish.
36648 EVT ExtendVT = TargetVT;
36649 if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND)
36650 ExtendVT = MVT::i32;
36652 CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0);
36653 CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1);
36655 SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1,
36656 CMovN.getOperand(2), CMovN.getOperand(3));
36658 // Finish extending if needed.
36659 if (ExtendVT != TargetVT)
36660 Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res);
36665 // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
36666 // This is more or less the reverse of combineBitcastvxi1.
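// e.g. (v16i8 zext (v16i1 bitcast (i16 x))) broadcasts x to every byte, masks
// each byte with the bit it represents, compares against that mask, and then
// adjusts the result for the extension kind.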
36668 combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
36669 TargetLowering::DAGCombinerInfo &DCI,
36670 const X86Subtarget &Subtarget) {
36671 unsigned Opcode = N->getOpcode();
36672 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
36673 Opcode != ISD::ANY_EXTEND)
36675 if (!DCI.isBeforeLegalizeOps())
36677 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
36680 SDValue N0 = N->getOperand(0);
36681 EVT VT = N->getValueType(0);
36682 EVT SVT = VT.getScalarType();
36683 EVT InSVT = N0.getValueType().getScalarType();
36684 unsigned EltSizeInBits = SVT.getSizeInBits();
36686 // Input type must be extending a bool vector (bit-casted from a scalar
36687 // integer) to legal integer types.
36688 if (!VT.isVector())
36690 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
36692 if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
36695 SDValue N00 = N0.getOperand(0);
36696 EVT SclVT = N0.getOperand(0).getValueType();
36697 if (!SclVT.isScalarInteger())
36702 SmallVector<int, 32> ShuffleMask;
36703 unsigned NumElts = VT.getVectorNumElements();
36704 assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
36706 // Broadcast the scalar integer to the vector elements.
36707 if (NumElts > EltSizeInBits) {
36708 // If the scalar integer is greater than the vector element size, then we
36709 // must split it down into sub-sections for broadcasting. For example:
36710 // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
36711 // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
36712 assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
36713 unsigned Scale = NumElts / EltSizeInBits;
36715 EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
36716 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
36717 Vec = DAG.getBitcast(VT, Vec);
36719 for (unsigned i = 0; i != Scale; ++i)
36720 ShuffleMask.append(EltSizeInBits, i);
36722 // For smaller scalar integers, we can simply any-extend the value to the vector
36723 // element size (we don't care about the upper bits) and broadcast it to all elements.
36725 SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
36726 Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
36727 ShuffleMask.append(NumElts, 0);
36729 Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
36731 // Now, mask the relevant bit in each element.
36732 SmallVector<SDValue, 32> Bits;
36733 for (unsigned i = 0; i != NumElts; ++i) {
36734 int BitIdx = (i % EltSizeInBits);
36735 APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
36736 Bits.push_back(DAG.getConstant(Bit, DL, SVT));
36738 SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
36739 Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
36741 // Compare against the bitmask and extend the result.
36742 EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
36743 Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
36744 Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
36746 // For SEXT, this is now done; otherwise shift the result down so only bit 0 remains set for ZERO/ANY_EXTEND.
36748 if (Opcode == ISD::SIGN_EXTEND)
36750 return DAG.getNode(ISD::SRL, DL, VT, Vec,
36751 DAG.getConstant(EltSizeInBits - 1, DL, VT));
36754 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
36755 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
36756 /// with UNDEFs) of the input to vectors of the same size as the target type
36757 /// which then extends the lowest elements.
36758 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
36759 TargetLowering::DAGCombinerInfo &DCI,
36760 const X86Subtarget &Subtarget) {
36761 unsigned Opcode = N->getOpcode();
36762 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
36764 if (!DCI.isBeforeLegalizeOps())
36766 if (!Subtarget.hasSSE2())
36769 SDValue N0 = N->getOperand(0);
36770 EVT VT = N->getValueType(0);
36771 EVT SVT = VT.getScalarType();
36772 EVT InVT = N0.getValueType();
36773 EVT InSVT = InVT.getScalarType();
36775 // Input type must be a vector and we must be extending legal integer types.
36776 if (!VT.isVector())
36778 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
36780 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
36783 // On AVX2+ targets, if the input/output types are both legal then we will be
36784 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
36785 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
36786 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
36791 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
36792 EVT InVT = N.getValueType();
36793 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
36794 Size / InVT.getScalarSizeInBits());
36795 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
36796 DAG.getUNDEF(InVT));
36798 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
36801 // If the target size is less than 128 bits, extend to a type that would extend
36802 // to 128 bits, extend that, and extract the original target vector.
36803 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
36804 unsigned Scale = 128 / VT.getSizeInBits();
36805 EVT ExVT =
36806 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
36807 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
36808 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
36809 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
36810 DAG.getIntPtrConstant(0, DL));
36813 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
36814 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
36815 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
36816 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
36817 (VT.is256BitVector() && Subtarget.hasInt256()) ||
36818 (VT.is512BitVector() && Subtarget.useAVX512Regs())) {
36819 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
36820 return Opcode == ISD::SIGN_EXTEND
36821 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
36822 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
36825 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
36826 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
36827 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
36828 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
36829 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
36831 SmallVector<SDValue, 8> Opnds;
36832 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
36833 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
36834 DAG.getIntPtrConstant(Offset, DL));
36835 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
36836 SrcVec = Opcode == ISD::SIGN_EXTEND
36837 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
36838 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
36839 Opnds.push_back(SrcVec);
36840 }
36841 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
36842 };
36844 // On pre-AVX2 targets, split into 128-bit nodes of
36845 // ISD::*_EXTEND_VECTOR_INREG.
36846 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
36847 return SplitAndExtendInReg(128);
36849 // On pre-AVX512 targets, split into 256-bit nodes of
36850 // ISD::*_EXTEND_VECTOR_INREG.
36851 if (!Subtarget.useAVX512Regs() && !(VT.getSizeInBits() % 256))
36852 return SplitAndExtendInReg(256);
36854 return SDValue();
36855 }
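// Illustrative sketch of the splitting performed above: on an SSE4.1 target
// without AVX2, (v8i32 sext (v8i16 X)) is roughly rewritten as
//   Lo = sign_extend_vector_inreg X                      ; elements 0..3
//   Hi = sign_extend_vector_inreg (extract_subvector X, 4)
//   concat_vectors Lo, Hi
// so each half lowers to a single 128-bit PMOVSX.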
36857 // Attempt to combine a (sext/zext (setcc)) to a setcc with a xmm/ymm/zmm
36859 static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
36860 const X86Subtarget &Subtarget) {
36861 SDValue N0 = N->getOperand(0);
36862 EVT VT = N->getValueType(0);
36863 SDLoc dl(N);
36865 // Only do this combine with AVX512 for vector extends.
36866 if (!Subtarget.hasAVX512() || !VT.isVector() || N0->getOpcode() != ISD::SETCC)
36867 return SDValue();
36869 // Only combine legal element types.
36870 EVT SVT = VT.getVectorElementType();
36871 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 &&
36872 SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64)
36873 return SDValue();
36875 // We can only do this if the vector size is 256 bits or less.
36876 unsigned Size = VT.getSizeInBits();
36877 if (Size > 256)
36878 return SDValue();
36880 // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
36881 // those are the only integer compares we have.
36882 ISD::CondCode CC = cast<CondCodeSDNode>(N0->getOperand(2))->get();
36883 if (ISD::isUnsignedIntSetCC(CC))
36884 return SDValue();
36886 // Only do this combine if the extension will be fully consumed by the setcc.
36887 EVT N00VT = N0.getOperand(0).getValueType();
36888 EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
36889 if (Size != MatchingVecType.getSizeInBits())
36890 return SDValue();
36892 SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
36894 if (N->getOpcode() == ISD::ZERO_EXTEND)
36895 Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
36897 return Res;
36898 }
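// Illustrative example of the combine above: with AVX512,
//   (v8i32 sext (setcc (v8i32 A), (v8i32 B), setlt))
// becomes a single (v8i32 setcc A, B, setlt) whose lanes are already the
// sign-extended 0/-1 values, so no separate extend of a v8i1 mask is needed.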
36900 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
36901 TargetLowering::DAGCombinerInfo &DCI,
36902 const X86Subtarget &Subtarget) {
36903 SDValue N0 = N->getOperand(0);
36904 EVT VT = N->getValueType(0);
36905 EVT InVT = N0.getValueType();
36906 SDLoc DL(N);
36908 if (SDValue DivRem8 = getDivRem8(N, DAG))
36909 return DivRem8;
36911 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
36912 return NewCMov;
36914 if (!DCI.isBeforeLegalizeOps())
36915 return SDValue();
36917 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
36918 return V;
36920 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
36921 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
36922 // Invert and sign-extend a boolean is the same as zero-extend and subtract
36923 // 1 because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
36924 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
36925 // sext (xor Bool, -1) --> sub (zext Bool), 1
36926 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
36927 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
36928 }
36930 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
36931 return V;
36933 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
36934 return V;
36936 if (Subtarget.hasAVX() && VT.is256BitVector())
36937 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
36938 return R;
36940 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
36941 return NewAdd;
36943 return SDValue();
36944 }
36946 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
36947 const X86Subtarget &Subtarget) {
36948 // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
36949 SDLoc dl(N);
36950 EVT VT = N->getValueType(0);
36952 // Let legalize expand this if it isn't a legal type yet.
36953 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
36954 return SDValue();
36956 EVT ScalarVT = VT.getScalarType();
36957 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
36958 return SDValue();
36960 SDValue A = N->getOperand(0);
36961 SDValue B = N->getOperand(1);
36962 SDValue C = N->getOperand(2);
36964 auto invertIfNegative = [](SDValue &V) {
36965 if (SDValue NegVal = isFNEG(V.getNode())) {
36966 V = NegVal;
36967 return true;
36968 }
36969 return false;
36970 };
36972 // Do not convert the passthru input of scalar intrinsics.
36973 // FIXME: We could allow negations of the lower element only.
36974 bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
36975 N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
36976 bool NegB = invertIfNegative(B);
36977 bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
36978 N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
36980 // Negative multiplication when NegA xor NegB
36981 bool NegMul = (NegA != NegB);
36982 bool HasNeg = NegA || NegB || NegC;
36984 unsigned NewOpcode;
36985 if (!NegMul)
36986 NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
36987 else
36988 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
36990 // For FMA, we risk reconstructing the node we started with.
36991 // In order to avoid this, we check for negation or opcode change. If
36992 // one of the two happened, then it is a new node and we return it.
36993 if (N->getOpcode() == ISD::FMA) {
36994 if (HasNeg || NewOpcode != N->getOpcode())
36995 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
36996 return SDValue();
36997 }
36999 if (N->getOpcode() == X86ISD::FMADD_RND) {
37000 switch (NewOpcode) {
37001 case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
37002 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
37003 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
37004 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
37006 } else if (N->getOpcode() == X86ISD::FMADDS1) {
37007 switch (NewOpcode) {
37008 case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
37009 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
37010 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
37011 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
37013 } else if (N->getOpcode() == X86ISD::FMADDS3) {
37014 switch (NewOpcode) {
37015 case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
37016 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
37017 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
37018 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
37020 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
37021 switch (NewOpcode) {
37022 case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
37023 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
37024 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
37025 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
37027 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
37028 switch (NewOpcode) {
37029 case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
37030 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
37031 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
37032 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
37034 } else if (N->getOpcode() == X86ISD::FMADD4S) {
37035 switch (NewOpcode) {
37036 case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
37037 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
37038 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
37039 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
37040 }
37041 } else {
37042 llvm_unreachable("Unexpected opcode!");
37043 }
37045 // Only return the node if the opcode was changed or one of the
37046 // operands was negated. If not, we'll just recreate the same node.
37047 if (HasNeg || NewOpcode != N->getOpcode()) {
37048 if (N->getNumOperands() == 4)
37049 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
37050 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
37051 }
37053 return SDValue();
37054 }
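// Illustrative examples of the negation folding above (a sketch, assuming
// plain ISD::FMA as the starting opcode):
//   (fma (fneg A), B, C)        --> (X86ISD::FNMADD A, B, C)
//   (fma A, B, (fneg C))        --> (X86ISD::FMSUB  A, B, C)
//   (fma (fneg A), B, (fneg C)) --> (X86ISD::FNMSUB A, B, C)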
37056 // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
37057 static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
37058 const X86Subtarget &Subtarget) {
37059 SDLoc dl(N);
37060 EVT VT = N->getValueType(0);
37062 SDValue NegVal = isFNEG(N->getOperand(2).getNode());
37063 if (!NegVal)
37064 return SDValue();
37066 unsigned NewOpcode;
37067 switch (N->getOpcode()) {
37068 default: llvm_unreachable("Unexpected opcode!");
37069 case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
37070 case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
37071 case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
37072 case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
37075 if (N->getNumOperands() == 4)
37076 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37077 NegVal, N->getOperand(3));
37078 return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
37079 NegVal);
37080 }
37082 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
37083 TargetLowering::DAGCombinerInfo &DCI,
37084 const X86Subtarget &Subtarget) {
37085 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
37086 // (and (i32 x86isd::setcc_carry), 1)
37087 // This eliminates the zext. This transformation is necessary because
37088 // ISD::SETCC is always legalized to i8.
37089 SDLoc dl(N);
37090 SDValue N0 = N->getOperand(0);
37091 EVT VT = N->getValueType(0);
37093 if (N0.getOpcode() == ISD::AND &&
37094 N0.hasOneUse() &&
37095 N0.getOperand(0).hasOneUse()) {
37096 SDValue N00 = N0.getOperand(0);
37097 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37098 if (!isOneConstant(N0.getOperand(1)))
37099 return SDValue();
37100 return DAG.getNode(ISD::AND, dl, VT,
37101 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37102 N00.getOperand(0), N00.getOperand(1)),
37103 DAG.getConstant(1, dl, VT));
37104 }
37105 }
37107 if (N0.getOpcode() == ISD::TRUNCATE &&
37108 N0.hasOneUse() &&
37109 N0.getOperand(0).hasOneUse()) {
37110 SDValue N00 = N0.getOperand(0);
37111 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
37112 return DAG.getNode(ISD::AND, dl, VT,
37113 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
37114 N00.getOperand(0), N00.getOperand(1)),
37115 DAG.getConstant(1, dl, VT));
37116 }
37117 }
37119 if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
37120 return NewCMov;
37122 if (DCI.isBeforeLegalizeOps())
37123 if (SDValue V = combineExtSetcc(N, DAG, Subtarget))
37124 return V;
37126 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
37127 return V;
37129 if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
37130 return V;
37132 if (Subtarget.hasAVX() && VT.is256BitVector())
37133 if (SDValue R = WidenMaskArithmetic(N, DAG, Subtarget))
37134 return R;
37136 if (SDValue DivRem8 = getDivRem8(N, DAG))
37137 return DivRem8;
37139 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
37140 return NewAdd;
37142 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
37143 return R;
37145 return SDValue();
37146 }
37148 /// Try to map a 128-bit or larger integer comparison to vector instructions
37149 /// before type legalization splits it up into chunks.
37150 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
37151 const X86Subtarget &Subtarget) {
37152 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
37153 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
37155 // We're looking for an oversized integer equality comparison.
37156 SDValue X = SetCC->getOperand(0);
37157 SDValue Y = SetCC->getOperand(1);
37158 EVT OpVT = X.getValueType();
37159 unsigned OpSize = OpVT.getSizeInBits();
37160 if (!OpVT.isScalarInteger() || OpSize < 128)
37161 return SDValue();
37163 // Ignore a comparison with zero because that gets special treatment in
37164 // EmitTest(). But make an exception for the special case of a pair of
37165 // logically-combined vector-sized operands compared to zero. This pattern may
37166 // be generated by the memcmp expansion pass with oversized integer compares
37168 bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
37169 X.getOperand(0).getOpcode() == ISD::XOR &&
37170 X.getOperand(1).getOpcode() == ISD::XOR;
37171 if (isNullConstant(Y) && !IsOrXorXorCCZero)
37172 return SDValue();
37174 // Bail out if we know that this is not really just an oversized integer.
37175 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
37176 peekThroughBitcasts(Y).getValueType() == MVT::f128)
37177 return SDValue();
37179 // TODO: Use PXOR + PTEST for SSE4.1 or later?
37180 // TODO: Add support for AVX-512.
37181 EVT VT = SetCC->getValueType(0);
37182 SDLoc DL(SetCC);
37183 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
37184 (OpSize == 256 && Subtarget.hasAVX2())) {
37185 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
37186 SDValue Cmp;
37187 if (IsOrXorXorCCZero) {
37188 // This is a bitwise-combined equality comparison of 2 pairs of vectors:
37189 // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
37190 // Use 2 vector equality compares and 'and' the results before doing a
37192 SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
37193 SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
37194 SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
37195 SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
37196 SDValue Cmp1 = DAG.getSetCC(DL, VecVT, A, B, ISD::SETEQ);
37197 SDValue Cmp2 = DAG.getSetCC(DL, VecVT, C, D, ISD::SETEQ);
37198 Cmp = DAG.getNode(ISD::AND, DL, VecVT, Cmp1, Cmp2);
37199 } else {
37200 SDValue VecX = DAG.getBitcast(VecVT, X);
37201 SDValue VecY = DAG.getBitcast(VecVT, Y);
37202 Cmp = DAG.getSetCC(DL, VecVT, VecX, VecY, ISD::SETEQ);
37204 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
37205 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
37206 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
37207 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
37208 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
37209 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
37210 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
37211 MVT::i32);
37212 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
37213 }
37215 return SDValue();
37216 }
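// An illustrative IR source for the or(xor, xor) special case handled above,
// as the memcmp expansion commonly emits it for a 32-byte compare:
//   %x = xor i128 %a, %b
//   %y = xor i128 %c, %d
//   %o = or i128 %x, %y
//   %r = icmp eq i128 %o, 0
// Each xor pair becomes a vector PCMPEQB and the results are ANDed before the
// final MOVMSK compare.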
37218 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
37219 const X86Subtarget &Subtarget) {
37220 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
37221 SDValue LHS = N->getOperand(0);
37222 SDValue RHS = N->getOperand(1);
37223 EVT VT = N->getValueType(0);
37224 EVT OpVT = LHS.getValueType();
37225 SDLoc DL(N);
37227 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
37228 // 0-x == y --> x+y == 0
37229 // 0-x != y --> x+y != 0
37230 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
37231 LHS.hasOneUse()) {
37232 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
37233 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37234 }
37235 // x == 0-y --> x+y == 0
37236 // x != 0-y --> x+y != 0
37237 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
37238 RHS.hasOneUse()) {
37239 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
37240 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
37241 }
37243 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
37244 return V;
37245 }
37247 if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
37248 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
37249 // Put build_vectors on the right.
37250 if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
37251 std::swap(LHS, RHS);
37252 CC = ISD::getSetCCSwappedOperands(CC);
37253 }
37255 bool IsSEXT0 =
37256 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
37257 (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
37258 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
37260 if (IsSEXT0 && IsVZero1) {
37261 assert(VT == LHS.getOperand(0).getValueType() &&
37262 "Uexpected operand type");
37263 if (CC == ISD::SETGT)
37264 return DAG.getConstant(0, DL, VT);
37265 if (CC == ISD::SETLE)
37266 return DAG.getConstant(1, DL, VT);
37267 if (CC == ISD::SETEQ || CC == ISD::SETGE)
37268 return DAG.getNOT(DL, LHS.getOperand(0), VT);
37270 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
37271 "Unexpected condition code!");
37272 return LHS.getOperand(0);
37276 // If we have AVX512, but not BWI and this is a vXi16/vXi8 setcc, just
37277 // pre-promote its result type since vXi1 vectors don't get promoted
37278 // during type legalization.
37279 // NOTE: The element count check is to ignore operand types that need to
37280 // go through type promotion to a 128-bit vector.
37281 if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() &&
37282 VT.getVectorElementType() == MVT::i1 && VT.getVectorNumElements() > 4 &&
37283 (OpVT.getVectorElementType() == MVT::i8 ||
37284 OpVT.getVectorElementType() == MVT::i16)) {
37285 SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
37286 N->getOperand(2));
37287 return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
37288 }
37290 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
37291 // to avoid scalarization via legalization because v4i32 is not a legal type.
37292 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
37293 LHS.getValueType() == MVT::v4f32)
37294 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
37296 return SDValue();
37297 }
37299 static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
37300 TargetLowering::DAGCombinerInfo &DCI) {
37301 SDValue Src = N->getOperand(0);
37302 MVT SrcVT = Src.getSimpleValueType();
37304 // Perform constant folding.
37305 if (ISD::isBuildVectorOfConstantSDNodes(Src.getNode())) {
37306 assert(N->getValueType(0) == MVT::i32 && "Unexpected result type");
37307 APInt Imm(32, 0);
37308 for (unsigned Idx = 0, e = Src.getNumOperands(); Idx < e; ++Idx) {
37309 SDValue In = Src.getOperand(Idx);
37310 if (!In.isUndef() &&
37311 cast<ConstantSDNode>(In)->getAPIntValue().isNegative())
37312 Imm.setBit(Idx);
37313 }
37314 return DAG.getConstant(Imm, SDLoc(N), N->getValueType(0));
37315 }
37317 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37318 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
37319 !DCI.isBeforeLegalizeOps());
37321 // MOVMSK only uses the MSB from each vector element.
37322 KnownBits Known;
37323 APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
37324 if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
37325 DCI.AddToWorklist(Src.getNode());
37326 DCI.CommitTargetLoweringOpt(TLO);
37327 return SDValue(N, 0);
37328 }
37330 return SDValue();
37331 }
37333 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
37334 TargetLowering::DAGCombinerInfo &DCI,
37335 const X86Subtarget &Subtarget) {
37336 SDLoc DL(N);
37338 if (DCI.isBeforeLegalizeOps()) {
37339 SDValue Index = N->getOperand(4);
37340 // Remove any sign extends from 32 or smaller to larger than 32.
37341 // Only do this before LegalizeOps in case we need the sign extend for
37343 if (Index.getOpcode() == ISD::SIGN_EXTEND) {
37344 if (Index.getScalarValueSizeInBits() > 32 &&
37345 Index.getOperand(0).getScalarValueSizeInBits() <= 32) {
37346 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
37347 NewOps[4] = Index.getOperand(0);
37348 DAG.UpdateNodeOperands(N, NewOps);
37349 // The original sign extend has less users, add back to worklist in case
37350 // it needs to be removed
37351 DCI.AddToWorklist(Index.getNode());
37352 DCI.AddToWorklist(N);
37353 return SDValue(N, 0);
37357 // Make sure the index is either i32 or i64
37358 unsigned ScalarSize = Index.getScalarValueSizeInBits();
37359 if (ScalarSize != 32 && ScalarSize != 64) {
37360 MVT EltVT = ScalarSize > 32 ? MVT::i64 : MVT::i32;
37361 EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
37362 Index.getValueType().getVectorNumElements());
37363 Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
37364 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
37365 NewOps[4] = Index;
37366 DAG.UpdateNodeOperands(N, NewOps);
37367 DCI.AddToWorklist(N);
37368 return SDValue(N, 0);
37371 // Try to remove zero extends from 32->64 if we know the sign bit of
37372 // the input is zero.
37373 if (Index.getOpcode() == ISD::ZERO_EXTEND &&
37374 Index.getScalarValueSizeInBits() == 64 &&
37375 Index.getOperand(0).getScalarValueSizeInBits() == 32) {
37376 if (DAG.SignBitIsZero(Index.getOperand(0))) {
37377 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
37378 NewOps[4] = Index.getOperand(0);
37379 DAG.UpdateNodeOperands(N, NewOps);
37380 // The original zero extend has less users, add back to worklist in case
37381 // it needs to be removed
37382 DCI.AddToWorklist(Index.getNode());
37383 DCI.AddToWorklist(N);
37384 return SDValue(N, 0);
37385 }
37386 }
37387 }
37389 // With AVX2 we only demand the upper bit of the mask.
37390 if (!Subtarget.hasAVX512()) {
37391 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
37392 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
37393 !DCI.isBeforeLegalizeOps());
37394 SDValue Mask = N->getOperand(2);
37395 KnownBits Known;
37396 APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
37397 if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
37398 DCI.AddToWorklist(Mask.getNode());
37399 DCI.CommitTargetLoweringOpt(TLO);
37400 return SDValue(N, 0);
37401 }
37402 }
37404 return SDValue();
37405 }
37407 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
37408 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
37409 const X86Subtarget &Subtarget) {
37410 SDLoc DL(N);
37411 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
37412 SDValue EFLAGS = N->getOperand(1);
37414 // Try to simplify the EFLAGS and condition code operands.
37415 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
37416 return getSETCC(CC, Flags, DL, DAG);
37418 return SDValue();
37419 }
37421 /// Optimize branch condition evaluation.
37422 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
37423 const X86Subtarget &Subtarget) {
37424 SDLoc DL(N);
37425 SDValue EFLAGS = N->getOperand(3);
37426 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
37428 // Try to simplify the EFLAGS and condition code operands.
37429 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
37430 // RAUW them under us.
37431 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
37432 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
37433 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
37434 N->getOperand(1), Cond, Flags);
37435 }
37437 return SDValue();
37438 }
37440 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
37441 SelectionDAG &DAG) {
37442 // Take advantage of vector comparisons producing 0 or -1 in each lane to
37443 // optimize away operation when it's from a constant.
37445 // The general transformation is:
37446 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
37447 // AND(VECTOR_CMP(x,y), constant2)
37448 // constant2 = UNARYOP(constant)
37450 // Early exit if this isn't a vector operation, the operand of the
37451 // unary operation isn't a bitwise AND, or if the sizes of the operations
37452 // aren't the same.
37453 EVT VT = N->getValueType(0);
37454 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
37455 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
37456 VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
37457 return SDValue();
37459 // Now check that the other operand of the AND is a constant. We could
37460 // make the transformation for non-constant splats as well, but it's unclear
37461 // that would be a benefit as it would not eliminate any operations, just
37462 // perform one more step in scalar code before moving to the vector unit.
37463 if (BuildVectorSDNode *BV =
37464 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
37465 // Bail out if the vector isn't a constant.
37466 if (!BV->isConstant())
37467 return SDValue();
37469 // Everything checks out. Build up the new and improved node.
37470 SDLoc DL(N);
37471 EVT IntVT = BV->getValueType(0);
37472 // Create a new constant of the appropriate type for the transformed
37473 // DAG.
37474 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
37475 // The AND node needs bitcasts to/from an integer vector type around it.
37476 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
37477 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
37478 N->getOperand(0)->getOperand(0), MaskConst);
37479 SDValue Res = DAG.getBitcast(VT, NewAnd);
37480 return Res;
37481 }
37483 return SDValue();
37484 }
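// A concrete (illustrative) instance of the transform above:
//   (v4f32 sint_to_fp (and (setcc ...), <i32 1,1,1,1>))
// becomes an AND of the comparison mask with the pre-converted constant
// <float 1.0,1.0,1.0,1.0> (suitably bitcast), since each lane of the compare
// is either 0 or -1.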
37486 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
37487 const X86Subtarget &Subtarget) {
37488 SDValue Op0 = N->getOperand(0);
37489 EVT VT = N->getValueType(0);
37490 EVT InVT = Op0.getValueType();
37492 // UINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
37493 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
37494 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
37495 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
37496 SDLoc dl(N);
37497 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37498 InVT.getVectorNumElements());
37499 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
37501 // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
37502 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
37505 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
37506 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
37507 // the optimization here.
37508 if (DAG.SignBitIsZero(Op0))
37509 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
37511 return SDValue();
37512 }
37514 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
37515 const X86Subtarget &Subtarget) {
37516 // First try to optimize away the conversion entirely when it's
37517 // conditionally from a constant. Vectors only.
37518 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
37519 return Res;
37521 // Now move on to more general possibilities.
37522 SDValue Op0 = N->getOperand(0);
37523 EVT VT = N->getValueType(0);
37524 EVT InVT = Op0.getValueType();
37526 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
37527 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
37528 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
37529 if (InVT.isVector() && InVT.getScalarSizeInBits() < 32) {
37530 SDLoc dl(N);
37531 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37532 InVT.getVectorNumElements());
37533 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
37534 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
37537 // Without AVX512DQ we only support i64 to float scalar conversion. For both
37538 // vectors and scalars, see if we know that the upper bits are all the sign
37539 // bit, in which case we can truncate the input to i32 and convert from that.
37540 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
37541 unsigned BitWidth = InVT.getScalarSizeInBits();
37542 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
37543 if (NumSignBits >= (BitWidth - 31)) {
37544 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
37545 if (InVT.isVector())
37546 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
37547 InVT.getVectorNumElements());
37548 SDLoc dl(N);
37549 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
37550 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
37554 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
37555 // a 32-bit target where SSE doesn't support i64->FP operations.
37556 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
37557 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
37558 EVT LdVT = Ld->getValueType(0);
37560 // This transformation is not supported if the result type is f16 or f128.
37561 if (VT == MVT::f16 || VT == MVT::f128)
37562 return SDValue();
37564 if (!Ld->isVolatile() && !VT.isVector() &&
37565 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
37566 !Subtarget.is64Bit() && LdVT == MVT::i64) {
37567 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
37568 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
37569 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
37570 return FILDChain;
37571 }
37572 }
37573 return SDValue();
37574 }
37576 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
37577 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
37578 MVT VT = N->getSimpleValueType(0);
37579 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37580 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
37581 N->getOperand(0), N->getOperand(1),
37582 Flags);
37583 }
37585 return SDValue();
37586 }
37588 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
37589 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
37590 TargetLowering::DAGCombinerInfo &DCI) {
37591 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
37592 // the result is either zero or one (depending on the input carry bit).
37593 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
37594 if (X86::isZeroNode(N->getOperand(0)) &&
37595 X86::isZeroNode(N->getOperand(1)) &&
37596 // We don't have a good way to replace an EFLAGS use, so only do this when
37598 SDValue(N, 1).use_empty()) {
37599 SDLoc DL(N);
37600 EVT VT = N->getValueType(0);
37601 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
37602 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
37603 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37604 DAG.getConstant(X86::COND_B, DL,
37605 MVT::i8),
37606 N->getOperand(2)),
37607 DAG.getConstant(1, DL, VT));
37608 return DCI.CombineTo(N, Res1, CarryOut);
37611 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
37612 MVT VT = N->getSimpleValueType(0);
37613 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37614 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
37615 N->getOperand(0), N->getOperand(1),
37616 Flags);
37617 }
37619 return SDValue();
37620 }
37622 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
37623 /// which is more useful than 0/1 in some cases.
37624 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
37625 SDLoc DL(N);
37626 // "Condition code B" is also known as "the carry flag" (CF).
37627 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
37628 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
37629 MVT VT = N->getSimpleValueType(0);
37630 if (VT == MVT::i8)
37631 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
37633 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
37634 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
37635 }
37637 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
37638 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
37639 /// with CMP+{ADC, SBB}.
37640 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
37641 bool IsSub = N->getOpcode() == ISD::SUB;
37642 SDValue X = N->getOperand(0);
37643 SDValue Y = N->getOperand(1);
37645 // If this is an add, canonicalize a zext operand to the RHS.
37646 // TODO: Incomplete? What if both sides are zexts?
37647 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
37648 Y.getOpcode() != ISD::ZERO_EXTEND)
37649 std::swap(X, Y);
37651 // Look through a one-use zext.
37652 bool PeekedThroughZext = false;
37653 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
37654 Y = Y.getOperand(0);
37655 PeekedThroughZext = true;
37658 // If this is an add, canonicalize a setcc operand to the RHS.
37659 // TODO: Incomplete? What if both sides are setcc?
37660 // TODO: Should we allow peeking through a zext of the other operand?
37661 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
37662 Y.getOpcode() != X86ISD::SETCC)
37663 std::swap(X, Y);
37665 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
37666 return SDValue();
37668 SDLoc DL(N);
37669 EVT VT = N->getValueType(0);
37670 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
37672 // If X is -1 or 0, then we have an opportunity to avoid constants required in
37673 // the general case below.
37674 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
37675 if (ConstantX) {
37676 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
37677 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
37678 // This is a complicated way to get -1 or 0 from the carry flag:
37679 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
37680 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
37681 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37682 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37683 Y.getOperand(1));
37684 }
37686 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
37687 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
37688 SDValue EFLAGS = Y->getOperand(1);
37689 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37690 EFLAGS.getValueType().isInteger() &&
37691 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37692 // Swap the operands of a SUB, and we have the same pattern as above.
37693 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
37694 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
37695 SDValue NewSub = DAG.getNode(
37696 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
37697 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37698 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37699 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37700 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37701 NewEFLAGS);
37702 }
37703 }
37704 }
37706 if (CC == X86::COND_B) {
37707 // X + SETB Z --> X + (mask SBB Z, Z)
37708 // X - SETB Z --> X - (mask SBB Z, Z)
37709 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
37710 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
37711 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37712 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37713 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37716 if (CC == X86::COND_A) {
37717 SDValue EFLAGS = Y->getOperand(1);
37718 // Try to convert COND_A into COND_B in an attempt to facilitate
37719 // materializing "setb reg".
37721 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
37722 // cannot take an immediate as its first operand.
37724 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
37725 EFLAGS.getValueType().isInteger() &&
37726 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
37727 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
37728 EFLAGS.getNode()->getVTList(),
37729 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
37730 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
37731 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
37732 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
37733 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
37734 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
37735 }
37736 }
37738 if (CC != X86::COND_E && CC != X86::COND_NE)
37739 return SDValue();
37741 SDValue Cmp = Y.getOperand(1);
37742 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
37743 !X86::isZeroNode(Cmp.getOperand(1)) ||
37744 !Cmp.getOperand(0).getValueType().isInteger())
37745 return SDValue();
37747 SDValue Z = Cmp.getOperand(0);
37748 EVT ZVT = Z.getValueType();
37750 // If X is -1 or 0, then we have an opportunity to avoid constants required in
37751 // the general case below.
37752 if (ConstantX) {
37753 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
37754 // fake operands:
37755 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
37756 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
37757 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
37758 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
37759 SDValue Zero = DAG.getConstant(0, DL, ZVT);
37760 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
37761 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
37762 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37763 DAG.getConstant(X86::COND_B, DL, MVT::i8),
37764 SDValue(Neg.getNode(), 1));
37767 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
37768 // with fake operands:
37769 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
37770 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
37771 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
37772 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
37773 SDValue One = DAG.getConstant(1, DL, ZVT);
37774 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37775 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
37776 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
37780 // (cmp Z, 1) sets the carry flag if Z is 0.
37781 SDValue One = DAG.getConstant(1, DL, ZVT);
37782 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
37784 // Add the flags type for ADC/SBB nodes.
37785 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
37787 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
37788 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
37789 if (CC == X86::COND_NE)
37790 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
37791 DAG.getConstant(-1ULL, DL, VT), Cmp1);
37793 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
37794 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
37795 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
37796 DAG.getConstant(0, DL, VT), Cmp1);
37797 }
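// Illustrative effect of the combine above at the instruction level (a rough
// sketch; exact register use depends on the surrounding code): for
//   %c = icmp ne i32 %z, 0
//   %r = add i32 %x, (zext %c)
// instead of test/setne/movzx/add we can emit roughly
//   cmpl $1, %z
//   sbbl $-1, %x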
37799 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
37800 const X86Subtarget &Subtarget) {
37801 if (!Subtarget.hasSSE2())
37802 return SDValue();
37804 SDValue MulOp = N->getOperand(0);
37805 SDValue Phi = N->getOperand(1);
37807 if (MulOp.getOpcode() != ISD::MUL)
37808 std::swap(MulOp, Phi);
37809 if (MulOp.getOpcode() != ISD::MUL)
37810 return SDValue();
37812 ShrinkMode Mode;
37813 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
37814 return SDValue();
37816 EVT VT = N->getValueType(0);
37818 unsigned RegSize = 128;
37819 if (Subtarget.useBWIRegs())
37820 RegSize = 512;
37821 else if (Subtarget.hasAVX())
37822 RegSize = 256;
37823 unsigned VectorSize = VT.getVectorNumElements() * 16;
37824 // If the vector size is less than 128, or greater than the supported RegSize,
37825 // do not use PMADD.
37826 if (VectorSize < 128 || VectorSize > RegSize)
37827 return SDValue();
37829 SDLoc DL(N);
37830 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
37831 VT.getVectorNumElements());
37832 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
37833 VT.getVectorNumElements() / 2);
37835 // Shrink the operands of mul.
37836 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
37837 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
37839 // Madd vector size is half of the original vector size
37840 auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
37841 ArrayRef<SDValue> Ops) {
37842 MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
37843 return DAG.getNode(X86ISD::VPMADDWD, DL, VT, Ops);
37844 };
37845 SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
37846 PMADDWDBuilder);
37847 // Fill the rest of the output with 0
37848 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
37849 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
37850 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
37851 }
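// Illustrative sketch of the reduction rewrite above: for a vector-reduction
//   (add (v8i32 mul (sext v8i16 A), (sext v8i16 B)), Phi)
// the multiply operands are truncated back to v8i16 and the whole thing
// becomes roughly
//   (add (concat (X86ISD::VPMADDWD A16, B16), zero), Phi)
// so adjacent 16-bit products are summed by a single PMADDWD per 128 bits.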
37853 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
37854 const X86Subtarget &Subtarget) {
37855 if (!Subtarget.hasSSE2())
37856 return SDValue();
37858 SDLoc DL(N);
37859 EVT VT = N->getValueType(0);
37860 SDValue Op0 = N->getOperand(0);
37861 SDValue Op1 = N->getOperand(1);
37863 // TODO: There's nothing special about i32, any integer type above i16 should
37864 // work just as well.
37865 if (!VT.isVector() || !VT.isSimple() ||
37866 !(VT.getVectorElementType() == MVT::i32))
37867 return SDValue();
37869 unsigned RegSize = 128;
37870 if (Subtarget.useBWIRegs())
37871 RegSize = 512;
37872 else if (Subtarget.hasAVX())
37873 RegSize = 256;
37875 // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
37876 // TODO: We should be able to handle larger vectors by splitting them before
37877 // feeding them into several SADs, and then reducing over those.
37878 if (VT.getSizeInBits() / 4 > RegSize)
37879 return SDValue();
37881 // We know N is a reduction add, which means one of its operands is a phi.
37882 // To match SAD, we need the other operand to be a vector select.
37883 SDValue SelectOp, Phi;
37884 if (Op0.getOpcode() == ISD::VSELECT) {
37885 SelectOp = Op0;
37886 Phi = Op1;
37887 } else if (Op1.getOpcode() == ISD::VSELECT) {
37888 SelectOp = Op1;
37889 Phi = Op0;
37890 } else
37891 return SDValue();
37893 // Check whether we have an abs-diff pattern feeding into the select.
37894 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
37895 return SDValue();
37897 // SAD pattern detected. Now build a SAD instruction and an addition for
37898 // reduction. Note that the number of elements of the result of SAD is less
37899 // than the number of elements of its input. Therefore, we could only update
37900 // part of elements in the reduction vector.
37901 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
37903 // The output of PSADBW is a vector of i64.
37904 // We need to turn the vector of i64 into a vector of i32.
37905 // If the reduction vector is at least as wide as the psadbw result, just
37906 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
37908 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
37909 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
37910 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
37911 else
37912 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
37914 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
37915 // Fill the upper elements with zero to match the add width.
37916 SDValue Zero = DAG.getConstant(0, DL, VT);
37917 Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
37918 DAG.getIntPtrConstant(0, DL));
37919 }
37921 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
37922 }
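// Illustrative sketch of the SAD rewrite above: a reduction add whose other
// operand is a vselect computing |zext(a) - zext(b)| per byte lane is rebuilt
// roughly as
//   (add (v4i32 bitcast/truncate (X86ISD::PSADBW a, b)), Phi)
// padded with zeros when the reduction vector is wider than the PSADBW result.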
37924 /// Convert vector increment or decrement to sub/add with an all-ones constant:
37925 /// add X, <1, 1...> --> sub X, <-1, -1...>
37926 /// sub X, <1, 1...> --> add X, <-1, -1...>
37927 /// The all-ones vector constant can be materialized using a pcmpeq instruction
37928 /// that is commonly recognized as an idiom (has no register dependency), so
37929 /// that's better/smaller than loading a splat 1 constant.
37930 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
37931 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
37932 "Unexpected opcode for increment/decrement transform");
37934 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
37935 // out and wait for legalization if we have an unsupported vector length.
37936 EVT VT = N->getValueType(0);
37937 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
37938 return SDValue();
37940 SDNode *N1 = N->getOperand(1).getNode();
37941 APInt SplatVal;
37942 if (!ISD::isConstantSplatVector(N1, SplatVal) ||
37943 !SplatVal.isOneValue())
37944 return SDValue();
37946 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
37947 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
37948 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
37949 }
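// Illustrative example of the combine above:
//   (add v4i32 X, <1,1,1,1>)  -->  (sub v4i32 X, <-1,-1,-1,-1>)
// because the all-ones vector can be materialized with a PCMPEQD idiom
// instead of loading a splat-of-1 constant from memory.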
37951 static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
37952 const SDLoc &DL, EVT VT,
37953 const X86Subtarget &Subtarget) {
37954 // Example of pattern we try to detect:
37955 // t := (v8i32 mul (sext (v8i16 x0), (sext (v8i16 x1))))
37956 //(add (build_vector (extract_elt t, 0),
37957 // (extract_elt t, 2),
37958 // (extract_elt t, 4),
37959 // (extract_elt t, 6)),
37960 // (build_vector (extract_elt t, 1),
37961 // (extract_elt t, 3),
37962 // (extract_elt t, 5),
37963 // (extract_elt t, 7)))
37965 if (!Subtarget.hasSSE2())
37966 return SDValue();
37968 if (Op0.getOpcode() != ISD::BUILD_VECTOR ||
37969 Op1.getOpcode() != ISD::BUILD_VECTOR)
37970 return SDValue();
37972 if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 ||
37973 VT.getVectorNumElements() < 4 ||
37974 !isPowerOf2_32(VT.getVectorNumElements()))
37975 return SDValue();
37977 // Check if one of Op0,Op1 is of the form:
37978 // (build_vector (extract_elt Mul, 0),
37979 // (extract_elt Mul, 2),
37980 // (extract_elt Mul, 4),
37982 // the other is of the form:
37983 // (build_vector (extract_elt Mul, 1),
37984 // (extract_elt Mul, 3),
37985 // (extract_elt Mul, 5),
37986 // (extract_elt Mul, 7))
37987 // and identify Mul.
37988 SDValue Mul;
37989 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) {
37990 SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i),
37991 Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1);
37992 // TODO: Be more tolerant to undefs.
37993 if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37994 Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37995 Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
37996 Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
37997 return SDValue();
37998 auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1));
37999 auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1));
38000 auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1));
38001 auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1));
38002 if (!Const0L || !Const1L || !Const0H || !Const1H)
38003 return SDValue();
38004 unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(),
38005 Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue();
38006 // Commutativity of mul allows factors of a product to reorder.
38007 if (Idx0L > Idx1L)
38008 std::swap(Idx0L, Idx1L);
38009 if (Idx0H > Idx1H)
38010 std::swap(Idx0H, Idx1H);
38011 // Commutativity of add allows pairs of factors to reorder.
38012 if (Idx0L > Idx0H) {
38013 std::swap(Idx0L, Idx0H);
38014 std::swap(Idx1L, Idx1H);
38016 if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 ||
38017 Idx1H != 2 * i + 3)
38018 return SDValue();
38019 if (!Mul) {
38020 // First time an extract_elt's source vector is visited. Must be a MUL
38021 // with 2X number of vector elements than the BUILD_VECTOR.
38022 // Both extracts must be from same MUL.
38023 Mul = Op0L->getOperand(0);
38024 if (Mul->getOpcode() != ISD::MUL ||
38025 Mul.getValueType().getVectorNumElements() != 2 * e)
38026 return SDValue();
38027 }
38028 // Check that the extract is from the same MUL previously seen.
38029 if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) ||
38030 Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0))
38031 return SDValue();
38032 }
38034 // Check if the Mul source can be safely shrunk.
38035 ShrinkMode Mode;
38036 if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
38037 return SDValue();
38039 auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38040 ArrayRef<SDValue> Ops) {
38041 // Shrink by adding truncate nodes and let DAGCombine fold with the
38043 EVT InVT = Ops[0].getValueType();
38044 assert(InVT.getScalarType() == MVT::i32 &&
38045 "Unexpected scalar element type");
38046 assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
38047 EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
38048 InVT.getVectorNumElements() / 2);
38049 EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
38050 InVT.getVectorNumElements());
38051 return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
38052 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
38053 DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
38055 return SplitOpsAndApply(DAG, Subtarget, DL, VT,
38056 { Mul.getOperand(0), Mul.getOperand(1) },
38057 PMADDBuilder);
38058 }
38060 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
38061 const X86Subtarget &Subtarget) {
38062 const SDNodeFlags Flags = N->getFlags();
38063 if (Flags.hasVectorReduction()) {
38064 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
38065 return Sad;
38066 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
38067 return MAdd;
38068 }
38069 EVT VT = N->getValueType(0);
38070 SDValue Op0 = N->getOperand(0);
38071 SDValue Op1 = N->getOperand(1);
38073 if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, SDLoc(N), VT, Subtarget))
38074 return MAdd;
38076 // Try to synthesize horizontal adds from adds of shuffles.
38077 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38078 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38079 isHorizontalBinOp(Op0, Op1, true))
38080 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
38082 if (SDValue V = combineIncDecVector(N, DAG))
38083 return V;
38085 return combineAddOrSubToADCOrSBB(N, DAG);
38086 }
38088 static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
38089 const X86Subtarget &Subtarget) {
38090 SDValue Op0 = N->getOperand(0);
38091 SDValue Op1 = N->getOperand(1);
38092 EVT VT = N->getValueType(0);
38094 // PSUBUS is supported, starting from SSE2, but truncation for v8i32
38095 // is only worth it with SSSE3 (PSHUFB).
38096 if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
38097 !(Subtarget.hasSSSE3() && (VT == MVT::v8i32 || VT == MVT::v8i64)) &&
38098 !(Subtarget.hasAVX() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
38099 !(Subtarget.useBWIRegs() && (VT == MVT::v64i8 || VT == MVT::v32i16 ||
38100 VT == MVT::v16i32 || VT == MVT::v8i64)))
38101 return SDValue();
38103 SDValue SubusLHS, SubusRHS;
38104 // Try to find umax(a,b) - b or a - umin(a,b) patterns
38105 // they may be converted to subus(a,b).
38106 // TODO: Need to add IR canonicalization for this code.
38107 if (Op0.getOpcode() == ISD::UMAX) {
38108 SubusRHS = Op1;
38109 SDValue MaxLHS = Op0.getOperand(0);
38110 SDValue MaxRHS = Op0.getOperand(1);
38111 if (MaxLHS == Op1)
38112 SubusLHS = MaxRHS;
38113 else if (MaxRHS == Op1)
38114 SubusLHS = MaxLHS;
38115 else
38116 return SDValue();
38117 } else if (Op1.getOpcode() == ISD::UMIN) {
38118 SubusLHS = Op0;
38119 SDValue MinLHS = Op1.getOperand(0);
38120 SDValue MinRHS = Op1.getOperand(1);
38121 if (MinLHS == Op0)
38122 SubusRHS = MinRHS;
38123 else if (MinRHS == Op0)
38124 SubusRHS = MinLHS;
38125 else
38126 return SDValue();
38127 } else
38128 return SDValue();
38130 auto SUBUSBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
38131 ArrayRef<SDValue> Ops) {
38132 return DAG.getNode(X86ISD::SUBUS, DL, Ops[0].getValueType(), Ops);
38133 };
38135 // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
38136 // special preprocessing in some cases.
38137 if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
38138 return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT,
38139 { SubusLHS, SubusRHS }, SUBUSBuilder);
38141 // The special preprocessing can only be applied if the value was
38142 // zero-extended from 16 bits, so we require the first 16 bits to be zero
38143 // for 32-bit values, or the first 48 bits for 64-bit values.
38145 KnownBits Known;
38146 DAG.computeKnownBits(SubusLHS, Known);
38147 unsigned NumZeros = Known.countMinLeadingZeros();
38148 if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
38149 return SDValue();
38151 EVT ExtType = SubusLHS.getValueType();
38152 EVT ShrinkedType;
38153 if (VT == MVT::v8i32 || VT == MVT::v8i64)
38154 ShrinkedType = MVT::v8i16;
38155 else
38156 ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
38158 // If SubusLHS is zeroextended - truncate SubusRHS to it's
38159 // size SubusRHS = umin(0xFFF.., SubusRHS).
38160 SDValue SaturationConst =
38161 DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
38162 ShrinkedType.getScalarSizeInBits()),
38163 SDLoc(SubusLHS), ExtType);
38164 SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
38165 SaturationConst);
38166 SDValue NewSubusLHS =
38167 DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
38168 SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
38169 SDValue Psubus =
38170 SplitOpsAndApply(DAG, Subtarget, SDLoc(N), ShrinkedType,
38171 { NewSubusLHS, NewSubusRHS }, SUBUSBuilder);
38172 // Zero extend the result, it may be used somewhere as 32 bit,
38173 // if not zext and following trunc will shrink.
38174 return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
38175 }
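// Illustrative example of the combine above:
//   (sub (umax A, B), B)  -->  (X86ISD::SUBUS A, B)   ; likewise A - umin(A,B)
// e.g. for v8i16 this selects PSUBUSW instead of a PMAXUW plus a PSUB.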
38177 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
38178 const X86Subtarget &Subtarget) {
38179 SDValue Op0 = N->getOperand(0);
38180 SDValue Op1 = N->getOperand(1);
38182 // X86 can't encode an immediate LHS of a sub. See if we can push the
38183 // negation into a preceding instruction.
38184 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
38185 // If the RHS of the sub is a XOR with one use and a constant, invert the
38186 // immediate. Then add one to the LHS of the sub so we can turn
38187 // X-Y -> X+~Y+1, saving one register.
38188 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
38189 isa<ConstantSDNode>(Op1.getOperand(1))) {
38190 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
38191 EVT VT = Op0.getValueType();
38192 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
38193 Op1.getOperand(0),
38194 DAG.getConstant(~XorC, SDLoc(Op1), VT));
38195 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
38196 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
38200 // Try to synthesize horizontal subs from subs of shuffles.
38201 EVT VT = N->getValueType(0);
38202 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
38203 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
38204 isHorizontalBinOp(Op0, Op1, false))
38205 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
38207 if (SDValue V = combineIncDecVector(N, DAG))
38208 return V;
38210 // Try to create PSUBUS if SUB's argument is max/min
38211 if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
38212 return V;
38214 return combineAddOrSubToADCOrSBB(N, DAG);
38215 }
38217 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
38218 TargetLowering::DAGCombinerInfo &DCI,
38219 const X86Subtarget &Subtarget) {
38220 if (DCI.isBeforeLegalize())
38221 return SDValue();
38223 SDLoc DL(N);
38224 unsigned Opcode = N->getOpcode();
38225 MVT VT = N->getSimpleValueType(0);
38226 MVT SVT = VT.getVectorElementType();
38227 unsigned NumElts = VT.getVectorNumElements();
38228 unsigned EltSizeInBits = SVT.getSizeInBits();
38230 SDValue Op = N->getOperand(0);
38231 MVT OpVT = Op.getSimpleValueType();
38232 MVT OpEltVT = OpVT.getVectorElementType();
38233 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
38234 unsigned InputBits = OpEltSizeInBits * NumElts;
38236 // Perform any constant folding.
38237 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
38238 APInt UndefElts;
38239 SmallVector<APInt, 64> EltBits;
38240 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
38241 APInt Undefs(NumElts, 0);
38242 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
38243 bool IsZEXT =
38244 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
38245 for (unsigned i = 0; i != NumElts; ++i) {
38246 if (UndefElts[i]) {
38247 Undefs.setBit(i);
38248 continue;
38249 }
38250 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
38251 : EltBits[i].sextOrTrunc(EltSizeInBits);
38252 }
38253 return getConstVector(Vals, Undefs, VT, DAG, DL);
38254 }
38256 // (vzext (bitcast (vzext (x)) -> (vzext x)
38257 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
38258 SDValue V = peekThroughBitcasts(Op);
38259 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
38260 MVT InnerVT = V.getSimpleValueType();
38261 MVT InnerEltVT = InnerVT.getVectorElementType();
38263 // If the element sizes match exactly, we can just do one larger vzext. This
38264 // is always an exact type match as vzext operates on integer types.
38265 if (OpEltVT == InnerEltVT) {
38266 assert(OpVT == InnerVT && "Types must match for vzext!");
38267 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
38270 // The only other way we can combine them is if only a single element of the
38271 // inner vzext is used in the input to the outer vzext.
38272 if (InnerEltVT.getSizeInBits() < InputBits)
38273 return SDValue();
38275 // In this case, the inner vzext is completely dead because we're going to
38276 // only look at bits inside of the low element. Just do the outer vzext on
38277 // a bitcast of the input to the inner.
38278 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
38281 // Check if we can bypass extracting and re-inserting an element of an input
38282 // vector. Essentially:
38283 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
38284 // TODO: Add X86ISD::VSEXT support
38285 if (Opcode == X86ISD::VZEXT &&
38286 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
38287 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
38288 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
38289 SDValue ExtractedV = V.getOperand(0);
38290 SDValue OrigV = ExtractedV.getOperand(0);
38291 if (isNullConstant(ExtractedV.getOperand(1))) {
38292 MVT OrigVT = OrigV.getSimpleValueType();
38293 // Extract a subvector if necessary...
38294 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
38295 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
38296 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
38297 OrigVT.getVectorNumElements() / Ratio);
38298 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
38299 DAG.getIntPtrConstant(0, DL));
38301 Op = DAG.getBitcast(OpVT, OrigV);
38302 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
38303 }
38304 }
38306 return SDValue();
38307 }
38309 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
38310 const X86Subtarget &Subtarget) {
38311 MVT VT = N->getSimpleValueType(0);
38312 SDLoc DL(N);
38314 if (N->getOperand(0) == N->getOperand(1)) {
38315 if (N->getOpcode() == X86ISD::PCMPEQ)
38316 return getOnesVector(VT, DAG, DL);
38317 if (N->getOpcode() == X86ISD::PCMPGT)
38318 return getZeroVector(VT, Subtarget, DAG, DL);
38319 }
38321 return SDValue();
38322 }
38324 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
38325 TargetLowering::DAGCombinerInfo &DCI,
38326 const X86Subtarget &Subtarget) {
38327 if (DCI.isBeforeLegalizeOps())
38328 return SDValue();
38330 MVT OpVT = N->getSimpleValueType(0);
38331 SDLoc dl(N);
38332 bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1;
38335 SDValue Vec = N->getOperand(0);
38336 SDValue SubVec = N->getOperand(1);
38338 unsigned IdxVal = N->getConstantOperandVal(2);
38339 MVT SubVecVT = SubVec.getSimpleValueType();
38341 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
38342 // Inserting zeros into zeros is a nop.
38343 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
38344 return getZeroVector(OpVT, Subtarget, DAG, dl);
38346 // If we're inserting into a zero vector and then into a larger zero vector,
38347 // just insert into the larger zero vector directly.
38348 if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
38349 ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
38350 unsigned Idx2Val = SubVec.getConstantOperandVal(2);
38351 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
38352 getZeroVector(OpVT, Subtarget, DAG, dl),
38353 SubVec.getOperand(1),
38354 DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
38357 // If we're inserting into a zero vector and our input was extracted from an
38358 // insert into a zero vector of the same type, and the extraction was at
38359 // least as large as the original insertion, just insert the original
38360 // subvector into a zero vector.
38361 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 &&
38362 SubVec.getConstantOperandVal(1) == 0 &&
38363 SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) {
38364 SDValue Ins = SubVec.getOperand(0);
38365 if (Ins.getConstantOperandVal(2) == 0 &&
38366 ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) &&
38367 Ins.getOperand(1).getValueSizeInBits() <= SubVecVT.getSizeInBits())
38368 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
38369 getZeroVector(OpVT, Subtarget, DAG, dl),
38370 Ins.getOperand(1), N->getOperand(2));
38373 // If we're inserting a bitcast into zeros, rewrite the insert and move the
38374 // bitcast to the other side. This helps isel detect zero-extension patterns.
38376 // TODO: Is this useful for other indices than 0?
38377 if (!IsI1Vector && SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
38378 MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
38379 unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
38380 MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
38381 SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
38382 DAG.getBitcast(NewVT, Vec),
38383 SubVec.getOperand(0), N->getOperand(2));
38384 return DAG.getBitcast(OpVT, Insert);
38388 // Stop here if this is an i1 vector.
38392 // If this is an insert of an extract, combine to a shuffle. Don't do this
38393 // if the insert or extract can be represented with a subregister operation.
38394 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
38395 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
38396 (IdxVal != 0 || !Vec.isUndef())) {
38397 int ExtIdxVal = SubVec.getConstantOperandVal(1);
38398 if (ExtIdxVal != 0) {
38399 int VecNumElts = OpVT.getVectorNumElements();
38400 int SubVecNumElts = SubVecVT.getVectorNumElements();
38401 SmallVector<int, 64> Mask(VecNumElts);
38402 // First create an identity shuffle mask.
38403 for (int i = 0; i != VecNumElts; ++i)
  Mask[i] = i;
38405 // Now insert the extracted portion.
38406 for (int i = 0; i != SubVecNumElts; ++i)
38407 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
38409 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
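// For illustration (the concrete types are assumed here, not implied by the
// code above): inserting the upper v4i32 half extracted from Y (ExtIdxVal ==
// 4) into lanes 0-3 of a v8i32 X (IdxVal == 0) yields the two-input shuffle
// mask <12, 13, 14, 15, 4, 5, 6, 7>, where indices 8-15 select from Y.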
38413 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte load:
38415 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
38416 // (load16 addr + 16), Elts/2)
// --> load32 addr
38419 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
38420 // (load32 addr + 32), Elts/2)
// --> load64 addr
38422 // or a 16-byte or 32-byte broadcast:
38423 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
38424 // (load16 addr), Elts/2)
38425 // --> X86SubVBroadcast(load16 addr)
38427 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
38428 // (load32 addr), Elts/2)
38429 // --> X86SubVBroadcast(load32 addr)
38430 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
38431 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
38432 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
38433 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
38434 if (Idx2 && Idx2->getZExtValue() == 0) {
38435 SDValue SubVec2 = Vec.getOperand(1);
38436 // If needed, look through bitcasts to get to the load.
38437 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
38439 unsigned Alignment = FirstLd->getAlignment();
38440 unsigned AS = FirstLd->getAddressSpace();
38441 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
38442 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
38443 OpVT, AS, Alignment, &Fast) && Fast) {
38444 SDValue Ops[] = {SubVec2, SubVec};
38445 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
38450 // If lower/upper loads are the same and the only users of the load, then
38451 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
38452 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
38453 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
38454 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
38455 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
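// For example, two inserts of the same 16-byte load into both halves of a
// 256-bit vector become a single X86ISD::SUBV_BROADCAST, which isel can
// select as vbroadcastf128/vbroadcasti128.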
38457 // If this is a subv_broadcast inserted into both halves, use a larger subv_broadcast.
38459 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
38460 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
38461 SubVec.getOperand(0));
38463 // If we're inserting all zeros into the upper half, change this to
38464 // an insert into an all zeros vector. We will match this to a move
38465 // with implicit upper bit zeroing during isel.
38466 if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
38467 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
38468 getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
38469 Vec.getOperand(2));
38471 // If we are inserting into both halves of the vector, the starting
38472 // vector should be undef. If it isn't, make it so. Only do this if the
38473 // early insert has no other uses.
38474 // TODO: Should this be a generic DAG combine?
38475 if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
38476 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
38477 SubVec2, Vec.getOperand(2));
38478 DCI.AddToWorklist(Vec.getNode());
38479 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
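// Combine EXTRACT_SUBVECTOR nodes: extracting from an all-zeros or all-ones
// vector yields a smaller constant of the same kind, and extracting from a
// BUILD_VECTOR rebuilds a narrower BUILD_VECTOR from the covered operands.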
38489 static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
38490 TargetLowering::DAGCombinerInfo &DCI,
38491 const X86Subtarget &Subtarget) {
38492 if (DCI.isBeforeLegalizeOps())
38495 MVT OpVT = N->getSimpleValueType(0);
38496 SDValue InVec = N->getOperand(0);
38497 unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
38499 if (ISD::isBuildVectorAllZeros(InVec.getNode()))
38500 return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
38502 if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
38503 if (OpVT.getScalarType() == MVT::i1)
38504 return DAG.getConstant(1, SDLoc(N), OpVT);
38505 return getOnesVector(OpVT, DAG, SDLoc(N));
38508 if (InVec.getOpcode() == ISD::BUILD_VECTOR)
38509 return DAG.getBuildVector(
38511 InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
38516 static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
38517 EVT VT = N->getValueType(0);
38518 SDValue Src = N->getOperand(0);
38520 // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and.
38521 // This occurs frequently in our masked scalar intrinsic code and our
38522 // floating point select lowering with AVX512.
38523 // TODO: SimplifyDemandedBits instead?
38524 if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse())
38525 if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1)))
38526 if (C->getAPIntValue().isOneValue())
38527 return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), MVT::v1i1,
38528 Src.getOperand(0));
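// In DAG terms the fold above is:
//   (v1i1 (scalar_to_vector (and X, 1))) --> (v1i1 (scalar_to_vector X))
// which is safe because only bit 0 of the scalar survives in a v1i1 element.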
38533 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
38534 DAGCombinerInfo &DCI) const {
38535 SelectionDAG &DAG = DCI.DAG;
38536 switch (N->getOpcode()) {
38538 case ISD::SCALAR_TO_VECTOR:
38539 return combineScalarToVector(N, DAG);
38540 case ISD::EXTRACT_VECTOR_ELT:
38541 case X86ISD::PEXTRW:
38542 case X86ISD::PEXTRB:
38543 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
38544 case ISD::INSERT_SUBVECTOR:
38545 return combineInsertSubvector(N, DAG, DCI, Subtarget);
38546 case ISD::EXTRACT_SUBVECTOR:
38547 return combineExtractSubvector(N, DAG, DCI, Subtarget);
38550 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
38551 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
38552 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
38553 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
38554 case ISD::SUB: return combineSub(N, DAG, Subtarget);
38555 case X86ISD::SBB: return combineSBB(N, DAG);
38556 case X86ISD::ADC: return combineADC(N, DAG, DCI);
38557 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
38560 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
38561 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
38562 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
38563 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
38564 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
38565 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
38566 case ISD::STORE: return combineStore(N, DAG, Subtarget);
38567 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
38568 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
38569 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
38571 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
38572 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
38573 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
38574 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
38575 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
38576 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
38578 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
38580 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
38582 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
38583 case X86ISD::BT: return combineBT(N, DAG, DCI);
38584 case ISD::ANY_EXTEND:
38585 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
38586 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
38587 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
38588 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
38589 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
38590 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
38591 case X86ISD::PACKSS:
38592 case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
38593 case X86ISD::VSHLI:
38594 case X86ISD::VSRAI:
38595 case X86ISD::VSRLI:
38596 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
38597 case ISD::SIGN_EXTEND_VECTOR_INREG:
38598 case ISD::ZERO_EXTEND_VECTOR_INREG:
38599 case X86ISD::VSEXT:
38600 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
38601 case X86ISD::PINSRB:
38602 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
38603 case X86ISD::SHUFP: // Handle all target specific shuffles
38604 case X86ISD::INSERTPS:
38605 case X86ISD::EXTRQI:
38606 case X86ISD::INSERTQI:
38607 case X86ISD::PALIGNR:
38608 case X86ISD::VSHLDQ:
38609 case X86ISD::VSRLDQ:
38610 case X86ISD::BLENDI:
38611 case X86ISD::UNPCKH:
38612 case X86ISD::UNPCKL:
38613 case X86ISD::MOVHLPS:
38614 case X86ISD::MOVLHPS:
38615 case X86ISD::PSHUFB:
38616 case X86ISD::PSHUFD:
38617 case X86ISD::PSHUFHW:
38618 case X86ISD::PSHUFLW:
38619 case X86ISD::MOVSHDUP:
38620 case X86ISD::MOVSLDUP:
38621 case X86ISD::MOVDDUP:
38622 case X86ISD::MOVSS:
38623 case X86ISD::MOVSD:
38624 case X86ISD::VBROADCAST:
38625 case X86ISD::VPPERM:
38626 case X86ISD::VPERMI:
38627 case X86ISD::VPERMV:
38628 case X86ISD::VPERMV3:
38629 case X86ISD::VPERMIV3:
38630 case X86ISD::VPERMIL2:
38631 case X86ISD::VPERMILPI:
38632 case X86ISD::VPERMILPV:
38633 case X86ISD::VPERM2X128:
38634 case X86ISD::VZEXT_MOVL:
38635 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
38636 case X86ISD::FMADD_RND:
38637 case X86ISD::FMADDS1_RND:
38638 case X86ISD::FMADDS3_RND:
38639 case X86ISD::FMADDS1:
38640 case X86ISD::FMADDS3:
38641 case X86ISD::FMADD4S:
38642 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
38643 case X86ISD::FMADDSUB_RND:
38644 case X86ISD::FMSUBADD_RND:
38645 case X86ISD::FMADDSUB:
38646 case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
38647 case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
38648 case X86ISD::MGATHER:
38649 case X86ISD::MSCATTER:
38651 case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
38652 case X86ISD::PCMPEQ:
38653 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
38659 /// Return true if the target has native support for the specified value type
38660 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
38661 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
38662 /// some i16 instructions are slow.
38663 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
38664 if (!isTypeLegal(VT))
38667 // There are no vXi8 shifts.
38668 if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8)
38671 if (VT != MVT::i16)
38678 case ISD::SIGN_EXTEND:
38679 case ISD::ZERO_EXTEND:
38680 case ISD::ANY_EXTEND:
38693 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
38694 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
38695 /// we don't adjust the stack we clobber the first frame index.
38696 /// See X86InstrInfo::copyPhysReg.
38697 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
38698 const MachineRegisterInfo &MRI = MF.getRegInfo();
38699 return any_of(MRI.reg_instructions(X86::EFLAGS),
38700 [](const MachineInstr &RI) { return RI.isCopy(); });
38703 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
38704 if (hasCopyImplyingStackAdjustment(MF)) {
38705 MachineFrameInfo &MFI = MF.getFrameInfo();
38706 MFI.setHasCopyImplyingStackAdjustment(true);
38709 TargetLoweringBase::finalizeLowering(MF);
38712 /// This method queries the target whether it is beneficial for dag combiner to
38713 /// promote the specified node. If true, it should return the desired promotion
38714 /// type by reference.
38715 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
38716 EVT VT = Op.getValueType();
38717 if (VT != MVT::i16)
38720 bool Promote = false;
38721 bool Commute = false;
38722 switch (Op.getOpcode()) {
38724 case ISD::SIGN_EXTEND:
38725 case ISD::ZERO_EXTEND:
38726 case ISD::ANY_EXTEND:
38731 SDValue N0 = Op.getOperand(0);
38732 // Look out for (store (shl (load), x)).
38733 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
38746 SDValue N0 = Op.getOperand(0);
38747 SDValue N1 = Op.getOperand(1);
38748 if (!Commute && MayFoldLoad(N1))
38750 // Avoid disabling potential load folding opportunities.
38751 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
38753 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
38763 bool X86TargetLowering::
38764 isDesirableToCombineBuildVectorToShuffleTruncate(
38765 ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
38767 assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
38768 "Element count mismatch");
38770 Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
38771 "Shuffle Mask expected to be legal");
38773 // For 32-bit elements VPERMD is better than shuffle+truncate.
38774 // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
38775 if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
38778 if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
38784 //===----------------------------------------------------------------------===//
38785 // X86 Inline Assembly Support
38786 //===----------------------------------------------------------------------===//
38788 // Helper to match a string separated by whitespace.
38789 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
38790 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
38792 for (StringRef Piece : Pieces) {
38793 if (!S.startswith(Piece)) // Check if the piece matches.
38796 S = S.substr(Piece.size());
38797 StringRef::size_type Pos = S.find_first_not_of(" \t");
38798 if (Pos == 0) // We matched a prefix.
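// For illustration: matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while
// matchAsm("bswapper $0", {"bswap", "$0"}) is rejected because "bswap" only
// matches a prefix of the first word.

// Return true if the split-out clobber constraints include "~{cc}",
// "~{flags}" and "~{fpsr}" (plus "~{dirflag}" when a fourth clobber is
// present), i.e. the inline asm already declares that it clobbers EFLAGS.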
38807 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
38809 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
38810 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
38811 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
38812 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
38814 if (AsmPieces.size() == 3)
38816 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
38823 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
38824 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
38826 const std::string &AsmStr = IA->getAsmString();
38828 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
38829 if (!Ty || Ty->getBitWidth() % 16 != 0)
38832 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
38833 SmallVector<StringRef, 4> AsmPieces;
38834 SplitString(AsmStr, AsmPieces, ";\n");
38836 switch (AsmPieces.size()) {
38837 default: return false;
38839 // FIXME: this should verify that we are targeting a 486 or better. If not,
38840 // we will turn this bswap into something that will be lowered to logical
38841 // ops instead of emitting the bswap asm. For now, we don't support 486 or
38842 // lower so don't worry about this.
38844 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
38845 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
38846 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
38847 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
38848 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
38849 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
38850 // No need to check constraints, nothing other than the equivalent of
38851 // "=r,0" would be valid here.
38852 return IntrinsicLowering::LowerToByteSwap(CI);
38855 // rorw $$8, ${0:w} --> llvm.bswap.i16
38856 if (CI->getType()->isIntegerTy(16) &&
38857 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
38858 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
38859 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
38861 StringRef ConstraintsStr = IA->getConstraintString();
38862 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38863 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38864 if (clobbersFlagRegisters(AsmPieces))
38865 return IntrinsicLowering::LowerToByteSwap(CI);
38869 if (CI->getType()->isIntegerTy(32) &&
38870 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
38871 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
38872 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
38873 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
38875 StringRef ConstraintsStr = IA->getConstraintString();
38876 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
38877 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
38878 if (clobbersFlagRegisters(AsmPieces))
38879 return IntrinsicLowering::LowerToByteSwap(CI);
38882 if (CI->getType()->isIntegerTy(64)) {
38883 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
38884 if (Constraints.size() >= 2 &&
38885 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
38886 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
38887 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
38888 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
38889 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
38890 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
38891 return IntrinsicLowering::LowerToByteSwap(CI);
38899 /// Given a constraint letter, return the type of constraint for this target.
38900 X86TargetLowering::ConstraintType
38901 X86TargetLowering::getConstraintType(StringRef Constraint) const {
38902 if (Constraint.size() == 1) {
38903 switch (Constraint[0]) {
38915 case 'k': // AVX512 masking registers.
38916 return C_RegisterClass;
38940 else if (Constraint.size() == 2) {
38941 switch (Constraint[0]) {
38945 switch (Constraint[1]) {
38956 return C_RegisterClass;
38960 return TargetLowering::getConstraintType(Constraint);
38963 /// Examine constraint type and operand type and determine a weight value.
38964 /// This object must already have been set up with the operand type
38965 /// and the current alternative constraint selected.
38966 TargetLowering::ConstraintWeight
38967 X86TargetLowering::getSingleConstraintMatchWeight(
38968 AsmOperandInfo &info, const char *constraint) const {
38969 ConstraintWeight weight = CW_Invalid;
38970 Value *CallOperandVal = info.CallOperandVal;
38971 // If we don't have a value, we can't do a match,
38972 // but allow it at the lowest weight.
38973 if (!CallOperandVal)
38975 Type *type = CallOperandVal->getType();
38976 // Look at the constraint type.
38977 switch (*constraint) {
38979 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
38991 if (CallOperandVal->getType()->isIntegerTy())
38992 weight = CW_SpecificReg;
38997 if (type->isFloatingPointTy())
38998 weight = CW_SpecificReg;
39001 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39002 weight = CW_SpecificReg;
39005 unsigned Size = StringRef(constraint).size();
39006 // Default the next char to 'i', since 'Y' and 'Yi' are synonymous when matching 'Y'.
39007 char NextChar = Size == 2 ? constraint[1] : 'i';
39010 switch (NextChar) {
39016 if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
39017 return CW_SpecificReg;
39019 // Conditional OpMask regs (AVX512)
39021 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39022 return CW_Register;
39026 if (type->isX86_MMXTy() && Subtarget.hasMMX())
39029 // Any SSE reg when ISA >= SSE2, same as 'Y'
39033 if (!Subtarget.hasSSE2())
39037 // Fall through (handle "Y" constraint).
39041 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
39042 weight = CW_Register;
39045 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
39046 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()))
39047 weight = CW_Register;
39050 // Enable conditional vector operations using %k<#> registers.
39051 if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
39052 weight = CW_Register;
39055 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
39056 if (C->getZExtValue() <= 31)
39057 weight = CW_Constant;
39061 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39062 if (C->getZExtValue() <= 63)
39063 weight = CW_Constant;
39067 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39068 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
39069 weight = CW_Constant;
39073 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39074 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
39075 weight = CW_Constant;
39079 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39080 if (C->getZExtValue() <= 3)
39081 weight = CW_Constant;
39085 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39086 if (C->getZExtValue() <= 0xff)
39087 weight = CW_Constant;
39092 if (isa<ConstantFP>(CallOperandVal)) {
39093 weight = CW_Constant;
39097 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39098 if ((C->getSExtValue() >= -0x80000000LL) &&
39099 (C->getSExtValue() <= 0x7fffffffLL))
39100 weight = CW_Constant;
39104 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
39105 if (C->getZExtValue() <= 0xffffffff)
39106 weight = CW_Constant;
39113 /// Try to replace an X constraint, which matches anything, with another that
39114 /// has more specific requirements based on the type of the corresponding operand.
39116 const char *X86TargetLowering::
39117 LowerXConstraint(EVT ConstraintVT) const {
39118 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
39119 // 'f' like normal targets.
39120 if (ConstraintVT.isFloatingPoint()) {
39121 if (Subtarget.hasSSE2())
39123 if (Subtarget.hasSSE1())
39127 return TargetLowering::LowerXConstraint(ConstraintVT);
39130 /// Lower the specified operand into the Ops vector.
39131 /// If it is invalid, don't add anything to Ops.
39132 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
39133 std::string &Constraint,
39134 std::vector<SDValue>&Ops,
39135 SelectionDAG &DAG) const {
39138 // Only support length 1 constraints for now.
39139 if (Constraint.length() > 1) return;
39141 char ConstraintLetter = Constraint[0];
39142 switch (ConstraintLetter) {
39145 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39146 if (C->getZExtValue() <= 31) {
39147 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39148 Op.getValueType());
39154 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39155 if (C->getZExtValue() <= 63) {
39156 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39157 Op.getValueType());
39163 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39164 if (isInt<8>(C->getSExtValue())) {
39165 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39166 Op.getValueType());
39172 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39173 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
39174 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
39175 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
39176 Op.getValueType());
39182 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39183 if (C->getZExtValue() <= 3) {
39184 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39185 Op.getValueType());
39191 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39192 if (C->getZExtValue() <= 255) {
39193 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39194 Op.getValueType());
39200 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39201 if (C->getZExtValue() <= 127) {
39202 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39203 Op.getValueType());
39209 // 32-bit signed value
39210 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39211 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
39212 C->getSExtValue())) {
39213 // Widen to 64 bits here to get it sign extended.
39214 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
39217 // FIXME gcc accepts some relocatable values here too, but only in certain
39218 // memory models; it's complicated.
39223 // 32-bit unsigned value
39224 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
39225 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
39226 C->getZExtValue())) {
39227 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
39228 Op.getValueType());
39232 // FIXME gcc accepts some relocatable values here too, but only in certain
39233 // memory models; it's complicated.
39237 // Literal immediates are always ok.
39238 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
39239 // Widen to 64 bits here to get it sign extended.
39240 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
39244 // In any sort of PIC mode addresses need to be computed at runtime by
39245 // adding in a register or some sort of table lookup. These can't
39246 // be used as immediates.
39247 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
39250 // If we are in non-pic codegen mode, we allow the address of a global (with
39251 // an optional displacement) to be used with 'i'.
39252 GlobalAddressSDNode *GA = nullptr;
39253 int64_t Offset = 0;
39255 // Match either (GA), (GA+C), (GA+C1+C2), etc.
39257 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
39258 Offset += GA->getOffset();
39260 } else if (Op.getOpcode() == ISD::ADD) {
39261 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
39262 Offset += C->getZExtValue();
39263 Op = Op.getOperand(0);
39266 } else if (Op.getOpcode() == ISD::SUB) {
39267 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
39268 Offset += -C->getZExtValue();
39269 Op = Op.getOperand(0);
39274 // Otherwise, this isn't something we can handle, reject it.
39278 const GlobalValue *GV = GA->getGlobal();
39279 // If we require an extra load to get this address, as in PIC mode, we
39280 // can't accept it.
39281 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
39284 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
39285 GA->getValueType(0), Offset);
39290 if (Result.getNode()) {
39291 Ops.push_back(Result);
39294 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
39297 /// Check if \p RC is a general purpose register class.
39298 /// I.e., GR* or one of their variants.
39299 static bool isGRClass(const TargetRegisterClass &RC) {
39300 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
39301 RC.hasSuperClassEq(&X86::GR16RegClass) ||
39302 RC.hasSuperClassEq(&X86::GR32RegClass) ||
39303 RC.hasSuperClassEq(&X86::GR64RegClass) ||
39304 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
39307 /// Check if \p RC is a vector register class.
39308 /// I.e., FR* / VR* or one of their variants.
39309 static bool isFRClass(const TargetRegisterClass &RC) {
39310 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
39311 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
39312 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
39313 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
39314 RC.hasSuperClassEq(&X86::VR512RegClass);
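// Map an inline-asm register constraint to a (register, register class)
// pair. Single-letter GCC constraints (e.g. 'r', 'q', 'x', 'k') and the
// two-letter 'Y*' variants are handled explicitly below; explicit register
// names such as "{st(3)}", "{flags}" and "A" are fixed up after the generic
// TargetLowering lookup.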
39317 std::pair<unsigned, const TargetRegisterClass *>
39318 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
39319 StringRef Constraint,
39321 // First, see if this is a constraint that directly corresponds to an LLVM
39323 if (Constraint.size() == 1) {
39324 // GCC Constraint Letters
39325 switch (Constraint[0]) {
39327 // TODO: Slight differences here in allocation order and leaving
39328 // RIP in the class. Do they matter any more here than they do
39329 // in the normal allocation?
39331 if (Subtarget.hasAVX512()) {
39332 // Only supported in AVX512 or later.
39333 switch (VT.SimpleTy) {
39336 return std::make_pair(0U, &X86::VK32RegClass);
39338 return std::make_pair(0U, &X86::VK16RegClass);
39340 return std::make_pair(0U, &X86::VK8RegClass);
39342 return std::make_pair(0U, &X86::VK1RegClass);
39344 return std::make_pair(0U, &X86::VK64RegClass);
39348 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
39349 if (Subtarget.is64Bit()) {
39350 if (VT == MVT::i32 || VT == MVT::f32)
39351 return std::make_pair(0U, &X86::GR32RegClass);
39352 if (VT == MVT::i16)
39353 return std::make_pair(0U, &X86::GR16RegClass);
39354 if (VT == MVT::i8 || VT == MVT::i1)
39355 return std::make_pair(0U, &X86::GR8RegClass);
39356 if (VT == MVT::i64 || VT == MVT::f64)
39357 return std::make_pair(0U, &X86::GR64RegClass);
39361 // 32-bit fallthrough
39362 case 'Q': // Q_REGS
39363 if (VT == MVT::i32 || VT == MVT::f32)
39364 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
39365 if (VT == MVT::i16)
39366 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
39367 if (VT == MVT::i8 || VT == MVT::i1)
39368 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
39369 if (VT == MVT::i64)
39370 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
39372 case 'r': // GENERAL_REGS
39373 case 'l': // INDEX_REGS
39374 if (VT == MVT::i8 || VT == MVT::i1)
39375 return std::make_pair(0U, &X86::GR8RegClass);
39376 if (VT == MVT::i16)
39377 return std::make_pair(0U, &X86::GR16RegClass);
39378 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
39379 return std::make_pair(0U, &X86::GR32RegClass);
39380 return std::make_pair(0U, &X86::GR64RegClass);
39381 case 'R': // LEGACY_REGS
39382 if (VT == MVT::i8 || VT == MVT::i1)
39383 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
39384 if (VT == MVT::i16)
39385 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
39386 if (VT == MVT::i32 || !Subtarget.is64Bit())
39387 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
39388 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
39389 case 'f': // FP Stack registers.
39390 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
39391 // value to the correct fpstack register class.
39392 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
39393 return std::make_pair(0U, &X86::RFP32RegClass);
39394 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
39395 return std::make_pair(0U, &X86::RFP64RegClass);
39396 return std::make_pair(0U, &X86::RFP80RegClass);
39397 case 'y': // MMX_REGS if MMX allowed.
39398 if (!Subtarget.hasMMX()) break;
39399 return std::make_pair(0U, &X86::VR64RegClass);
39400 case 'Y': // SSE_REGS if SSE2 allowed
39401 if (!Subtarget.hasSSE2()) break;
39404 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
39405 if (!Subtarget.hasSSE1()) break;
39406 bool VConstraint = (Constraint[0] == 'v');
39408 switch (VT.SimpleTy) {
39410 // Scalar SSE types.
39413 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
39414 return std::make_pair(0U, &X86::FR32XRegClass);
39415 return std::make_pair(0U, &X86::FR32RegClass);
39418 if (VConstraint && Subtarget.hasVLX())
39419 return std::make_pair(0U, &X86::FR64XRegClass);
39420 return std::make_pair(0U, &X86::FR64RegClass);
39421 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
39429 if (VConstraint && Subtarget.hasVLX())
39430 return std::make_pair(0U, &X86::VR128XRegClass);
39431 return std::make_pair(0U, &X86::VR128RegClass);
39439 if (VConstraint && Subtarget.hasVLX())
39440 return std::make_pair(0U, &X86::VR256XRegClass);
39441 return std::make_pair(0U, &X86::VR256RegClass);
39446 return std::make_pair(0U, &X86::VR512RegClass);
39450 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
39451 switch (Constraint[1]) {
39457 return getRegForInlineAsmConstraint(TRI, "Y", VT);
39459 if (!Subtarget.hasMMX()) break;
39460 return std::make_pair(0U, &X86::VR64RegClass);
39463 if (!Subtarget.hasSSE1()) break;
39464 return std::make_pair(X86::XMM0, &X86::VR128RegClass);
39466 // This register class doesn't allocate k0 for masked vector operations.
39467 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
39468 switch (VT.SimpleTy) {
39471 return std::make_pair(0U, &X86::VK32WMRegClass);
39473 return std::make_pair(0U, &X86::VK16WMRegClass);
39475 return std::make_pair(0U, &X86::VK8WMRegClass);
39477 return std::make_pair(0U, &X86::VK1WMRegClass);
39479 return std::make_pair(0U, &X86::VK64WMRegClass);
39486 // Use the default implementation in TargetLowering to convert the register
39487 // constraint into a member of a register class.
39488 std::pair<unsigned, const TargetRegisterClass*> Res;
39489 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
39491 // Not found as a standard register?
39493 // Map "st(0)" .. "st(7)" to the corresponding FP stack registers.
39494 if (Constraint.size() == 7 && Constraint[0] == '{' &&
39495 tolower(Constraint[1]) == 's' &&
39496 tolower(Constraint[2]) == 't' &&
39497 Constraint[3] == '(' &&
39498 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
39499 Constraint[5] == ')' &&
39500 Constraint[6] == '}') {
39502 Res.first = X86::FP0+Constraint[4]-'0';
39503 Res.second = &X86::RFP80RegClass;
39507 // GCC allows "st(0)" to be called just plain "st".
39508 if (StringRef("{st}").equals_lower(Constraint)) {
39509 Res.first = X86::FP0;
39510 Res.second = &X86::RFP80RegClass;
39515 if (StringRef("{flags}").equals_lower(Constraint)) {
39516 Res.first = X86::EFLAGS;
39517 Res.second = &X86::CCRRegClass;
39521 // 'A' means [ER]AX + [ER]DX.
39522 if (Constraint == "A") {
39523 if (Subtarget.is64Bit()) {
39524 Res.first = X86::RAX;
39525 Res.second = &X86::GR64_ADRegClass;
39527 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
39528 "Expecting 64, 32 or 16 bit subtarget");
39529 Res.first = X86::EAX;
39530 Res.second = &X86::GR32_ADRegClass;
39537 // Make sure it isn't a register that requires 64-bit mode.
39538 if (!Subtarget.is64Bit() &&
39539 (isFRClass(*Res.second) || isGRClass(*Res.second)) &&
39540 TRI->getEncodingValue(Res.first) >= 8) {
39541 // Register requires REX prefix, but we're in 32-bit mode.
39543 Res.second = nullptr;
39547 // Make sure it isn't a register that requires AVX512.
39548 if (!Subtarget.hasAVX512() && isFRClass(*Res.second) &&
39549 TRI->getEncodingValue(Res.first) & 0x10) {
39550 // Register requires EVEX prefix.
39552 Res.second = nullptr;
39556 // Otherwise, check to see if this is a register class of the wrong value
39557 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
39558 // turn into {ax},{dx}.
39559 // MVT::Other is used to specify clobber names.
39560 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
39561 return Res; // Correct type already, nothing to do.
39563 // Get a matching integer of the correct size, i.e. "ax" with MVT::i32 should
39564 // return "eax". This should even work for things like getting 64-bit integer
39565 // registers when given an f64 type.
39566 const TargetRegisterClass *Class = Res.second;
39567 // The generic code will match the first register class that contains the
39568 // given register. Thus, based on the ordering of the tablegened file,
39569 // the "plain" GR classes might not come first.
39570 // Therefore, use a helper method.
39571 if (isGRClass(*Class)) {
39572 unsigned Size = VT.getSizeInBits();
39573 if (Size == 1) Size = 8;
39574 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
39576 bool is64Bit = Subtarget.is64Bit();
39577 const TargetRegisterClass *RC =
39578 Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
39579 : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
39580 : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
39581 : &X86::GR64RegClass;
39582 if (RC->contains(DestReg))
39583 Res = std::make_pair(DestReg, RC);
39585 // No register found/type mismatch.
39587 Res.second = nullptr;
39589 } else if (isFRClass(*Class)) {
39590 // Handle references to XMM physical registers that got mapped into the
39591 // wrong class. This can happen with constraints like {xmm0} where the
39592 // target independent register mapper will just pick the first match it can
39593 // find, ignoring the required type.
39595 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
39596 if (VT == MVT::f32 || VT == MVT::i32)
39597 Res.second = &X86::FR32RegClass;
39598 else if (VT == MVT::f64 || VT == MVT::i64)
39599 Res.second = &X86::FR64RegClass;
39600 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
39601 Res.second = &X86::VR128RegClass;
39602 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
39603 Res.second = &X86::VR256RegClass;
39604 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
39605 Res.second = &X86::VR512RegClass;
39607 // Type mismatch and not a clobber: return an error.
39609 Res.second = nullptr;
39616 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
39617 const AddrMode &AM, Type *Ty,
39618 unsigned AS) const {
39619 // Scaling factors are not free at all.
39620 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
39621 // will take 2 allocations in the out of order engine instead of 1
39622 // for plain addressing mode, i.e. inst (reg1).
39624 // vaddps (%rsi,%rdx), %ymm0, %ymm1
39625 // Requires two allocations (one for the load, one for the computation)
39627 // vaddps (%rsi), %ymm0, %ymm1
39628 // Requires just 1 allocation, i.e., freeing allocations for other operations
39629 // and having less micro operations to execute.
39631 // For some X86 architectures, this is even worse because for instance for
39632 // stores, the complex addressing mode forces the instruction to use the
39633 // "load" ports instead of the dedicated "store" port.
39634 // E.g., on Haswell:
39635 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
39636 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
39637 if (isLegalAddressingMode(DL, AM, Ty, AS))
39638 // Scale represents reg2 * scale, thus account for 1
39639 // as soon as we use a second register.
39640 return AM.Scale != 0;
39644 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
39645 // Integer division on x86 is expensive. However, when aggressively optimizing
39646 // for code size, we prefer to use a div instruction, as it is usually smaller
39647 // than the alternative sequence.
39648 // The exception to this is vector division. Since x86 doesn't have vector
39649 // integer division, leaving the division as-is is a loss even in terms of
39650 // size, because it will have to be scalarized, while the alternative code
39651 // sequence can be performed in vector form.
39653 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
39654 return OptSize && !VT.isVector();
39657 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
39658 if (!Subtarget.is64Bit())
39661 // Update IsSplitCSR in X86MachineFunctionInfo.
39662 X86MachineFunctionInfo *AFI =
39663 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
39664 AFI->setIsSplitCSR(true);
39667 void X86TargetLowering::insertCopiesSplitCSR(
39668 MachineBasicBlock *Entry,
39669 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
39670 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
39671 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
39675 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
39676 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
39677 MachineBasicBlock::iterator MBBI = Entry->begin();
39678 for (const MCPhysReg *I = IStart; *I; ++I) {
39679 const TargetRegisterClass *RC = nullptr;
39680 if (X86::GR64RegClass.contains(*I))
39681 RC = &X86::GR64RegClass;
39683 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
39685 unsigned NewVR = MRI->createVirtualRegister(RC);
39686 // Create copy from CSR to a virtual register.
39687 // FIXME: this currently does not emit CFI pseudo-instructions, it works
39688 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
39689 // nounwind. If we want to generalize this later, we may need to emit
39690 // CFI pseudo-instructions.
39691 assert(Entry->getParent()->getFunction().hasFnAttribute(
39692 Attribute::NoUnwind) &&
39693 "Function should be nounwind in insertCopiesSplitCSR!");
39694 Entry->addLiveIn(*I);
39695 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
39698 // Insert the copy-back instructions right before the terminator.
39699 for (auto *Exit : Exits)
39700 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
39701 TII->get(TargetOpcode::COPY), *I)
39706 bool X86TargetLowering::supportSwiftError() const {
39707 return Subtarget.is64Bit();
39710 /// Returns the name of the symbol used to emit stack probes or the empty
39711 /// string if not applicable.
39712 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
39713 // If the function specifically requests stack probes, emit them.
39714 if (MF.getFunction().hasFnAttribute("probe-stack"))
39715 return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
39717 // Generally, if we aren't on Windows, the platform ABI does not include
39718 // support for stack probes, so don't emit them.
39719 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() ||
39720 MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
39723 // We need a stack probe to conform to the Windows ABI. Choose the right symbol.
39725 if (Subtarget.is64Bit())
39726 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
39727 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
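// For reference, the symbols chosen above are: "___chkstk_ms" for 64-bit
// cygwin/mingw, "__chkstk" for other 64-bit Windows targets, "_alloca" for
// 32-bit cygwin/mingw, and "_chkstk" for other 32-bit Windows targets.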