//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>
using namespace llvm;
#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");

static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// emitting an error message of some kind.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}
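// Illustrative usage note (not part of the original source): lowering code
// that cannot honor a calling convention without SSE might report
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// and then continue with a best-effort value instead of aborting compilation.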
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
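  // Illustrative note (not in the original source): a scalar SETCC selects
  // to an instruction like SETE %al, producing 0 or 1 in an i8, while a
  // vector compare such as PCMPEQD writes all-ones (-1) or zero into each
  // lane; that difference is exactly what the two boolean contents describe.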
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
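  // Illustrative note (not in the original source): addBypassSlowDiv(32, 8)
  // asks the slow-division bypass transform to guard each 32-bit division
  // with a runtime check; when both operands happen to fit in 8 bits, the
  // much cheaper 8-bit divide runs instead of the full-width one.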
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Integer absolute.
  if (Subtarget.hasCMov()) {
    setOperationAction(ISD::ABS, MVT::i16, Custom);
    setOperationAction(ISD::ABS, MVT::i32, Custom);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::ABS, MVT::i64, Custom);
  }

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }

  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }

  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
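  // Illustrative example (not part of the original source): after this
  // lowering, IR such as
  //   %q = udiv i32 %x, %y
  //   %r = urem i32 %x, %y
  // is selected to a single 32-bit DIV, which leaves the quotient in EAX
  // and the remainder in EDX, so both values come from one instruction.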
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
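  // Illustrative note (not in the original source): with FP16_TO_FP marked
  // Expand, an f16 value extended to f32 is legalized into a call to the
  // soft-float helper __gnu_h2f_ieee (and FP_TO_FP16 into __gnu_f2h_ieee);
  // with F16C available, the same conversions instead select to the
  // VCVTPH2PS/VCVTPS2PH instructions.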
  if (Subtarget.hasPOPCNT()) {
    setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }

  // Custom action for SELECT MMX and expand action for SELECT_CC MMX
  setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
  setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
  // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }
  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
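  // Illustrative note (not in the original source): on 32-bit x86 a 64-bit
  // shift such as (i64 %x << %n) is split through SHL_PARTS into two 32-bit
  // halves, typically a SHLD filling the high half from the low half plus a
  // SHL for the low half, with a fixup for shift amounts of 32 or more.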
  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f64, Expand);
    setOperationAction(ISD::FCOS, MVT::f64, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      // Always expand sin/cos functions even though x87 has an instruction.
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    // Always expand sin/cos functions even though x87 has an instruction.
    setOperationAction(ISD::FSIN, MVT::f80, Expand);
    setOperationAction(ISD::FCOS, MVT::f80, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f80, Expand);

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
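    // Illustrative note (not in the original source): a sextload of v4i8 can
    // be implemented as one 32-bit scalar load into an XMM register followed
    // by in-register widening (e.g. unpacks plus arithmetic shifts, or
    // PMOVSXBD once SSE4.1 is available) rather than four element loads.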
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }
    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }
    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
    if (HasInt256) {
      // Custom legalize 2x32 to get a little better code.
      setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
      setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);

      for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                       MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::MGATHER, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()){
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }

    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);
    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }
    for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
                    MVT::v8i64}) {
      setOperationAction(ISD::ROTL, VT, Custom);
      setOperationAction(ISD::ROTR, VT, Custom);
    }
    // Need to promote to 64-bit even though we have 32-bit masked instructions
    // because the IR optimizers rearrange bitcasts around logic ops leaving
    // too many variations to handle if we don't promote them.
    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
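
    // For example, (and (v16i32 X), (v16i32 Y)) is rewritten during
    // legalization as
    //   (v16i32 (bitcast (and (v8i64 (bitcast X)), (v8i64 (bitcast Y)))))
    // so only the 64-bit form of each logic op needs selection patterns.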

    if (Subtarget.hasCDI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
                      MVT::v4i64, MVT::v8i64}) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasDQI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
      setOperationAction(ISD::MUL, MVT::v4i64, Legal);
      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }

    if (Subtarget.hasVPOPCNTDQ()) {
      // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
      // version of popcntd/q.
      for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
                      MVT::v4i32, MVT::v2i64})
        setOperationAction(ISD::CTPOP, VT, Legal);
    }

    // Custom lower several nodes.
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
      setOperationAction(ISD::MSCATTER, VT, Custom);

    setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v1i1, Legal);

    // Extract subvector is special because the value type
    // (result) is 256-bit but the source is 512-bit wide.
    // 128-bit was made Legal under AVX1.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

    setOperationAction(ISD::ADD, MVT::v32i1, Custom);
    setOperationAction(ISD::ADD, MVT::v64i1, Custom);
    setOperationAction(ISD::SUB, MVT::v32i1, Custom);
    setOperationAction(ISD::SUB, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i1, Custom);
    setOperationAction(ISD::MUL, MVT::v64i1, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
    setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i16, Legal);
    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
    setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

    setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
    }

    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
      setOperationAction(ISD::MLOAD, VT, Action);
      setOperationAction(ISD::MSTORE, VT, Action);
    }

    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);

      setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
    }
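
    // AVX512BITALG provides native byte/word population-count instructions
    // (vpopcntb/vpopcntw), so CTPOP on the i8/i16 vector types below can be
    // marked Legal instead of being custom-lowered.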
    if (Subtarget.hasBITALG()) {
      for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v32i8,
                       MVT::v16i16, MVT::v16i8, MVT::v8i16 })
        setOperationAction(ISD::CTPOP, VT, Legal);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
    }
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  if (!Subtarget.is64Bit()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub/Mul with overflow operations are custom lowered.
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);

    // Support carry in as value rather than glue.
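    // For example, a 128-bit addition on a 64-bit target becomes, roughly:
    //   (Lo, C) = UADDO(A.lo, B.lo)          // ADD
    //   (Hi, _) = ADDCARRY(A.hi, B.hi, C)    // ADC
    // with the carry C flowing as an ordinary value instead of a glue edge.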
    setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
    setOperationAction(ISD::SETCCCARRY, VT, Custom);
  }

  if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit.
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
    setLibcallName(RTLIB::MUL_I128, nullptr);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget.hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget.isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  if (Subtarget.isTargetWin64()) {
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
    setOperationAction(ISD::SREM, MVT::i128, Custom);
    setOperationAction(ISD::UREM, MVT::i128, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }
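
  // These are custom-lowered because i128 is not a legal argument type in the
  // Win64 ABI; the lowering presumably forwards to the __divti3-style helper
  // calls with the operands passed indirectly.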

  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
  // is. We should promote the value to 64-bits to solve this.
  // This is what the CRT headers do - `fmodf` is an inline header
  // function casting to f64 and calling `fmod`.
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                              Subtarget.isTargetWindowsItanium()))
    for (ISD::NodeType Op :
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
      if (isOperationExpand(Op, MVT::f32))
        setOperationAction(Op, MVT::f32, Promote);
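
  // Net effect: e.g. a call to fmodf(x, y) is emitted as
  // (float)fmod((double)x, (double)y), matching the CRT's inline definition.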

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MLOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MSTORE);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::MSCATTER);
  setTargetDAGCombine(ISD::MGATHER);

  computeRegisterProperties(Subtarget.getRegisterInfo());

  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 4;

  // TODO: These control memcmp expansion in CGP and could be raised higher, but
  // that needs to be benchmarked and balanced with the potential use of vector
  // load/store types (PR33329, PR33914).
  MaxLoadsPerMemcmp = 2;
  MaxLoadsPerMemcmpOptSize = 2;

  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

  // An out-of-order CPU can speculatively execute past a predictable branch,
  // but a conditional move could be stalled by an expensive earlier operation.
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
  EnableExtLdPromotion = true;
  setPrefFunctionAlignment(4); // 2^4 bytes.

  verifyIntrinsicTables();
}

// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}

bool X86TargetLowering::useStackGuardXorFP() const {
  // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
  return Subtarget.getTargetTriple().isOSMSVCRT();
}

SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
                                               const SDLoc &DL) const {
  EVT PtrTy = getPointerTy(DAG.getDataLayout());
  unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
  MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
  return SDValue(Node, 0);
}
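
// Under the experimental widening flag, prefer widening an illegal vector
// type over promoting its elements, except for single-element and i1 vectors.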
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
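
/// Return the type to use for the result of a vector comparison: a vXi1 mask
/// where AVX-512 mask registers can hold it, otherwise a vector of integers
/// as wide as the compared elements.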
EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (VT.getSizeInBits() >= 512) {
    EVT EltVT = VT.getVectorElementType();
    const unsigned NumElts = VT.getVectorNumElements();
    if (Subtarget.hasAVX512())
      if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
          EltVT == MVT::f32 || EltVT == MVT::f64)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
    if (Subtarget.hasBWI())
      if (EltVT == MVT::i8 || EltVT == MVT::i16)
        return EVT::getVectorVT(Context, MVT::i1, NumElts);
  }

  if (VT.isSimple()) {
    MVT VVT = VT.getSimpleVT();
    const unsigned NumElts = VVT.getVectorNumElements();
    MVT EltVT = VVT.getVectorElementType();

    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return MVT::getVectorVT(MVT::i1, NumElts);

    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
      EVT LegalVT = getTypeToTransformTo(Context, VT);
      EltVT = LegalVT.getVectorElementType().getSimpleVT();
    }

    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
      switch (NumElts) {
      case 2: return MVT::v2i1;
      case 4: return MVT::v4i1;
      case 8: return MVT::v8i1;
      }
  }

  return VT.changeVectorElementTypeToInteger();
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, it is safe to use any destination
/// alignment; similarly, if SrcAlign is zero there is no need to check it
/// against an alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is
/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
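/// For example, under these rules a 32-byte memset on an AVX subtarget with
/// fast unaligned stores is expanded with v32i8 stores, while a 64-bit target
/// compiled without implicit FP use falls back to i64 stores.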
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {

  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction()->getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N int arguments as having reg
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
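
// Example: SegmentOffset(IRB, 0x28, 257) yields an i8 addrspace(257)* constant
// addressing %fs:0x28; address spaces 256 and 257 select the %gs and %fs
// segment bases respectively.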

Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    }
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }
  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // 0x48; %gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");

  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

/// Lowers mask values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}

/// Breaks a v64i1 value into two registers and adds the new node to the DAG.
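/// On 32-bit targets the value is bitcast to i64, split into two i32 halves,
/// and the halves are passed in the two given argument registers.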
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    //
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call see PR19530
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new SDValue of size 64 bits.
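/// The two 32 bit halves are read from their registers (glued together when
/// reading physical registers), bitcast to v32i1, and concatenated into one
/// v64i1 value.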
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  unsigned Reg;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
/// \returns a DAG node containing the operand after lowering to mask type.
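/// For example, a v16i1 mask arriving in an i32 location is truncated to i16
/// and then bitcast to v16i1; if it arrives in an i16 location it is bitcast
/// directly.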
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32 bit machine this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64 bit machine there is no need to truncate the value, only bitcast.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention seems to be the standard for many Windows
//  API routines. It differs from the C calling convention just a little: the
//  callee should clean up the stack, not the caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on the fast calling convention see Fast Calling Convention (tail
//  call) implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // size.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in a regular call.
    // The offset of the last argument needs to be set to -4/-8 bytes, while
    // the offset of the first argument (of two) should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
    if (Subtarget.is64Bit() && Ins.size() == 2) {
      // The stack pointer needs to be realigned for 64 bit handlers with error
      // code, so the argument offset changes by 8 bytes.
      Offset += 8;
    }
  }

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable, since they
  // could be overwritten by the lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    // Adjust SP offset of interrupt parameter.
    if (CallConv == CallingConv::X86_INTR) {
      MFI.setObjectOffset(FI, Offset);
    }
    return DAG.getFrameIndex(FI, PtrVT);
  }

  // This is an argument in memory. We might be able to perform copy elision.
  if (Flags.isCopyElisionCandidate()) {
    EVT ArgVT = Ins[i].ArgVT;
    SDValue PartAddr;
    if (Ins[i].PartOffset == 0) {
      // If this is a one-part value or the first part of a multi-part value,
      // create a stack object for the entire argument value type and return a
      // load from our portion of it. This assumes that if the first part of an
      // argument is in memory, the rest will also be in memory.
      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
                                     /*Immutable=*/false);
      PartAddr = DAG.getFrameIndex(FI, PtrVT);
      return DAG.getLoad(
          ValVT, dl, Chain, PartAddr,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
    } else {
      // This is not the first piece of an argument in memory. See if there is
      // already a fixed stack object including this offset. If so, assume it
      // was created by the PartOffset == 0 branch above and create a load from
      // the appropriate offset into it.
      int64_t PartBegin = VA.getLocMemOffset();
      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
      int FI = MFI.getObjectIndexBegin();
      for (; MFI.isFixedObjectIndex(FI); ++FI) {
        int64_t ObjBegin = MFI.getObjectOffset(FI);
        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
          break;
      }
      if (MFI.isFixedObjectIndex(FI)) {
        SDValue Addr =
            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }

  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
                                 VA.getLocMemOffset(), isImmutable);

  // Set SExt or ZExt flag.
  if (VA.getLocInfo() == CCValAssign::ZExt) {
    MFI.setObjectZExt(FI, true);
  } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }

  // Adjust SP offset of interrupt parameter.
  if (CallConv == CallingConv::X86_INTR) {
    MFI.setObjectOffset(FI, Offset);
  }

  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
  SDValue Val = DAG.getLoad(
      ValVT, dl, Chain, FIN,
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}

2831 // FIXME: Get this from tablegen.
2832 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2833 const X86Subtarget &Subtarget) {
2834 assert(Subtarget.is64Bit());
2836 if (Subtarget.isCallingConvWin64(CallConv)) {
2837 static const MCPhysReg GPR64ArgRegsWin64[] = {
2838 X86::RCX, X86::RDX, X86::R8, X86::R9
2840 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2843 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2844 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2846 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2849 // FIXME: Get this from tablegen.
2850 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2851 CallingConv::ID CallConv,
2852 const X86Subtarget &Subtarget) {
2853 assert(Subtarget.is64Bit());
2854 if (Subtarget.isCallingConvWin64(CallConv)) {
2855 // The XMM registers which might contain var arg parameters are shadowed
2856 // in their paired GPR. So we only need to save the GPR to their home
     // slots.
2858 // TODO: __vectorcall will change this.
     return None;
   }
2862 const Function *Fn = MF.getFunction();
2863 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2864 bool isSoftFloat = Subtarget.useSoftFloat();
2865 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2866 "SSE register cannot be used when SSE is disabled!");
2867 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2868 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
     // registers.
     return None;
2872 static const MCPhysReg XMMArgRegs64Bit[] = {
2873 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2874 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2876 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2880 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2881 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2882 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2883 return A.getValNo() < B.getValNo();
2888 SDValue X86TargetLowering::LowerFormalArguments(
2889 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2890 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2891 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2892 MachineFunction &MF = DAG.getMachineFunction();
2893 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2894 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2896 const Function *Fn = MF.getFunction();
2897 if (Fn->hasExternalLinkage() &&
2898 Subtarget.isTargetCygMing() &&
2899 Fn->getName() == "main")
2900 FuncInfo->setForceFramePointer(true);
2902 MachineFrameInfo &MFI = MF.getFrameInfo();
2903 bool Is64Bit = Subtarget.is64Bit();
2904 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2907 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2908 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2910 if (CallConv == CallingConv::X86_INTR) {
2911 bool isLegal = Ins.size() == 1 ||
2912 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2913 (!Is64Bit && Ins[1].VT == MVT::i32)));
     if (!isLegal)
2915 report_fatal_error("X86 interrupts may take one or two arguments");
   }
2918 // Assign locations to all of the incoming arguments.
2919 SmallVector<CCValAssign, 16> ArgLocs;
2920 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2922 // Allocate shadow area for Win64.
2924 CCInfo.AllocateStack(32, 8);
2926 CCInfo.AnalyzeArguments(Ins, CC_X86);
2928 // In the vectorcall calling convention, a second pass is required for the HVA
     // registers.
2930 if (CallingConv::X86_VectorCall == CallConv) {
2931 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2934 // The next loop assumes that the locations are in the same order as the
     // incoming arguments (Ins).
2936 assert(isSortedByValueNo(ArgLocs) &&
2937 "Argument Location list must be sorted before lowering");
2940 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2942 assert(InsIndex < Ins.size() && "Invalid Ins index");
2943 CCValAssign &VA = ArgLocs[I];
2945 if (VA.isRegLoc()) {
2946 EVT RegVT = VA.getLocVT();
2947 if (VA.needsCustom()) {
2949 VA.getValVT() == MVT::v64i1 &&
2950 "Currently the only custom case is when we split v64i1 to 2 regs");
2952 // In the regcall calling convention, v64i1 values compiled for a
2953 // 32-bit arch are split up into two registers.
2955 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2957 const TargetRegisterClass *RC;
2958 if (RegVT == MVT::i32)
2959 RC = &X86::GR32RegClass;
2960 else if (Is64Bit && RegVT == MVT::i64)
2961 RC = &X86::GR64RegClass;
2962 else if (RegVT == MVT::f32)
2963 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2964 else if (RegVT == MVT::f64)
2965 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2966 else if (RegVT == MVT::f80)
2967 RC = &X86::RFP80RegClass;
2968 else if (RegVT == MVT::f128)
2969 RC = &X86::FR128RegClass;
2970 else if (RegVT.is512BitVector())
2971 RC = &X86::VR512RegClass;
2972 else if (RegVT.is256BitVector())
2973 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2974 else if (RegVT.is128BitVector())
2975 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2976 else if (RegVT == MVT::x86mmx)
2977 RC = &X86::VR64RegClass;
2978 else if (RegVT == MVT::v1i1)
2979 RC = &X86::VK1RegClass;
2980 else if (RegVT == MVT::v8i1)
2981 RC = &X86::VK8RegClass;
2982 else if (RegVT == MVT::v16i1)
2983 RC = &X86::VK16RegClass;
2984 else if (RegVT == MVT::v32i1)
2985 RC = &X86::VK32RegClass;
2986 else if (RegVT == MVT::v64i1)
2987 RC = &X86::VK64RegClass;
       else
2989 llvm_unreachable("Unknown argument type!");
2991 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2992 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2995 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2996 // bits. Insert an assert[sz]ext to capture this, then truncate to the
     // right size.
2998 if (VA.getLocInfo() == CCValAssign::SExt)
2999 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3000 DAG.getValueType(VA.getValVT()));
3001 else if (VA.getLocInfo() == CCValAssign::ZExt)
3002 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3003 DAG.getValueType(VA.getValVT()));
3004 else if (VA.getLocInfo() == CCValAssign::BCvt)
3005 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3007 if (VA.isExtInLoc()) {
3008 // Handle MMX values passed in XMM regs.
3009 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3010 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3011 else if (VA.getValVT().isVector() &&
3012 VA.getValVT().getScalarType() == MVT::i1 &&
3013 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3014 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3015 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3016 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3018 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3021 assert(VA.isMemLoc());
3023 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3026 // If value is passed via pointer - do a load.
3027 if (VA.getLocInfo() == CCValAssign::Indirect)
3029 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3031 InVals.push_back(ArgValue);
3034 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3035 // The Swift calling convention does not require that we copy the sret argument
3036 // into %rax/%eax for the return, so we don't set SRetReturnReg for Swift.
3037 if (CallConv == CallingConv::Swift)
3040 // All x86 ABIs require that for returning structs by value we copy the
3041 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3042 // the argument into a virtual register so that we can access it from the
     // return points.
3044 if (Ins[I].Flags.isSRet()) {
3045 unsigned Reg = FuncInfo->getSRetReturnReg();
     if (!Reg) {
3047 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3048 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3049 FuncInfo->setSRetReturnReg(Reg);
     }
3051 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3052 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3057 unsigned StackSize = CCInfo.getNextStackOffset();
3058 // Align stack specially for tail calls.
3059 if (shouldGuaranteeTCO(CallConv,
3060 MF.getTarget().Options.GuaranteedTailCallOpt))
3061 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3063 // If the function takes a variable number of arguments, make a frame index for
3064 // the start of the first vararg value... for expansion of llvm.va_start. We
3065 // can skip this if there are no va_start calls.
3066 if (MFI.hasVAStart() &&
3067 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3068 CallConv != CallingConv::X86_ThisCall))) {
3069 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3072 // Figure out if XMM registers are in use.
3073 assert(!(Subtarget.useSoftFloat() &&
3074 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3075 "SSE register cannot be used when SSE is disabled!");
3077 // 64-bit calling conventions support varargs and register parameters, so we
3078 // have to do extra work to spill them in the prologue.
3079 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3080 // Find the first unallocated argument registers.
3081 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3082 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3083 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3084 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3085 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3086 "SSE register cannot be used when SSE is disabled!");
3088 // Gather all the live in physical registers.
3089 SmallVector<SDValue, 6> LiveGPRs;
3090 SmallVector<SDValue, 8> LiveXMMRegs;
3092 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3093 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3095 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3097 if (!ArgXMMs.empty()) {
3098 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3099 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3100 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3101 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3102 LiveXMMRegs.push_back(
3103 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3108 // Get to the caller-allocated home save location. Add 8 to account
3109 // for the return address.
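// (The Win64 home area holds 4 slots of 8 bytes = 32 bytes, so the home slot
// of the first unallocated GPR sits NumIntRegs * 8 bytes past the return
// address.)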
3110 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3111 FuncInfo->setRegSaveFrameIndex(
3112 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3113 // Fixup to set vararg frame on shadow area (4 x i64).
     if (NumIntRegs < 4)
3115 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3117 // For X86-64, if there are vararg parameters that are passed via
3118 // registers, then we must store them to their spots on the stack so
3119 // they may be loaded by dereferencing the result of va_next.
3120 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3121 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3122 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3123 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
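// (With the full SysV argument register sets, 6 GPRs and 8 XMMs, this
// reserves 6 * 8 + 8 * 16 = 176 bytes, aligned to 16 bytes.)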
3126 // Store the integer parameter registers.
3127 SmallVector<SDValue, 8> MemOps;
3128 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3129 getPointerTy(DAG.getDataLayout()));
3130 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3131 for (SDValue Val : LiveGPRs) {
3132 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3133 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3135 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3136 MachinePointerInfo::getFixedStack(
3137 DAG.getMachineFunction(),
3138 FuncInfo->getRegSaveFrameIndex(), Offset));
3139 MemOps.push_back(Store);
3143 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3144 // Now store the XMM (fp + vector) parameter registers.
3145 SmallVector<SDValue, 12> SaveXMMOps;
3146 SaveXMMOps.push_back(Chain);
3147 SaveXMMOps.push_back(ALVal);
3148 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3149 FuncInfo->getRegSaveFrameIndex(), dl));
3150 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3151 FuncInfo->getVarArgsFPOffset(), dl));
3152 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3154 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3155 MVT::Other, SaveXMMOps));
3158 if (!MemOps.empty())
3159 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3162 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3163 // Find the largest legal vector type.
3164 MVT VecVT = MVT::Other;
3165 // FIXME: Only some x86_32 calling conventions support AVX512.
3166 if (Subtarget.hasAVX512() &&
3167 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3168 CallConv == CallingConv::Intel_OCL_BI)))
3169 VecVT = MVT::v16f32;
3170 else if (Subtarget.hasAVX())
3172 else if (Subtarget.hasSSE2())
3175 // We forward some GPRs and some vector types.
3176 SmallVector<MVT, 2> RegParmTypes;
3177 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3178 RegParmTypes.push_back(IntVT);
3179 if (VecVT != MVT::Other)
3180 RegParmTypes.push_back(VecVT);
3182 // Compute the set of forwarded registers. The rest are scratch.
3183 SmallVectorImpl<ForwardedRegister> &Forwards =
3184 FuncInfo->getForwardedMustTailRegParms();
3185 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3187 // Conservatively forward AL on x86_64, since it might be used for varargs.
3188 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3189 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3190 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3193 // Copy all forwards from physical to virtual registers.
3194 for (ForwardedRegister &F : Forwards) {
3195 // FIXME: Can we use a less constrained schedule?
3196 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3197 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3198 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3202 // Some CCs need callee pop.
3203 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3204 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3205 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3206 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3207 // X86 interrupts must pop the error code (and the alignment padding) if
     // present.
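// On x86-64 that is the 8-byte error code plus 8 bytes of realignment
// padding (16 bytes total); on x86-32 it is just the 4-byte error code.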
3209 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3211 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3212 // If this is an sret function, the return should pop the hidden pointer.
3213 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3214 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3215 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3216 FuncInfo->setBytesToPopOnReturn(4);
3220 // RegSaveFrameIndex is X86-64 only.
3221 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3222 if (CallConv == CallingConv::X86_FastCall ||
3223 CallConv == CallingConv::X86_ThisCall)
3224 // fastcc functions can't have varargs.
3225 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3228 FuncInfo->setArgumentStackSize(StackSize);
3230 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3231 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3232 if (Personality == EHPersonality::CoreCLR) {
3234 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3235 // that we'd prefer this slot be allocated towards the bottom of the frame
3236 // (i.e. near the stack pointer after allocating the frame). Every
3237 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3238 // offset from the bottom of this and each funclet's frame must be the
3239 // same, so the size of funclets' (mostly empty) frames is dictated by
3240 // how far this slot is from the bottom (since they allocate just enough
3241 // space to accommodate holding this slot at the correct offset).
3242 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3243 EHInfo->PSPSymFrameIdx = PSPSymFI;
3247 if (CallConv == CallingConv::X86_RegCall ||
3248 Fn->hasFnAttribute("no_caller_saved_registers")) {
3249 MachineRegisterInfo &MRI = MF.getRegInfo();
3250 for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
3251 MRI.disableCalleeSavedRegister(Pair.first);
3257 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3258 SDValue Arg, const SDLoc &dl,
3260 const CCValAssign &VA,
3261 ISD::ArgFlagsTy Flags) const {
3262 unsigned LocMemOffset = VA.getLocMemOffset();
3263 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3264 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3266 if (Flags.isByVal())
3267 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3269 return DAG.getStore(
3270 Chain, dl, Arg, PtrOff,
3271 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3274 /// Emit a load of return address if tail call
3275 /// optimization is performed and it is required.
3276 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3277 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3278 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3279 // Adjust the Return address stack slot.
3280 EVT VT = getPointerTy(DAG.getDataLayout());
3281 OutRetAddr = getReturnAddressFrameIndex(DAG);
3283 // Load the "old" Return address.
3284 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3285 return SDValue(OutRetAddr.getNode(), 1);
3288 /// Emit a store of the return address if tail call
3289 /// optimization is performed and it is required (FPDiff!=0).
3290 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3291 SDValue Chain, SDValue RetAddrFrIdx,
3292 EVT PtrVT, unsigned SlotSize,
3293 int FPDiff, const SDLoc &dl) {
3294 // Store the return address to the appropriate stack slot.
3295 if (!FPDiff) return Chain;
3296 // Calculate the new stack slot for the return address.
3297 int NewReturnAddrFI =
3298 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3300 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3301 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3302 MachinePointerInfo::getFixedStack(
3303 DAG.getMachineFunction(), NewReturnAddrFI));
3307 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3308 /// operation of specified width.
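/// For example, for a 4-element vector the mask is <4, 1, 2, 3>: element 0 of
/// the result comes from V2 and elements 1-3 come from V1.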
3309 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3311 unsigned NumElems = VT.getVectorNumElements();
3312 SmallVector<int, 8> Mask;
3313 Mask.push_back(NumElems);
3314 for (unsigned i = 1; i != NumElems; ++i)
3316 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3320 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3321 SmallVectorImpl<SDValue> &InVals) const {
3322 SelectionDAG &DAG = CLI.DAG;
3324 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3325 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3326 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3327 SDValue Chain = CLI.Chain;
3328 SDValue Callee = CLI.Callee;
3329 CallingConv::ID CallConv = CLI.CallConv;
3330 bool &isTailCall = CLI.IsTailCall;
3331 bool isVarArg = CLI.IsVarArg;
3333 MachineFunction &MF = DAG.getMachineFunction();
3334 bool Is64Bit = Subtarget.is64Bit();
3335 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3336 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3337 bool IsSibcall = false;
3338 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3339 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3340 const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
3341 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3342 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3343 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3345 if (CallConv == CallingConv::X86_INTR)
3346 report_fatal_error("X86 interrupts may not be called directly");
3348 if (Attr.getValueAsString() == "true")
       isTailCall = false;
3351 if (Subtarget.isPICStyleGOT() &&
3352 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3353 // If we are using a GOT, disable tail calls to external symbols with
3354 // default visibility. Tail calling such a symbol requires using a GOT
3355 // relocation, which forces early binding of the symbol. This breaks code
3356 // that requires lazy function symbol resolution. Using musttail or
3357 // GuaranteedTailCallOpt will override this.
3358 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3359 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3360 G->getGlobal()->hasDefaultVisibility()))
3364 bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
   if (IsMustTail) {
3366 // Force this to be a tail call. The verifier rules are enough to ensure
3367 // that we can lower this successfully without moving the return address
     // around.
     isTailCall = true;
3370 } else if (isTailCall) {
3371 // Check if it's really possible to do a tail call.
3372 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3373 isVarArg, SR != NotStructReturn,
3374 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3375 Outs, OutVals, Ins, DAG);
3377 // Sibcalls are automatically detected tailcalls which do not require
     // ABI changes.
3379 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3386 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3387 "Var args not supported with calling convention fastcc, ghc or hipe");
3389 // Analyze operands of the call, assigning locations to each operand.
3390 SmallVector<CCValAssign, 16> ArgLocs;
3391 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3393 // Allocate shadow area for Win64.
3395 CCInfo.AllocateStack(32, 8);
3397 CCInfo.AnalyzeArguments(Outs, CC_X86);
3399 // In the vectorcall calling convention, a second pass is required for the HVA
     // registers.
3401 if (CallingConv::X86_VectorCall == CallConv) {
3402 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3405 // Get a count of how many bytes are to be pushed on the stack.
3406 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
   if (IsSibcall)
3408 // This is a sibcall. The memory operands are available in the caller's
3409 // own caller's stack.
     NumBytes = 0;
3411 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3412 canGuaranteeTCO(CallConv))
3413 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3416 if (isTailCall && !IsSibcall && !IsMustTail) {
3417 // Lower arguments at fp - stackoffset + fpdiff.
3418 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3420 FPDiff = NumBytesCallerPushed - NumBytes;
3422 // Set the delta of movement of the returnaddr stackslot.
3423 // But only set if delta is greater than previous delta.
3424 if (FPDiff < X86Info->getTCReturnAddrDelta())
3425 X86Info->setTCReturnAddrDelta(FPDiff);
3428 unsigned NumBytesToPush = NumBytes;
3429 unsigned NumBytesToPop = NumBytes;
3431 // If we have an inalloca argument, all stack space has already been allocated
3432 // for us and will be right at the top of the stack. We don't support multiple
3433 // arguments passed in memory when using inalloca.
3434 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3436 if (!ArgLocs.back().isMemLoc())
3437 report_fatal_error("cannot use inalloca attribute on a register "
3439 if (ArgLocs.back().getLocMemOffset() != 0)
3440 report_fatal_error("any parameter with the inalloca attribute must be "
3441 "the only memory argument");
3445 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3446 NumBytes - NumBytesToPush, dl);
3448 SDValue RetAddrFrIdx;
3449 // Load return address for tail calls.
3450 if (isTailCall && FPDiff)
3451 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3452 Is64Bit, FPDiff, dl);
3454 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3455 SmallVector<SDValue, 8> MemOpChains;
3458 // The next loop assumes that the locations are in the same order as the
     // outgoing arguments (Outs).
3460 assert(isSortedByValueNo(ArgLocs) &&
3461 "Argument Location list must be sorted before lowering");
3463 // Walk the register/memloc assignments, inserting copies/loads. In the case
3464 // of tail call optimization, arguments are handled later.
3465 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3466 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3468 assert(OutIndex < Outs.size() && "Invalid Out index");
3469 // Skip inalloca arguments, they have already been written.
3470 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3471 if (Flags.isInAlloca())
3474 CCValAssign &VA = ArgLocs[I];
3475 EVT RegVT = VA.getLocVT();
3476 SDValue Arg = OutVals[OutIndex];
3477 bool isByVal = Flags.isByVal();
3479 // Promote the value if needed.
3480 switch (VA.getLocInfo()) {
3481 default: llvm_unreachable("Unknown loc info!");
3482 case CCValAssign::Full: break;
3483 case CCValAssign::SExt:
3484 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3486 case CCValAssign::ZExt:
3487 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3489 case CCValAssign::AExt:
3490 if (Arg.getValueType().isVector() &&
3491 Arg.getValueType().getVectorElementType() == MVT::i1)
3492 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3493 else if (RegVT.is128BitVector()) {
3494 // Special case: passing MMX values in XMM registers.
3495 Arg = DAG.getBitcast(MVT::i64, Arg);
3496 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3497 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3499 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3501 case CCValAssign::BCvt:
3502 Arg = DAG.getBitcast(RegVT, Arg);
3504 case CCValAssign::Indirect: {
3505 // Store the argument.
3506 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3507 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3508 Chain = DAG.getStore(
3509 Chain, dl, Arg, SpillSlot,
3510 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3516 if (VA.needsCustom()) {
3517 assert(VA.getValVT() == MVT::v64i1 &&
3518 "Currently the only custom case is when we split v64i1 to 2 regs");
3519 // Split v64i1 value into two registers
3520 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3522 } else if (VA.isRegLoc()) {
3523 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3524 if (isVarArg && IsWin64) {
3525 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3526 // shadow reg if callee is a varargs function.
3527 unsigned ShadowReg = 0;
3528 switch (VA.getLocReg()) {
3529 case X86::XMM0: ShadowReg = X86::RCX; break;
3530 case X86::XMM1: ShadowReg = X86::RDX; break;
3531 case X86::XMM2: ShadowReg = X86::R8; break;
3532 case X86::XMM3: ShadowReg = X86::R9; break;
3535 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3537 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3538 assert(VA.isMemLoc());
3539 if (!StackPtr.getNode())
3540 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3541 getPointerTy(DAG.getDataLayout()));
3542 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3543 dl, DAG, VA, Flags));
3547 if (!MemOpChains.empty())
3548 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3550 if (Subtarget.isPICStyleGOT()) {
3551 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3554 RegsToPass.push_back(std::make_pair(
3555 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3556 getPointerTy(DAG.getDataLayout()))));
3558 // If we are tail calling and generating PIC/GOT style code load the
3559 // address of the callee into ECX. The value in ecx is used as target of
3560 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3561 // for tail calls on PIC/GOT architectures. Normally we would just put the
3562 // address of GOT into ebx and then call target@PLT. But for tail calls
3563 // ebx would be restored (since ebx is callee saved) before jumping to the
     // callee.
3566 // Note: The actual moving to ECX is done further down.
3567 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3568 if (G && !G->getGlobal()->hasLocalLinkage() &&
3569 G->getGlobal()->hasDefaultVisibility())
3570 Callee = LowerGlobalAddress(Callee, DAG);
3571 else if (isa<ExternalSymbolSDNode>(Callee))
3572 Callee = LowerExternalSymbol(Callee, DAG);
3576 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3577 // From AMD64 ABI document:
3578 // For calls that may call functions that use varargs or stdargs
3579 // (prototype-less calls or calls to functions containing ellipsis (...) in
3580 // the declaration) %al is used as hidden argument to specify the number
3581 // of SSE registers used. The contents of %al do not need to match exactly
3582 // the number of registers, but must be an upper bound on the number of SSE
3583 // registers used and is in the range 0 - 8 inclusive.
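// For example, a varargs call that passes one double in XMM0 sets AL to 1,
// and a call that passes no FP/vector arguments sets AL to 0 (this code uses
// the exact count of XMM argument registers allocated below).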
3585 // Count the number of XMM registers allocated.
3586 static const MCPhysReg XMMArgRegs[] = {
3587 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3588 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3590 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3591 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3592 && "SSE registers cannot be used when SSE is disabled");
3594 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3595 DAG.getConstant(NumXMMRegs, dl,
3599 if (isVarArg && IsMustTail) {
3600 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3601 for (const auto &F : Forwards) {
3602 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3603 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3607 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3608 // don't need this because the eligibility check rejects calls that require
3609 // shuffling arguments passed in memory.
3610 if (!IsSibcall && isTailCall) {
3611 // Force all the incoming stack arguments to be loaded from the stack
3612 // before any new outgoing arguments are stored to the stack, because the
3613 // outgoing stack slots may alias the incoming argument stack slots, and
3614 // the alias isn't otherwise explicit. This is slightly more conservative
3615 // than necessary, because it means that each store effectively depends
3616 // on every argument instead of just those arguments it would clobber.
3617 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3619 SmallVector<SDValue, 8> MemOpChains2;
3622 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3624 CCValAssign &VA = ArgLocs[I];
3626 if (VA.isRegLoc()) {
3627 if (VA.needsCustom()) {
3628 assert((CallConv == CallingConv::X86_RegCall) &&
3629 "Expecting custom case only in regcall calling convention");
3630 // This means that we are in the special case where one argument was
3631 // passed through two register locations; skip the next location.
3638 assert(VA.isMemLoc());
3639 SDValue Arg = OutVals[OutsIndex];
3640 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3641 // Skip inalloca arguments. They don't require any work.
3642 if (Flags.isInAlloca())
3644 // Create frame index.
3645 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3646 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3647 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3648 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3650 if (Flags.isByVal()) {
3651 // Copy relative to framepointer.
3652 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3653 if (!StackPtr.getNode())
3654 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3655 getPointerTy(DAG.getDataLayout()));
3656 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3659 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3663 // Store relative to framepointer.
3664 MemOpChains2.push_back(DAG.getStore(
3665 ArgChain, dl, Arg, FIN,
3666 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3670 if (!MemOpChains2.empty())
3671 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3673 // Store the return address to the appropriate stack slot.
3674 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3675 getPointerTy(DAG.getDataLayout()),
3676 RegInfo->getSlotSize(), FPDiff, dl);
3679 // Build a sequence of copy-to-reg nodes chained together with token chain
3680 // and flag operands which copy the outgoing args into registers.
3682 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3683 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3684 RegsToPass[i].second, InFlag);
3685 InFlag = Chain.getValue(1);
3688 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3689 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3690 // In the 64-bit large code model, we have to make all calls
3691 // through a register, since the call instruction's 32-bit
3692 // pc-relative offset may not be large enough to hold the whole
     // address.
3694 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3695 // If the callee is a GlobalAddress node (quite common, every direct call
3696 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
     // it.
3698 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3700 // We should use extra load for direct calls to dllimported functions in
3702 const GlobalValue *GV = G->getGlobal();
3703 if (!GV->hasDLLImportStorageClass()) {
3704 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3706 Callee = DAG.getTargetGlobalAddress(
3707 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3709 if (OpFlags == X86II::MO_GOTPCREL) {
3711 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3712 getPointerTy(DAG.getDataLayout()), Callee);
3713 // Add extra indirection
3714 Callee = DAG.getLoad(
3715 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3716 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3719 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3720 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3721 unsigned char OpFlags =
3722 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3724 Callee = DAG.getTargetExternalSymbol(
3725 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3726 } else if (Subtarget.isTarget64BitILP32() &&
3727 Callee->getValueType(0) == MVT::i32) {
3728 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3729 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3732 // Returns a chain & a flag for retval copy to use.
3733 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3734 SmallVector<SDValue, 8> Ops;
3736 if (!IsSibcall && isTailCall) {
3737 Chain = DAG.getCALLSEQ_END(Chain,
3738 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3739 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3740 InFlag = Chain.getValue(1);
3743 Ops.push_back(Chain);
3744 Ops.push_back(Callee);
3747 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3749 // Add argument registers to the end of the list so that they are known live
3751 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3752 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3753 RegsToPass[i].second.getValueType()));
3755 // Add a register mask operand representing the call-preserved registers.
3756 // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists) then we
3757 // use the X86_INTR calling convention because it has the same CSR mask
3758 // (same preserved registers).
3759 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3760 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3761 assert(Mask && "Missing call preserved mask for calling convention");
3763 // If this is an invoke in a 32-bit function using a funclet-based
3764 // personality, assume the function clobbers all registers. If an exception
3765 // is thrown, the runtime will not restore CSRs.
3766 // FIXME: Model this more precisely so that we can register allocate across
3767 // the normal edge and spill and fill across the exceptional edge.
3768 if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
3769 const Function *CallerFn = MF.getFunction();
3770 EHPersonality Pers =
3771 CallerFn->hasPersonalityFn()
3772 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3773 : EHPersonality::Unknown;
3774 if (isFuncletEHPersonality(Pers))
3775 Mask = RegInfo->getNoPreservedMask();
3778 // Define a new register mask from the existing mask.
3779 uint32_t *RegMask = nullptr;
3781 // In some calling conventions we need to remove the used physical registers
3782 // from the reg mask.
3783 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3784 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3786 // Allocate a new Reg Mask and copy Mask.
3787 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3788 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3789 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3791 // Make sure all sub registers of the argument registers are reset
     // in the RegMask.
3793 for (auto const &RegPair : RegsToPass)
3794 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3795 SubRegs.isValid(); ++SubRegs)
3796 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3798 // Create the RegMask Operand according to our updated mask.
3799 Ops.push_back(DAG.getRegisterMask(RegMask));
3801 // Create the RegMask Operand according to the static mask.
3802 Ops.push_back(DAG.getRegisterMask(Mask));
3805 if (InFlag.getNode())
3806 Ops.push_back(InFlag);
3810 //// If this is the first return lowered for this function, add the regs
3811 //// to the liveout set for the function.
3812 // This isn't right, although it's probably harmless on x86; liveouts
3813 // should be computed from returns not tail calls. Consider a void
3814 // function making a tail call to a function returning int.
3815 MF.getFrameInfo().setHasTailCall();
3816 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3819 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3820 InFlag = Chain.getValue(1);
3822 // Create the CALLSEQ_END node.
3823 unsigned NumBytesForCalleeToPop;
3824 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3825 DAG.getTarget().Options.GuaranteedTailCallOpt))
3826 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3827 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3828 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3829 SR == StackStructReturn)
3830 // If this is a call to a struct-return function, the callee
3831 // pops the hidden struct pointer, so we have to push it back.
3832 // This is common for Darwin/X86, Linux & Mingw32 targets.
3833 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3834 NumBytesForCalleeToPop = 4;
3836 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3838 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3839 // No need to reset the stack after the call if the call doesn't return. To
3840 // keep the MachineInstr verifier happy, we'll pretend the callee does it for us.
3841 NumBytesForCalleeToPop = NumBytes;
3844 // Returns a flag for retval copy to use.
3846 Chain = DAG.getCALLSEQ_END(Chain,
3847 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3848 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3851 InFlag = Chain.getValue(1);
3854 // Handle result values, copying them out of physregs into vregs that we
     // return.
3856 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3860 //===----------------------------------------------------------------------===//
3861 // Fast Calling Convention (tail call) implementation
3862 //===----------------------------------------------------------------------===//
3864 // Like stdcall, the callee cleans up the arguments, except that ECX is
3865 // reserved for storing the address of the tail-called function. Only 2
3866 // registers are free for argument passing (inreg). Tail call optimization is
     // performed provided:
3868 // * tailcallopt is enabled
3869 // * caller/callee are fastcc
3870 // On X86_64 architecture with GOT-style position independent code only local
3871 // (within module) calls are supported at the moment.
3872 // To keep the stack aligned according to the platform ABI, the function
3873 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3874 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3875 // If a tail-called function (callee) has more arguments than the caller, the
3876 // caller needs to make sure that there is room to move the RETADDR to. This is
3877 // achieved by reserving an area the size of the argument delta right after the
3878 // original RETADDR, but before the saved framepointer or the spilled registers
3879 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
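// An illustrative stack layout for that case (a sketch; the exact
// saved-register slots depend on the target frame lowering):
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR move area, sized to the argument delta ]
//   (possible saved frame pointer)
//   callee-saved registers
//   locals ...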
3891 /// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
     /// requirement (with a 4-byte return-address slot).
3894 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3895 SelectionDAG& DAG) const {
3896 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3897 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3898 unsigned StackAlignment = TFI.getStackAlignment();
3899 uint64_t AlignMask = StackAlignment - 1;
3900 int64_t Offset = StackSize;
3901 unsigned SlotSize = RegInfo->getSlotSize();
3902 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3903 // Number smaller than 12 so just add the difference.
3904 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3906 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3907 Offset = ((~AlignMask) & Offset) + StackAlignment +
3908 (StackAlignment-SlotSize);
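    // Worked example with StackAlignment = 16 and SlotSize = 8:
    //   StackSize = 20 -> 20 & 15 = 4  <= 8 -> 20 + (8 - 4)        = 24 (16n + 8)
    //   StackSize = 28 -> 28 & 15 = 12 >  8 -> (28 & ~15) + 16 + 8 = 40 (16n + 8)
    // Either way the result plus the return-address slot is 16-byte aligned.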
3913 /// Return true if the given stack call argument is already available in the
3914 /// same position (relatively) of the caller's incoming argument stack.
3916 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3917 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3918 const X86InstrInfo *TII, const CCValAssign &VA) {
3919 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3922 // Look through nodes that don't alter the bits of the incoming value.
3923 unsigned Op = Arg.getOpcode();
3924 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3925 Arg = Arg.getOperand(0);
3928 if (Op == ISD::TRUNCATE) {
3929 const SDValue &TruncInput = Arg.getOperand(0);
3930 if (TruncInput.getOpcode() == ISD::AssertZext &&
3931 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3932 Arg.getValueType()) {
3933 Arg = TruncInput.getOperand(0);
3941 if (Arg.getOpcode() == ISD::CopyFromReg) {
3942 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3943 if (!TargetRegisterInfo::isVirtualRegister(VR))
3945 MachineInstr *Def = MRI->getVRegDef(VR);
3948 if (!Flags.isByVal()) {
3949 if (!TII->isLoadFromStackSlot(*Def, FI))
3952 unsigned Opcode = Def->getOpcode();
3953 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3954 Opcode == X86::LEA64_32r) &&
3955 Def->getOperand(1).isFI()) {
3956 FI = Def->getOperand(1).getIndex();
3957 Bytes = Flags.getByValSize();
3961 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3962 if (Flags.isByVal())
3963 // ByVal argument is passed in as a pointer but it's now being
3964 // dereferenced. e.g.
3965 // define @foo(%struct.X* %A) {
3966 // tail call @bar(%struct.X* byval %A)
3969 SDValue Ptr = Ld->getBasePtr();
3970 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3973 FI = FINode->getIndex();
3974 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3975 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3976 FI = FINode->getIndex();
3977 Bytes = Flags.getByValSize();
3981 assert(FI != INT_MAX);
3982 if (!MFI.isFixedObjectIndex(FI))
3985 if (Offset != MFI.getObjectOffset(FI))
3988 // If this is not byval, check that the argument stack object is immutable.
3989 // inalloca and argument copy elision can create mutable argument stack
3990 // objects. Byval objects can be mutated, but a byval call intends to pass the
     // mutated memory.
3992 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
3995 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3996 // If the argument location is wider than the argument type, check that any
3997 // extension flags match.
3998 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3999 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4004 return Bytes == MFI.getObjectSize(FI);
4007 /// Check whether the call is eligible for tail call optimization. Targets
4008 /// that want to do tail call optimization should implement this function.
4009 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4010 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4011 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4012 const SmallVectorImpl<ISD::OutputArg> &Outs,
4013 const SmallVectorImpl<SDValue> &OutVals,
4014 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4015 if (!mayTailCallThisCC(CalleeCC))
4018 // If -tailcallopt is specified, make fastcc functions tail-callable.
4019 MachineFunction &MF = DAG.getMachineFunction();
4020 const Function *CallerF = MF.getFunction();
4022 // If the function return type is x86_fp80 and the callee return type is not,
4023 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4024 // perform a tailcall optimization here.
4025 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4028 CallingConv::ID CallerCC = CallerF->getCallingConv();
4029 bool CCMatch = CallerCC == CalleeCC;
4030 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4031 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4033 // Win64 functions have extra shadow space for argument homing. Don't do the
4034 // sibcall if the caller and callee have mismatched expectations for this
     // space.
4036 if (IsCalleeWin64 != IsCallerWin64)
4039 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4040 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4045 // Look for obvious safe cases to perform tail call optimization that do not
4046 // require ABI changes. This is what gcc calls sibcall.
4048 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4049 // emit a special epilogue.
4050 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4051 if (RegInfo->needsStackRealignment(MF))
4054 // Also avoid sibcall optimization if either caller or callee uses struct
4055 // return semantics.
4056 if (isCalleeStructRet || isCallerStructRet)
4059 // Do not sibcall optimize vararg calls unless all arguments are passed via
     // registers.
4061 LLVMContext &C = *DAG.getContext();
4062 if (isVarArg && !Outs.empty()) {
4063 // Optimizing for varargs on Win64 is unlikely to be safe without
4064 // additional testing.
4065 if (IsCalleeWin64 || IsCallerWin64)
4068 SmallVector<CCValAssign, 16> ArgLocs;
4069 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4071 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4072 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4073 if (!ArgLocs[i].isRegLoc())
4077 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4078 // stack. Therefore, if it's not used by the call it is not safe to optimize
4079 // this into a sibcall.
4080 bool Unused = false;
4081 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4088 SmallVector<CCValAssign, 16> RVLocs;
4089 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4090 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4091 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4092 CCValAssign &VA = RVLocs[i];
4093 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4098 // Check that the call results are passed in the same way.
4099 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4100 RetCC_X86, RetCC_X86))
4102 // The callee has to preserve all registers the caller needs to preserve.
4103 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4104 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4106 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4107 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4111 unsigned StackArgsSize = 0;
4113 // If the callee takes no arguments then go on to check the results of the
     // call.
4115 if (!Outs.empty()) {
4116 // Check if stack adjustment is needed. For now, do not do this if any
4117 // argument is passed on the stack.
4118 SmallVector<CCValAssign, 16> ArgLocs;
4119 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4121 // Allocate shadow area for Win64
4123 CCInfo.AllocateStack(32, 8);
4125 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4126 StackArgsSize = CCInfo.getNextStackOffset();
4128 if (CCInfo.getNextStackOffset()) {
4129 // Check if the arguments are already laid out in the right way as
4130 // the caller's fixed stack objects.
4131 MachineFrameInfo &MFI = MF.getFrameInfo();
4132 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4133 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4134 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4135 CCValAssign &VA = ArgLocs[i];
4136 SDValue Arg = OutVals[i];
4137 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4138 if (VA.getLocInfo() == CCValAssign::Indirect)
4140 if (!VA.isRegLoc()) {
4141 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4148 bool PositionIndependent = isPositionIndependent();
4149 // If the tailcall address may be in a register, then make sure it's
4150 // possible to register allocate for it. In 32-bit, the call address can
4151 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4152 // callee-saved registers are restored. These happen to be the same
4153 // registers used to pass 'inreg' arguments so watch out for those.
4154 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4155 !isa<ExternalSymbolSDNode>(Callee)) ||
4156 PositionIndependent)) {
4157 unsigned NumInRegs = 0;
4158 // In PIC we need an extra register to formulate the address computation
4160 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4162 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4163 CCValAssign &VA = ArgLocs[i];
4166 unsigned Reg = VA.getLocReg();
4169 case X86::EAX: case X86::EDX: case X86::ECX:
4170 if (++NumInRegs == MaxInRegs)
4177 const MachineRegisterInfo &MRI = MF.getRegInfo();
4178 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4182 bool CalleeWillPop =
4183 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4184 MF.getTarget().Options.GuaranteedTailCallOpt);
4186 if (unsigned BytesToPop =
4187 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4188 // If we have bytes to pop, the callee must pop them.
4189 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4190 if (!CalleePopMatches)
4192 } else if (CalleeWillPop && StackArgsSize > 0) {
4193 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4201 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4202 const TargetLibraryInfo *libInfo) const {
4203 return X86::createFastISel(funcInfo, libInfo);
4206 //===----------------------------------------------------------------------===//
4207 // Other Lowering Hooks
4208 //===----------------------------------------------------------------------===//
4210 static bool MayFoldLoad(SDValue Op) {
4211 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4214 static bool MayFoldIntoStore(SDValue Op) {
4215 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4218 static bool MayFoldIntoZeroExtend(SDValue Op) {
4219 if (Op.hasOneUse()) {
4220 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4221 return (ISD::ZERO_EXTEND == Opcode);
4226 static bool isTargetShuffle(unsigned Opcode) {
4228 default: return false;
4229 case X86ISD::BLENDI:
4230 case X86ISD::PSHUFB:
4231 case X86ISD::PSHUFD:
4232 case X86ISD::PSHUFHW:
4233 case X86ISD::PSHUFLW:
4235 case X86ISD::INSERTPS:
4236 case X86ISD::EXTRQI:
4237 case X86ISD::INSERTQI:
4238 case X86ISD::PALIGNR:
4239 case X86ISD::VSHLDQ:
4240 case X86ISD::VSRLDQ:
4241 case X86ISD::MOVLHPS:
4242 case X86ISD::MOVHLPS:
4243 case X86ISD::MOVLPS:
4244 case X86ISD::MOVLPD:
4245 case X86ISD::MOVSHDUP:
4246 case X86ISD::MOVSLDUP:
4247 case X86ISD::MOVDDUP:
4250 case X86ISD::UNPCKL:
4251 case X86ISD::UNPCKH:
4252 case X86ISD::VBROADCAST:
4253 case X86ISD::VPERMILPI:
4254 case X86ISD::VPERMILPV:
4255 case X86ISD::VPERM2X128:
4256 case X86ISD::VPERMIL2:
4257 case X86ISD::VPERMI:
4258 case X86ISD::VPPERM:
4259 case X86ISD::VPERMV:
4260 case X86ISD::VPERMV3:
4261 case X86ISD::VPERMIV3:
4262 case X86ISD::VZEXT_MOVL:
4267 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4269 default: return false;
4271 case X86ISD::PSHUFB:
4272 case X86ISD::VPERMILPV:
4273 case X86ISD::VPERMIL2:
4274 case X86ISD::VPPERM:
4275 case X86ISD::VPERMV:
4276 case X86ISD::VPERMV3:
4277 case X86ISD::VPERMIV3:
4279 // 'Faux' Target Shuffles.
4286 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4287 MachineFunction &MF = DAG.getMachineFunction();
4288 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4289 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4290 int ReturnAddrIndex = FuncInfo->getRAIndex();
4292 if (ReturnAddrIndex == 0) {
4293 // Set up a frame object for the return address.
4294 unsigned SlotSize = RegInfo->getSlotSize();
4295 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4298 FuncInfo->setRAIndex(ReturnAddrIndex);
4301 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4304 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4305 bool hasSymbolicDisplacement) {
4306 // The offset should fit into a 32-bit immediate field.
4307 if (!isInt<32>(Offset))
4310 // If we don't have a symbolic displacement, we don't have any extra
     // restrictions.
4312 if (!hasSymbolicDisplacement)
4315 // FIXME: Some tweaks might be needed for medium code model.
4316 if (M != CodeModel::Small && M != CodeModel::Kernel)
4319 // For the small code model, we assume that the last object is within 16MB of
4320 // the end of the 31-bit address boundary. We may also accept pretty large
4321 // negative constants, knowing that all objects are in the positive half of the
     // address space.
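// For example, an offset of 0x00F00000 (< 16 MiB) is accepted here, as is a
// large negative offset such as -0x40000000.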
4322 if (M == CodeModel::Small && Offset < 16*1024*1024)
4325 // For the kernel code model, we know that all objects reside in the negative
4326 // half of the 32-bit address space. We may not accept negative offsets, since
4327 // they may be just off, but we may accept pretty large positive ones.
4328 if (M == CodeModel::Kernel && Offset >= 0)
4334 /// Determines whether the callee is required to pop its own arguments.
4335 /// Callee pop is necessary to support tail calls.
4336 bool X86::isCalleePop(CallingConv::ID CallingConv,
4337 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4338 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4339 // can guarantee TCO.
4340 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4343 switch (CallingConv) {
4346 case CallingConv::X86_StdCall:
4347 case CallingConv::X86_FastCall:
4348 case CallingConv::X86_ThisCall:
4349 case CallingConv::X86_VectorCall:
4354 /// \brief Return true if the condition is an unsigned comparison operation.
4355 static bool isX86CCUnsigned(unsigned X86CC) {
4358 llvm_unreachable("Invalid integer condition!");
4374 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4375 switch (SetCCOpcode) {
4376 default: llvm_unreachable("Invalid integer condition!");
4377 case ISD::SETEQ: return X86::COND_E;
4378 case ISD::SETGT: return X86::COND_G;
4379 case ISD::SETGE: return X86::COND_GE;
4380 case ISD::SETLT: return X86::COND_L;
4381 case ISD::SETLE: return X86::COND_LE;
4382 case ISD::SETNE: return X86::COND_NE;
4383 case ISD::SETULT: return X86::COND_B;
4384 case ISD::SETUGT: return X86::COND_A;
4385 case ISD::SETULE: return X86::COND_BE;
4386 case ISD::SETUGE: return X86::COND_AE;
4390 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4391 /// condition code, returning the condition code and the LHS/RHS of the
4392 /// comparison to make.
4393 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4394 bool isFP, SDValue &LHS, SDValue &RHS,
4395 SelectionDAG &DAG) {
4397 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4398 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4400       // X > -1 -> compare X against 0 and jump if the sign flag is clear.
4400 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4401 return X86::COND_NS;
4403 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4404       // X < 0 -> compare X against 0 and jump if the sign flag is set.
4407     if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4408       // X < 1 -> X <= 0, so compare against 0 with a signed less-or-equal.
4409 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4410 return X86::COND_LE;
4414 return TranslateIntegerX86CC(SetCCOpcode);
4417 // First determine if it is required or is profitable to flip the operands.
4419 // If LHS is a foldable load, but RHS is not, flip the condition.
4420 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4421 !ISD::isNON_EXTLoad(RHS.getNode())) {
4422 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4423 std::swap(LHS, RHS);
4426 switch (SetCCOpcode) {
4432 std::swap(LHS, RHS);
4436   // On a floating point condition, the flags are set as follows:
4437   //  ZF  PF  CF   op
4438 // 0 | 0 | 0 | X > Y
4439 // 0 | 0 | 1 | X < Y
4440 // 1 | 0 | 0 | X == Y
4441 // 1 | 1 | 1 | unordered
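  // For example, an ordered SETGT can use COND_A because X > Y leaves ZF, PF and
  // CF all clear, and SETOLT reaches the same COND_A below after the operands
  // were swapped above (see the '// flipped' cases).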
4442 switch (SetCCOpcode) {
4443 default: llvm_unreachable("Condcode should be pre-legalized away");
4445 case ISD::SETEQ: return X86::COND_E;
4446 case ISD::SETOLT: // flipped
4448 case ISD::SETGT: return X86::COND_A;
4449 case ISD::SETOLE: // flipped
4451 case ISD::SETGE: return X86::COND_AE;
4452 case ISD::SETUGT: // flipped
4454 case ISD::SETLT: return X86::COND_B;
4455 case ISD::SETUGE: // flipped
4457 case ISD::SETLE: return X86::COND_BE;
4459 case ISD::SETNE: return X86::COND_NE;
4460 case ISD::SETUO: return X86::COND_P;
4461 case ISD::SETO: return X86::COND_NP;
4463 case ISD::SETUNE: return X86::COND_INVALID;
4467 /// Is there a floating-point cmov for the specific X86 condition code?
4468 /// The current x86 ISA includes the following FP cmov instructions:
4469 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4470 static bool hasFPCMov(unsigned X86CC) {
4487 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4489 unsigned Intrinsic) const {
4491 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4495 Info.opc = ISD::INTRINSIC_W_CHAIN;
4496 Info.readMem = false;
4497 Info.writeMem = false;
4501 switch (IntrData->Type) {
4502 case EXPAND_FROM_MEM: {
4503 Info.ptrVal = I.getArgOperand(0);
4504 Info.memVT = MVT::getVT(I.getType());
4506 Info.readMem = true;
4509 case COMPRESS_TO_MEM: {
4510 Info.ptrVal = I.getArgOperand(0);
4511 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4513 Info.writeMem = true;
4516 case TRUNCATE_TO_MEM_VI8:
4517 case TRUNCATE_TO_MEM_VI16:
4518 case TRUNCATE_TO_MEM_VI32: {
4519 Info.ptrVal = I.getArgOperand(0);
4520 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4521 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4522 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4524 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4525 ScalarVT = MVT::i16;
4526 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4527 ScalarVT = MVT::i32;
4529 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4531 Info.writeMem = true;
4541 /// Returns true if the target can instruction select the
4542 /// specified FP immediate natively. If false, the legalizer will
4543 /// materialize the FP immediate as a load from a constant pool.
4544 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4545 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4546 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4552 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4553 ISD::LoadExtType ExtTy,
4555 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4556 // relocation target a movq or addq instruction: don't let the load shrink.
4557 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4558 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4559 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4560 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4564 /// \brief Returns true if it is beneficial to convert a load of a constant
4565 /// to just the constant itself.
4566 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4568 assert(Ty->isIntegerTy());
4570 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4571 if (BitSize == 0 || BitSize > 64)
4576 bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
4577 // TODO: It might be a win to ease or lift this restriction, but the generic
4578 // folds in DAGCombiner conflict with vector folds for an AVX512 target.
4579 if (VT.isVector() && Subtarget.hasAVX512())
4585 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
4586 unsigned Index) const {
4587 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4590 // Mask vectors support all subregister combinations and operations that
4591 // extract half of vector.
4592 if (ResVT.getVectorElementType() == MVT::i1)
4593 return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
4594 (Index == ResVT.getVectorNumElements()));
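  // Otherwise extraction is cheap only at a chunk-aligned index; e.g. taking a
  // v4f32 out of a v8f32 at index 0 or 4 is cheap, but index 2 is not.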
4596 return (Index % ResVT.getVectorNumElements()) == 0;
4599 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4600 // Speculate cttz only if we can directly use TZCNT.
4601 return Subtarget.hasBMI();
4604 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4605 // Speculate ctlz only if we can directly use LZCNT.
4606 return Subtarget.hasLZCNT();
4609 bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
4610 const SelectionDAG &DAG) const {
4611   // Do not merge to float value size (128 bits) if no implicit
4612   // float attribute is set.
4613 bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute(
4614 Attribute::NoImplicitFloat);
4617 unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
4618 return (MemVT.getSizeInBits() <= MaxIntSize);
4623 bool X86TargetLowering::isCtlzFast() const {
4624 return Subtarget.hasFastLZCNT();
4627 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4628 const Instruction &AndI) const {
4632 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4633 if (!Subtarget.hasBMI())
4636 // There are only 32-bit and 64-bit forms for 'andn'.
4637 EVT VT = Y.getValueType();
4638 if (VT != MVT::i32 && VT != MVT::i64)
4644 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4645 MVT VT = MVT::getIntegerVT(NumBits);
4646 if (isTypeLegal(VT))
4649 // PMOVMSKB can handle this.
4650 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4653 // VPMOVMSKB can handle this.
4654 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4657 // TODO: Allow 64-bit type for 32-bit target.
4658 // TODO: 512-bit types should be allowed, but make sure that those
4659 // cases are handled in combineVectorSizedSetCCEquality().
4661 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4664 /// Val is the undef sentinel value or equal to the specified value.
4665 static bool isUndefOrEqual(int Val, int CmpVal) {
4666 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4669 /// Val is either the undef or zero sentinel value.
4670 static bool isUndefOrZero(int Val) {
4671 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4674 /// Return true if every element in Mask, beginning
4675 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4676 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4677 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4678 if (Mask[i] != SM_SentinelUndef)
4683 /// Return true if Val is undef or if its value falls within the
4684 /// specified range [Low, Hi).
4685 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4686 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4689 /// Return true if every element in Mask is undef or if its value
4690 /// falls within the specified range [Low, Hi).
4691 static bool isUndefOrInRange(ArrayRef<int> Mask,
4694 if (!isUndefOrInRange(M, Low, Hi))
4699 /// Return true if Val is undef, zero or if its value falls within the
4700 /// specified range [Low, Hi).
4701 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4702 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4705 /// Return true if every element in Mask is undef, zero or if its value
4706 /// falls within the specified range [Low, Hi).
4707 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4709 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4714 /// Return true if every element in Mask, beginning
4715 /// from position Pos and ending in Pos+Size, falls within the specified
4716 /// sequential range [Low, Low+Size), or is undef.
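/// For example, Mask = <4, -1, 6, 7> with Pos = 0, Size = 4 and Low = 4 is
/// sequential-or-undef, while <4, 5, 7, 6> is not.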
4717 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4718 unsigned Pos, unsigned Size, int Low) {
4719 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4720 if (!isUndefOrEqual(Mask[i], Low))
4725 /// Return true if every element in Mask, beginning
4726 /// from position Pos and ending in Pos+Size, falls within the specified
4727 /// sequential range [Low, Low+Size), or is undef or is zero.
4728 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4729 unsigned Size, int Low) {
4730 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4731 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4736 /// Return true if every element in Mask, beginning
4737 /// from position Pos and ending in Pos+Size is undef or is zero.
4738 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4740 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4741 if (!isUndefOrZero(Mask[i]))
4746 /// \brief Helper function to test whether a shuffle mask could be
4747 /// simplified by widening the elements being shuffled.
4749 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4750 /// leaves it in an unspecified state.
4752 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4753 /// shuffle masks. The latter have the special property of a '-2' representing
4754 /// a zeroed lane of a vector.
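/// For example, the v4i32 mask <0, 1, 6, 7> widens to the v2i64 mask <0, 3>,
/// and <-1, 3, 4, 5> widens to <1, 2>.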
4755 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4756 SmallVectorImpl<int> &WidenedMask) {
4757 WidenedMask.assign(Mask.size() / 2, 0);
4758 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4760 int M1 = Mask[i + 1];
4763     // If both elements are undef, it's trivial.
4763 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4764 WidenedMask[i / 2] = SM_SentinelUndef;
4768 // Check for an undef mask and a mask value properly aligned to fit with
4769 // a pair of values. If we find such a case, use the non-undef mask's value.
4770 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4771 WidenedMask[i / 2] = M1 / 2;
4774 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4775 WidenedMask[i / 2] = M0 / 2;
4779 // When zeroing, we need to spread the zeroing across both lanes to widen.
4780 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4781 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4782 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4783 WidenedMask[i / 2] = SM_SentinelZero;
4789     // Finally check if the two mask values are adjacent and aligned to form a pair.
4791 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4792 WidenedMask[i / 2] = M0 / 2;
4796 // Otherwise we can't safely widen the elements used in this shuffle.
4799 assert(WidenedMask.size() == Mask.size() / 2 &&
4800 "Incorrect size of mask after widening the elements!");
4805 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4806 bool X86::isZeroNode(SDValue Elt) {
4807 return isNullConstant(Elt) || isNullFPConstant(Elt);
4810 // Build a vector of constants.
4811 // Use an UNDEF node if MaskElt == -1.
4812 // Split 64-bit constants in 32-bit mode.
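// For example, in 32-bit mode the v2i64 mask constant <1, 3> is built as the
// v4i32 vector <1, 0, 3, 0> and then bitcast back to v2i64.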
4813 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4814 const SDLoc &dl, bool IsMask = false) {
4816 SmallVector<SDValue, 32> Ops;
4819 MVT ConstVecVT = VT;
4820 unsigned NumElts = VT.getVectorNumElements();
4821 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4822 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4823 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4827 MVT EltVT = ConstVecVT.getVectorElementType();
4828 for (unsigned i = 0; i < NumElts; ++i) {
4829 bool IsUndef = Values[i] < 0 && IsMask;
4830 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4831 DAG.getConstant(Values[i], dl, EltVT);
4832 Ops.push_back(OpNode);
4834 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4835 DAG.getConstant(0, dl, EltVT));
4837 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4839 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4843 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4844 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4845 assert(Bits.size() == Undefs.getBitWidth() &&
4846 "Unequal constant and undef arrays");
4847 SmallVector<SDValue, 32> Ops;
4850 MVT ConstVecVT = VT;
4851 unsigned NumElts = VT.getVectorNumElements();
4852 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4853 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4854 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4858 MVT EltVT = ConstVecVT.getVectorElementType();
4859 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4861 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4864 const APInt &V = Bits[i];
4865 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4867 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4868 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4869 } else if (EltVT == MVT::f32) {
4870 APFloat FV(APFloat::IEEEsingle(), V);
4871 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4872 } else if (EltVT == MVT::f64) {
4873 APFloat FV(APFloat::IEEEdouble(), V);
4874 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4876 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4880 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4881 return DAG.getBitcast(VT, ConstsNode);
4884 /// Returns a vector of specified type with all zero elements.
4885 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4886 SelectionDAG &DAG, const SDLoc &dl) {
4887 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4888 VT.getVectorElementType() == MVT::i1) &&
4889 "Unexpected vector type");
4891 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4892 // type. This ensures they get CSE'd. But if the integer type is not
4893 // available, use a floating-point +0.0 instead.
4895 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4896 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4897 } else if (VT.getVectorElementType() == MVT::i1) {
4898 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4899 "Unexpected vector type");
4900 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4901 "Unexpected vector type");
4902 Vec = DAG.getConstant(0, dl, VT);
4904 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4905 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4907 return DAG.getBitcast(VT, Vec);
4910 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4911 const SDLoc &dl, unsigned vectorWidth) {
4912 EVT VT = Vec.getValueType();
4913 EVT ElVT = VT.getVectorElementType();
4914 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4915 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4916 VT.getVectorNumElements()/Factor);
4918   // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR node.
4919 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4920 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4922 // This is the index of the first element of the vectorWidth-bit chunk
4923 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
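  // For example, extracting 128 bits from a v8i32 source gives ElemsPerChunk = 4,
  // so an IdxVal of 5 is rounded down to 4, the first element of the upper half.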
4924 IdxVal &= ~(ElemsPerChunk - 1);
4926 // If the input is a buildvector just emit a smaller one.
4927 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4928 return DAG.getBuildVector(ResultVT, dl,
4929 Vec->ops().slice(IdxVal, ElemsPerChunk));
4931 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4932 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4935 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4936 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4937 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4938 /// instructions or a simple subregister reference. Idx is an index in the
4939 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4940 /// lowering EXTRACT_VECTOR_ELT operations easier.
4941 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4942 SelectionDAG &DAG, const SDLoc &dl) {
4943 assert((Vec.getValueType().is256BitVector() ||
4944 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4945 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4948 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4949 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4950 SelectionDAG &DAG, const SDLoc &dl) {
4951 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4952 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4955 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4956 SelectionDAG &DAG, const SDLoc &dl,
4957 unsigned vectorWidth) {
4958 assert((vectorWidth == 128 || vectorWidth == 256) &&
4959 "Unsupported vector width");
4960   // Inserting an UNDEF subvector simply returns Result.
4963 EVT VT = Vec.getValueType();
4964 EVT ElVT = VT.getVectorElementType();
4965 EVT ResultVT = Result.getValueType();
4967 // Insert the relevant vectorWidth bits.
4968 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4969 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4971 // This is the index of the first element of the vectorWidth-bit chunk
4972 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4973 IdxVal &= ~(ElemsPerChunk - 1);
4975 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4976 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4979 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4980 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4981 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4982 /// simple superregister reference. Idx is an index in the 128 bits
4983 /// we want. It need not be aligned to a 128-bit boundary. That makes
4984 /// lowering INSERT_VECTOR_ELT operations easier.
4985 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4986 SelectionDAG &DAG, const SDLoc &dl) {
4987 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4988 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4991 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4992 SelectionDAG &DAG, const SDLoc &dl) {
4993 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4994 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4997 // Return true if the instruction zeroes the unused upper part of the
4998 // destination and accepts a mask.
4999 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5004 case X86ISD::TESTNM:
5005 case X86ISD::PCMPEQM:
5006 case X86ISD::PCMPGTM:
5009 case X86ISD::CMPM_RND:
5014 /// Insert i1-subvector to i1-vector.
5015 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5016 const X86Subtarget &Subtarget) {
5019 SDValue Vec = Op.getOperand(0);
5020 SDValue SubVec = Op.getOperand(1);
5021 SDValue Idx = Op.getOperand(2);
5023 if (!isa<ConstantSDNode>(Idx))
5026 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5027 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5030 MVT OpVT = Op.getSimpleValueType();
5031 MVT SubVecVT = SubVec.getSimpleValueType();
5032 unsigned NumElems = OpVT.getVectorNumElements();
5033 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5035 assert(IdxVal + SubVecNumElems <= NumElems &&
5036 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5037 "Unexpected index value in INSERT_SUBVECTOR");
5039 // There are 3 possible cases:
5040 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5041 // 2. Subvector should be inserted in the upper part
5042 // (IdxVal + SubVecNumElems == NumElems)
5043 // 3. Subvector should be inserted in the middle (for example v2i1
5044 // to v16i1, index 2)
5046   // If this node widens - by concatenating zeroes - the result of a node whose
5047   // instruction already zeroes all upper (irrelevant) bits of the output
5048   // register, mark this node as legal so that it can be replaced with the v8i1
5049   // version of the previous instruction during instruction selection.
5050   // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a
5051   // k-register while zeroing the remaining upper 60 bits of the register. If the
5052   // result of such an instruction is inserted into an all-zero vector, then we
5053   // can safely remove the INSERT_SUBVECTOR (in instruction selection), as the
5054   // compare instruction has already zeroed the rest of the register.
5055 if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5056 (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5057 (SubVec.getOpcode() == ISD::AND &&
5058 (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5059 isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5062   // Extend to a natively supported kshift width.
5063 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5064 MVT WideOpVT = OpVT;
5065 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5068 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5069 SDValue Undef = DAG.getUNDEF(WideOpVT);
5070 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5071 Undef, SubVec, ZeroIdx);
5073   // Extract the sub-vector if required.
5074 auto ExtractSubVec = [&](SDValue V) {
5075 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5079 if (Vec.isUndef()) {
5081 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5082 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5085 return ExtractSubVec(WideSubVec);
5088 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5089 NumElems = WideOpVT.getVectorNumElements();
5090 unsigned ShiftLeft = NumElems - SubVecNumElems;
5091 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5092 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5093 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5094 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5095 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5096 return ExtractSubVec(Vec);
5100 // Zero lower bits of the Vec
5101 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5102 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5103 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5104 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5105   // Merge them together; SubVec should be zero-extended.
5106 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5107 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5109 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5110 return ExtractSubVec(Vec);
5113 // Simple case when we put subvector in the upper part
5114 if (IdxVal + SubVecNumElems == NumElems) {
5115 // Zero upper bits of the Vec
5116 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5117 DAG.getConstant(IdxVal, dl, MVT::i8));
5118 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5119 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5120 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5121 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5122 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5123 return ExtractSubVec(Vec);
5125 // Subvector should be inserted in the middle - use shuffle
5126 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5128 SmallVector<int, 64> Mask;
5129 for (unsigned i = 0; i < NumElems; ++i)
5130 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5132 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5135 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5136 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5137 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5138 /// large BUILD_VECTORS.
5139 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5140 unsigned NumElems, SelectionDAG &DAG,
5142 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5143 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5146 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5147 unsigned NumElems, SelectionDAG &DAG,
5149 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5150 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5153 /// Returns a vector of specified type with all bits set.
5154 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5155 /// Then bitcast to their original type, ensuring they get CSE'd.
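/// For example, an all-ones v4i64 is built as a v8i32 splat of -1 and then
/// bitcast to v4i64, so it can be CSE'd with other 256-bit all-ones vectors.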
5156 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5157 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5158 "Expected a 128/256/512-bit vector type");
5160 APInt Ones = APInt::getAllOnesValue(32);
5161 unsigned NumElts = VT.getSizeInBits() / 32;
5162 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5163 return DAG.getBitcast(VT, Vec);
5166 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5167 SelectionDAG &DAG) {
5168 EVT InVT = In.getValueType();
5169 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5171 if (VT.is128BitVector() && InVT.is128BitVector())
5172 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5173 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5175 // For 256-bit vectors, we only need the lower (128-bit) input half.
5176 // For 512-bit vectors, we only need the lower input half or quarter.
5177 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5178 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5179 In = extractSubVector(In, 0, DAG, DL,
5180 std::max(128, (int)VT.getSizeInBits() / Scale));
5183 return DAG.getNode(Opc, DL, VT, In);
5186 /// Returns a vector_shuffle node for an unpackl operation.
5187 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5188 SDValue V1, SDValue V2) {
5189 SmallVector<int, 8> Mask;
5190 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5191 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5194 /// Returns a vector_shuffle node for an unpackh operation.
5195 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5196 SDValue V1, SDValue V2) {
5197 SmallVector<int, 8> Mask;
5198 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5199 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5202 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5203 /// This produces a shuffle where the low element of V2 is swizzled into the
5204 /// zero/undef vector, landing at element Idx.
5205 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5206 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5208 const X86Subtarget &Subtarget,
5209 SelectionDAG &DAG) {
5210 MVT VT = V2.getSimpleValueType();
5212 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5213 int NumElems = VT.getVectorNumElements();
5214 SmallVector<int, 16> MaskVec(NumElems);
5215 for (int i = 0; i != NumElems; ++i)
5216 // If this is the insertion idx, put the low elt of V2 here.
5217 MaskVec[i] = (i == Idx) ? NumElems : i;
5218 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5221 static SDValue peekThroughBitcasts(SDValue V) {
5222 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5223 V = V.getOperand(0);
5227 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5228 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5229 V.getOperand(0).hasOneUse())
5230 V = V.getOperand(0);
5234 static const Constant *getTargetConstantFromNode(SDValue Op) {
5235 Op = peekThroughBitcasts(Op);
5237 auto *Load = dyn_cast<LoadSDNode>(Op);
5241 SDValue Ptr = Load->getBasePtr();
5242 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5243 Ptr->getOpcode() == X86ISD::WrapperRIP)
5244 Ptr = Ptr->getOperand(0);
5246 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5247 if (!CNode || CNode->isMachineConstantPoolEntry())
5250 return dyn_cast<Constant>(CNode->getConstVal());
5253 // Extract raw constant bits from constant pools.
5254 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5256 SmallVectorImpl<APInt> &EltBits,
5257 bool AllowWholeUndefs = true,
5258 bool AllowPartialUndefs = true) {
5259 assert(EltBits.empty() && "Expected an empty EltBits vector");
5261 Op = peekThroughBitcasts(Op);
5263 EVT VT = Op.getValueType();
5264 unsigned SizeInBits = VT.getSizeInBits();
5265 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5266 unsigned NumElts = SizeInBits / EltSizeInBits;
5268 // Bitcast a source array of element bits to the target size.
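  // For example, repackaging the v4i32 constant <1, 2, 3, 4> into 64-bit elements
  // yields the two values 0x0000000200000001 and 0x0000000400000003.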
5269 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5270 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5271 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5272 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5273 "Constant bit sizes don't match");
5275 // Don't split if we don't allow undef bits.
5276 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5277 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5280 // If we're already the right size, don't bother bitcasting.
5281 if (NumSrcElts == NumElts) {
5282 UndefElts = UndefSrcElts;
5283 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5287 // Extract all the undef/constant element data and pack into single bitsets.
5288 APInt UndefBits(SizeInBits, 0);
5289 APInt MaskBits(SizeInBits, 0);
5291 for (unsigned i = 0; i != NumSrcElts; ++i) {
5292 unsigned BitOffset = i * SrcEltSizeInBits;
5293 if (UndefSrcElts[i])
5294 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5295 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5298 // Split the undef/constant single bitset data into the target elements.
5299 UndefElts = APInt(NumElts, 0);
5300 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5302 for (unsigned i = 0; i != NumElts; ++i) {
5303 unsigned BitOffset = i * EltSizeInBits;
5304 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5306 // Only treat an element as UNDEF if all bits are UNDEF.
5307 if (UndefEltBits.isAllOnesValue()) {
5308 if (!AllowWholeUndefs)
5310 UndefElts.setBit(i);
5314       // If only some bits are UNDEF then treat them as zero (or bail if not allowed).
5316 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5319 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5320 EltBits[i] = Bits.getZExtValue();
5325 // Collect constant bits and insert into mask/undef bit masks.
5326 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5327 unsigned UndefBitIndex) {
5330 if (isa<UndefValue>(Cst)) {
5331 Undefs.setBit(UndefBitIndex);
5334 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5335 Mask = CInt->getValue();
5338 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5339 Mask = CFP->getValueAPF().bitcastToAPInt();
5347 APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
5348 SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
5349 return CastBitData(UndefSrcElts, SrcEltBits);
5352 // Extract scalar constant bits.
5353 if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
5354 APInt UndefSrcElts = APInt::getNullValue(1);
5355 SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
5356 return CastBitData(UndefSrcElts, SrcEltBits);
5359 // Extract constant bits from build vector.
5360 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5361 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5362 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5364 APInt UndefSrcElts(NumSrcElts, 0);
5365 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5366 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5367 const SDValue &Src = Op.getOperand(i);
5368 if (Src.isUndef()) {
5369 UndefSrcElts.setBit(i);
5372 auto *Cst = cast<ConstantSDNode>(Src);
5373 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5375 return CastBitData(UndefSrcElts, SrcEltBits);
5378 // Extract constant bits from constant pool vector.
5379 if (auto *Cst = getTargetConstantFromNode(Op)) {
5380 Type *CstTy = Cst->getType();
5381 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5384 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5385 unsigned NumSrcElts = CstTy->getVectorNumElements();
5387 APInt UndefSrcElts(NumSrcElts, 0);
5388 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5389 for (unsigned i = 0; i != NumSrcElts; ++i)
5390 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5394 return CastBitData(UndefSrcElts, SrcEltBits);
5397 // Extract constant bits from a broadcasted constant pool scalar.
5398 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5399 EltSizeInBits <= VT.getScalarSizeInBits()) {
5400 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5401 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5402 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5404 APInt UndefSrcElts(NumSrcElts, 0);
5405 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5406 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5407 if (UndefSrcElts[0])
5408 UndefSrcElts.setBits(0, NumSrcElts);
5409 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5410 return CastBitData(UndefSrcElts, SrcEltBits);
5415 // Extract a rematerialized scalar constant insertion.
5416 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5417 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5418 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5419 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5420 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5422 APInt UndefSrcElts(NumSrcElts, 0);
5423 SmallVector<APInt, 64> SrcEltBits;
5424 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5425 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5426 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5427 return CastBitData(UndefSrcElts, SrcEltBits);
5433 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5434 unsigned MaskEltSizeInBits,
5435 SmallVectorImpl<uint64_t> &RawMask) {
5437 SmallVector<APInt, 64> EltBits;
5439 // Extract the raw target constant bits.
5440 // FIXME: We currently don't support UNDEF bits or mask entries.
5441 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5442 EltBits, /* AllowWholeUndefs */ false,
5443 /* AllowPartialUndefs */ false))
5446 // Insert the extracted elements into the mask.
5447 for (APInt Elt : EltBits)
5448 RawMask.push_back(Elt.getZExtValue());
5453 /// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
5454 /// Note: This ignores saturation, so inputs must be checked first.
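/// For example, a binary 128-bit pack from two v8i16 inputs to v16i8 produces
/// the mask <0,2,4,...,14, 16,18,...,30>, selecting the low byte of each word.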
5455 static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
5457 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5458 unsigned NumElts = VT.getVectorNumElements();
5459 unsigned NumLanes = VT.getSizeInBits() / 128;
5460 unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
5461 unsigned Offset = Unary ? 0 : NumElts;
5463 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
5464 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5465 Mask.push_back(Elt + (Lane * NumEltsPerLane));
5466 for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
5467 Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
5471 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5472 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5473 /// operands in \p Ops, and returns true.
5474 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5475 /// IsUnary for shuffles which use a single input multiple times, and in those
5476 /// cases it will adjust the mask to only have indices within that single input.
5477 /// It is an error to call this with non-empty Mask/Ops vectors.
5478 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5479 SmallVectorImpl<SDValue> &Ops,
5480 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5481 unsigned NumElems = VT.getVectorNumElements();
5484 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5485 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5488 bool IsFakeUnary = false;
5489 switch(N->getOpcode()) {
5490 case X86ISD::BLENDI:
5491 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5492 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5493 ImmN = N->getOperand(N->getNumOperands()-1);
5494 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5495 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5498 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5499 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5500 ImmN = N->getOperand(N->getNumOperands()-1);
5501 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5502 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5504 case X86ISD::INSERTPS:
5505 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5506 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5507 ImmN = N->getOperand(N->getNumOperands()-1);
5508 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5509 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5511 case X86ISD::EXTRQI:
5512 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5513 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5514 isa<ConstantSDNode>(N->getOperand(2))) {
5515 int BitLen = N->getConstantOperandVal(1);
5516 int BitIdx = N->getConstantOperandVal(2);
5517 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5521 case X86ISD::INSERTQI:
5522 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5523 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5524 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5525 isa<ConstantSDNode>(N->getOperand(3))) {
5526 int BitLen = N->getConstantOperandVal(2);
5527 int BitIdx = N->getConstantOperandVal(3);
5528 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5529 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5532 case X86ISD::UNPCKH:
5533 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5534 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5535 DecodeUNPCKHMask(VT, Mask);
5536 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5538 case X86ISD::UNPCKL:
5539 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5540 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5541 DecodeUNPCKLMask(VT, Mask);
5542 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5544 case X86ISD::MOVHLPS:
5545 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5546 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5547 DecodeMOVHLPSMask(NumElems, Mask);
5548 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5550 case X86ISD::MOVLHPS:
5551 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5552 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5553 DecodeMOVLHPSMask(NumElems, Mask);
5554 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5556 case X86ISD::PALIGNR:
5557 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5558 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5559 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5560 ImmN = N->getOperand(N->getNumOperands()-1);
5561 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5562 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5563 Ops.push_back(N->getOperand(1));
5564 Ops.push_back(N->getOperand(0));
5566 case X86ISD::VSHLDQ:
5567 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5568 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5569 ImmN = N->getOperand(N->getNumOperands() - 1);
5570 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5573 case X86ISD::VSRLDQ:
5574 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5575 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5576 ImmN = N->getOperand(N->getNumOperands() - 1);
5577 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5580 case X86ISD::PSHUFD:
5581 case X86ISD::VPERMILPI:
5582 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5583 ImmN = N->getOperand(N->getNumOperands()-1);
5584 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5587 case X86ISD::PSHUFHW:
5588 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5589 ImmN = N->getOperand(N->getNumOperands()-1);
5590 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5593 case X86ISD::PSHUFLW:
5594 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5595 ImmN = N->getOperand(N->getNumOperands()-1);
5596 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5599 case X86ISD::VZEXT_MOVL:
5600 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5601 DecodeZeroMoveLowMask(VT, Mask);
5604 case X86ISD::VBROADCAST: {
5605 SDValue N0 = N->getOperand(0);
5606 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5607 // add the pre-extracted value to the Ops vector.
5608 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5609 N0.getOperand(0).getValueType() == VT &&
5610 N0.getConstantOperandVal(1) == 0)
5611 Ops.push_back(N0.getOperand(0));
5613     // We only decode broadcasts of same-sized vectors, unless the broadcast
5614     // came from an extract of the original vector width. If we found one, we
5615     // pushed it onto the Ops vector above.
5616 if (N0.getValueType() == VT || !Ops.empty()) {
5617 DecodeVectorBroadcast(VT, Mask);
5623 case X86ISD::VPERMILPV: {
5624 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5626 SDValue MaskNode = N->getOperand(1);
5627 unsigned MaskEltSize = VT.getScalarSizeInBits();
5628 SmallVector<uint64_t, 32> RawMask;
5629 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5630 DecodeVPERMILPMask(VT, RawMask, Mask);
5633 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5634 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5639 case X86ISD::PSHUFB: {
5640 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5641 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5642 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5644 SDValue MaskNode = N->getOperand(1);
5645 SmallVector<uint64_t, 32> RawMask;
5646 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5647 DecodePSHUFBMask(RawMask, Mask);
5650 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5651 DecodePSHUFBMask(C, Mask);
5656 case X86ISD::VPERMI:
5657 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5658 ImmN = N->getOperand(N->getNumOperands()-1);
5659 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5664 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5665 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5666 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5668 case X86ISD::VPERM2X128:
5669 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5670 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5671 ImmN = N->getOperand(N->getNumOperands()-1);
5672 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5673 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5675 case X86ISD::MOVSLDUP:
5676 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5677 DecodeMOVSLDUPMask(VT, Mask);
5680 case X86ISD::MOVSHDUP:
5681 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5682 DecodeMOVSHDUPMask(VT, Mask);
5685 case X86ISD::MOVDDUP:
5686 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5687 DecodeMOVDDUPMask(VT, Mask);
5690 case X86ISD::MOVLPD:
5691 case X86ISD::MOVLPS:
5692 // Not yet implemented
5694 case X86ISD::VPERMIL2: {
5695 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5696 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5697 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5698 unsigned MaskEltSize = VT.getScalarSizeInBits();
5699 SDValue MaskNode = N->getOperand(2);
5700 SDValue CtrlNode = N->getOperand(3);
5701 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5702 unsigned CtrlImm = CtrlOp->getZExtValue();
5703 SmallVector<uint64_t, 32> RawMask;
5704 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5705 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5708 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5709 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5715 case X86ISD::VPPERM: {
5716 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5717 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5718 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5719 SDValue MaskNode = N->getOperand(2);
5720 SmallVector<uint64_t, 32> RawMask;
5721 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5722 DecodeVPPERMMask(RawMask, Mask);
5725 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5726 DecodeVPPERMMask(C, Mask);
5731 case X86ISD::VPERMV: {
5732 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5734 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5735 Ops.push_back(N->getOperand(1));
5736 SDValue MaskNode = N->getOperand(0);
5737 SmallVector<uint64_t, 32> RawMask;
5738 unsigned MaskEltSize = VT.getScalarSizeInBits();
5739 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5740 DecodeVPERMVMask(RawMask, Mask);
5743 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5744 DecodeVPERMVMask(C, MaskEltSize, Mask);
5749 case X86ISD::VPERMV3: {
5750 assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
5751 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5752 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5753 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5754 Ops.push_back(N->getOperand(0));
5755 Ops.push_back(N->getOperand(2));
5756 SDValue MaskNode = N->getOperand(1);
5757 unsigned MaskEltSize = VT.getScalarSizeInBits();
5758 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5759 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5764 case X86ISD::VPERMIV3: {
5765 assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
5766 assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
5767 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5768 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5769 Ops.push_back(N->getOperand(1));
5770 Ops.push_back(N->getOperand(2));
5771 SDValue MaskNode = N->getOperand(0);
5772 unsigned MaskEltSize = VT.getScalarSizeInBits();
5773 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5774 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5779 default: llvm_unreachable("unknown target shuffle node");
5782 // Empty mask indicates the decode failed.
5786 // Check if we're getting a shuffle mask with zero'd elements.
5787 if (!AllowSentinelZero)
5788 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5791 // If we have a fake unary shuffle, the shuffle mask is spread across two
5792 // inputs that are actually the same node. Re-map the mask to always point
5793 // into the first input.
5796 if (M >= (int)Mask.size())
5799 // If we didn't already add operands in the opcode-specific code, default to
5800 // adding 1 or 2 operands starting at 0.
5802 Ops.push_back(N->getOperand(0));
5803 if (!IsUnary || IsFakeUnary)
5804 Ops.push_back(N->getOperand(1));
5810 /// Check a target shuffle mask's inputs to see if we can set any values to
5811 /// SM_SentinelZero - this is for elements that are known to be zero
5812 /// (not just zeroable) from their inputs.
5813 /// Returns true if the target shuffle mask was decoded.
5814 static bool setTargetShuffleZeroElements(SDValue N,
5815 SmallVectorImpl<int> &Mask,
5816 SmallVectorImpl<SDValue> &Ops) {
5818 if (!isTargetShuffle(N.getOpcode()))
5821 MVT VT = N.getSimpleValueType();
5822 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5825 SDValue V1 = Ops[0];
5826 SDValue V2 = IsUnary ? V1 : Ops[1];
5828 V1 = peekThroughBitcasts(V1);
5829 V2 = peekThroughBitcasts(V2);
5831 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5832 "Illegal split of shuffle value type");
5833 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5835 // Extract known constant input data.
5836 APInt UndefSrcElts[2];
5837 SmallVector<APInt, 32> SrcEltBits[2];
5838 bool IsSrcConstant[2] = {
5839 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5840 SrcEltBits[0], true, false),
5841 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5842 SrcEltBits[1], true, false)};
5844 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5847 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5851 // Determine shuffle input and normalize the mask.
5852 unsigned SrcIdx = M / Size;
5853 SDValue V = M < Size ? V1 : V2;
5856 // We are referencing an UNDEF input.
5858 Mask[i] = SM_SentinelUndef;
5862 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5863 // TODO: We currently only set UNDEF for integer types - floats use the same
5864 // registers as vectors and many of the scalar folded loads rely on the
5865 // SCALAR_TO_VECTOR pattern.
5866 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5867 (Size % V.getValueType().getVectorNumElements()) == 0) {
5868 int Scale = Size / V.getValueType().getVectorNumElements();
5869 int Idx = M / Scale;
5870 if (Idx != 0 && !VT.isFloatingPoint())
5871 Mask[i] = SM_SentinelUndef;
5872 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5873 Mask[i] = SM_SentinelZero;
5877 // Attempt to extract from the source's constant bits.
5878 if (IsSrcConstant[SrcIdx]) {
5879 if (UndefSrcElts[SrcIdx][M])
5880 Mask[i] = SM_SentinelUndef;
5881 else if (SrcEltBits[SrcIdx][M] == 0)
5882 Mask[i] = SM_SentinelZero;
5886 assert(VT.getVectorNumElements() == Mask.size() &&
5887 "Different mask size from vector size!");
5891 // Attempt to decode ops that could be represented as a shuffle mask.
5892 // The decoded shuffle mask may contain a different number of elements than the
5893 // destination value type.
5894 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5895 SmallVectorImpl<SDValue> &Ops,
5896 SelectionDAG &DAG) {
5900 MVT VT = N.getSimpleValueType();
5901 unsigned NumElts = VT.getVectorNumElements();
5902 unsigned NumSizeInBits = VT.getSizeInBits();
5903 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5904 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5905 "Expected byte aligned value types");
5907 unsigned Opcode = N.getOpcode();
5910 case X86ISD::ANDNP: {
5911 // Attempt to decode as a per-byte mask.
5913 SmallVector<APInt, 32> EltBits;
5914 SDValue N0 = N.getOperand(0);
5915 SDValue N1 = N.getOperand(1);
5916 bool IsAndN = (X86ISD::ANDNP == Opcode);
5917 uint64_t ZeroMask = IsAndN ? 255 : 0;
5918 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5920 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5922 Mask.push_back(SM_SentinelUndef);
5925 uint64_t ByteBits = EltBits[i].getZExtValue();
5926 if (ByteBits != 0 && ByteBits != 255)
5928 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5930 Ops.push_back(IsAndN ? N1 : N0);
5933 case ISD::SCALAR_TO_VECTOR: {
5934 // Match against a scalar_to_vector of an extract from a vector,
5935 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5936 SDValue N0 = N.getOperand(0);
5939 if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5940 N0.getOperand(0).getValueType() == VT) ||
5941 (N0.getOpcode() == X86ISD::PEXTRW &&
5942 N0.getOperand(0).getValueType() == MVT::v8i16) ||
5943 (N0.getOpcode() == X86ISD::PEXTRB &&
5944 N0.getOperand(0).getValueType() == MVT::v16i8)) {
5948 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5951 SDValue SrcVec = SrcExtract.getOperand(0);
5952 EVT SrcVT = SrcVec.getValueType();
5953 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5954 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5956 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5957 if (NumSrcElts <= SrcIdx)
5960 Ops.push_back(SrcVec);
5961 Mask.push_back(SrcIdx);
5962 Mask.append(NumZeros, SM_SentinelZero);
5963 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5966 case X86ISD::PINSRB:
5967 case X86ISD::PINSRW: {
5968 SDValue InVec = N.getOperand(0);
5969 SDValue InScl = N.getOperand(1);
5970 uint64_t InIdx = N.getConstantOperandVal(2);
5971 assert(InIdx < NumElts && "Illegal insertion index");
5973 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
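    // For example, PINSRW(V, 0, 3) on v8i16 becomes the mask
    // <0, 1, 2, zero, 4, 5, 6, 7>.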
5974 if (X86::isZeroNode(InScl)) {
5975 Ops.push_back(InVec);
5976 for (unsigned i = 0; i != NumElts; ++i)
5977 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
5981 // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
5982 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
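    // For example, PINSRW(V0, PEXTRW(V1, 5), 2) on v8i16 decodes to the
    // two-input mask <0, 1, 13, 3, 4, 5, 6, 7>, where 13 selects element 5 of
    // the second input.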
5984 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
5985 if (InScl.getOpcode() != ExOp)
5988 SDValue ExVec = InScl.getOperand(0);
5989 uint64_t ExIdx = InScl.getConstantOperandVal(1);
5990 assert(ExIdx < NumElts && "Illegal extraction index");
5991 Ops.push_back(InVec);
5992 Ops.push_back(ExVec);
5993 for (unsigned i = 0; i != NumElts; ++i)
5994 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
5997 case X86ISD::PACKSS:
5998 case X86ISD::PACKUS: {
5999 SDValue N0 = N.getOperand(0);
6000 SDValue N1 = N.getOperand(1);
6001 assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
6002 N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
6003 "Unexpected input value type");
6005 // If we know input saturation won't happen we can treat this
6006 // as a truncation shuffle.
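    // For example, a PACKSSWB whose i16 inputs each have at least 9 sign bits
    // (values in [-128, 127]) cannot saturate, so it is exactly the even-byte
    // truncation modelled by createPackShuffleMask.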
6007 if (Opcode == X86ISD::PACKSS) {
6008 if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
6009 (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
6012 APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
6013 if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
6014 (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
6018 bool IsUnary = (N0 == N1);
6024 createPackShuffleMask(VT, Mask, IsUnary);
6028 case X86ISD::VSRLI: {
6029 uint64_t ShiftVal = N.getConstantOperandVal(1);
6030 // Out of range bit shifts are guaranteed to be zero.
6031 if (NumBitsPerElt <= ShiftVal) {
6032 Mask.append(NumElts, SM_SentinelZero);
6036 // We can only decode 'whole byte' bit shifts as shuffles.
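    // For example, a v2i64 VSRLI by 16 is a byte shift of 2, giving the byte mask
    // <2,3,4,5,6,7,zero,zero, 10,11,12,13,14,15,zero,zero>.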
6037 if ((ShiftVal % 8) != 0)
6040 uint64_t ByteShift = ShiftVal / 8;
6041 unsigned NumBytes = NumSizeInBits / 8;
6042 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6043 Ops.push_back(N.getOperand(0));
6045 // Clear mask to all zeros and insert the shifted byte indices.
6046 Mask.append(NumBytes, SM_SentinelZero);
6048 if (X86ISD::VSHLI == Opcode) {
6049 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6050 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6051 Mask[i + j] = i + j - ByteShift;
6053 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6054 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6055 Mask[i + j - ByteShift] = i + j;
6059 case ISD::ZERO_EXTEND_VECTOR_INREG:
6060 case X86ISD::VZEXT: {
6061 // TODO - add support for VPMOVZX with smaller input vector types.
6062 SDValue Src = N.getOperand(0);
6063 MVT SrcVT = Src.getSimpleValueType();
6064 if (NumSizeInBits != SrcVT.getSizeInBits())
6066 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6075 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6076 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6077 SmallVectorImpl<int> &Mask) {
6078 int MaskWidth = Mask.size();
6079 SmallVector<SDValue, 16> UsedInputs;
6080 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6081 int lo = UsedInputs.size() * MaskWidth;
6082 int hi = lo + MaskWidth;
6084 // Strip UNDEF input usage.
6085 if (Inputs[i].isUndef())
6087 if ((lo <= M) && (M < hi))
6088 M = SM_SentinelUndef;
6090 // Check for unused inputs.
6091 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6092 UsedInputs.push_back(Inputs[i]);
6099 Inputs = UsedInputs;
6102 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6103 /// and sets the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6104 /// remaining input indices in case we now have a unary shuffle and adjust the
6105 /// inputs accordingly.
6106 /// Returns true if the target shuffle mask was decoded.
6107 static bool resolveTargetShuffleInputs(SDValue Op,
6108 SmallVectorImpl<SDValue> &Inputs,
6109 SmallVectorImpl<int> &Mask,
6110 SelectionDAG &DAG) {
6111 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6112 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6115 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6119 /// Returns the scalar element that will make up the ith
6120 /// element of the result of the vector shuffle.
6121 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6124 return SDValue(); // Limit search depth.
6126 SDValue V = SDValue(N, 0);
6127 EVT VT = V.getValueType();
6128 unsigned Opcode = V.getOpcode();
6130 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6131 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6132 int Elt = SV->getMaskElt(Index);
6135 return DAG.getUNDEF(VT.getVectorElementType());
6137 unsigned NumElems = VT.getVectorNumElements();
6138 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6139 : SV->getOperand(1);
6140 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6143 // Recurse into target specific vector shuffles to find scalars.
6144 if (isTargetShuffle(Opcode)) {
6145 MVT ShufVT = V.getSimpleValueType();
6146 MVT ShufSVT = ShufVT.getVectorElementType();
6147 int NumElems = (int)ShufVT.getVectorNumElements();
6148 SmallVector<int, 16> ShuffleMask;
6149 SmallVector<SDValue, 16> ShuffleOps;
6152 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6155 int Elt = ShuffleMask[Index];
6156 if (Elt == SM_SentinelZero)
6157 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6158 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6159 if (Elt == SM_SentinelUndef)
6160 return DAG.getUNDEF(ShufSVT);
6162 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6163 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6164 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6168 // Actual nodes that may contain scalar elements
6169 if (Opcode == ISD::BITCAST) {
6170 V = V.getOperand(0);
6171 EVT SrcVT = V.getValueType();
6172 unsigned NumElems = VT.getVectorNumElements();
6174 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6178 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6179 return (Index == 0) ? V.getOperand(0)
6180 : DAG.getUNDEF(VT.getVectorElementType());
6182 if (V.getOpcode() == ISD::BUILD_VECTOR)
6183 return V.getOperand(Index);
6188 // Use PINSRB/PINSRW/PINSRD to create a build vector.
6189 static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
6190 unsigned NumNonZero, unsigned NumZero,
6192 const X86Subtarget &Subtarget) {
6193 MVT VT = Op.getSimpleValueType();
6194 unsigned NumElts = VT.getVectorNumElements();
6195 assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
6196 ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
6197 "Illegal vector insertion");
6203 for (unsigned i = 0; i < NumElts; ++i) {
6204 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6208 // If the build vector contains zeros or our first insertion is not the
6209 // first index, then insert into a zero vector to break any register
6210 // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
6213 if (NumZero || 0 != i)
6214 V = getZeroVector(VT, Subtarget, DAG, dl);
6216 assert(0 == i && "Expected insertion into zero-index");
6217 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6218 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6219 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6220 V = DAG.getBitcast(VT, V);
6224 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
6225 DAG.getIntPtrConstant(i, dl));
6231 /// Custom lower build_vector of v16i8.
6232 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6233 unsigned NumNonZero, unsigned NumZero,
6235 const X86Subtarget &Subtarget) {
6236 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6239 // SSE4.1 - use PINSRB to insert each byte directly.
6240 if (Subtarget.hasSSE41())
6241 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6248 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
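// For each odd index i, byte i-1 forms the low half and byte i the high half
// of a 16-bit value, which is then inserted at word position i/2.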
6249 for (unsigned i = 0; i < 16; ++i) {
6250 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6251 if (ThisIsNonZero && First) {
6253 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6255 V = DAG.getUNDEF(MVT::v8i16);
6260 // FIXME: Investigate extending to i32 instead of just i16.
6261 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6262 SDValue ThisElt, LastElt;
6263 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6264 if (LastIsNonZero) {
6266 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6268 if (ThisIsNonZero) {
6269 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6270 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6271 DAG.getConstant(8, dl, MVT::i8));
6273 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6279 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6280 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6281 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6282 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6283 V = DAG.getBitcast(MVT::v8i16, V);
6285 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6286 DAG.getIntPtrConstant(i / 2, dl));
6292 return DAG.getBitcast(MVT::v16i8, V);
6295 /// Custom lower build_vector of v8i16.
6296 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6297 unsigned NumNonZero, unsigned NumZero,
6299 const X86Subtarget &Subtarget) {
6300 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6303 // Use PINSRW to insert each element directly.
6304 return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
6308 /// Custom lower build_vector of v4i32 or v4f32.
6309 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6310 const X86Subtarget &Subtarget) {
6311 // Find all zeroable elements.
6312 std::bitset<4> Zeroable;
6313 for (int i=0; i < 4; ++i) {
6314 SDValue Elt = Op->getOperand(i);
6315 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6317 assert(Zeroable.size() - Zeroable.count() > 1 &&
6318 "We expect at least two non-zero elements!");
6320 // We only know how to deal with build_vector nodes where elements are either
6321 // zeroable or extract_vector_elt with constant index.
6322 SDValue FirstNonZero;
6323 unsigned FirstNonZeroIdx;
6324 for (unsigned i=0; i < 4; ++i) {
6327 SDValue Elt = Op->getOperand(i);
6328 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6329 !isa<ConstantSDNode>(Elt.getOperand(1)))
6331 // Make sure that this node is extracting from a 128-bit vector.
6332 MVT VT = Elt.getOperand(0).getSimpleValueType();
6333 if (!VT.is128BitVector())
6335 if (!FirstNonZero.getNode()) {
6337 FirstNonZeroIdx = i;
6341 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6342 SDValue V1 = FirstNonZero.getOperand(0);
6343 MVT VT = V1.getSimpleValueType();
6345 // See if this build_vector can be lowered as a blend with zero.
6347 unsigned EltMaskIdx, EltIdx;
6349 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6350 if (Zeroable[EltIdx]) {
6351 // The zero vector will be on the right hand side.
6352 Mask[EltIdx] = EltIdx+4;
6356 Elt = Op->getOperand(EltIdx);
6357 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6358 EltMaskIdx = Elt.getConstantOperandVal(1);
6359 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6361 Mask[EltIdx] = EltIdx;
6365 // Let the shuffle legalizer deal with blend operations.
6366 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6367 if (V1.getSimpleValueType() != VT)
6368 V1 = DAG.getBitcast(VT, V1);
6369 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6372 // See if we can lower this build_vector to an INSERTPS.
6373 if (!Subtarget.hasSSE41())
6376 SDValue V2 = Elt.getOperand(0);
6377 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6380 bool CanFold = true;
6381 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6385 SDValue Current = Op->getOperand(i);
6386 SDValue SrcVector = Current->getOperand(0);
6389 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6395 assert(V1.getNode() && "Expected at least two non-zero elements!");
6396 if (V1.getSimpleValueType() != MVT::v4f32)
6397 V1 = DAG.getBitcast(MVT::v4f32, V1);
6398 if (V2.getSimpleValueType() != MVT::v4f32)
6399 V2 = DAG.getBitcast(MVT::v4f32, V2);
6401 // Ok, we can emit an INSERTPS instruction.
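// INSERTPS imm8 layout: bits [7:6] select the source element of V2,
// bits [5:4] select the destination lane in V1, and bits [3:0] form the
// zero mask applied to the result lanes.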
6402 unsigned ZMask = Zeroable.to_ulong();
6404 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6405 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6407 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6408 DAG.getIntPtrConstant(InsertPSMask, DL));
6409 return DAG.getBitcast(VT, Result);
6412 /// Return a vector logical shift node.
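/// The shift amount is given in bits but must be byte-sized; it is lowered to
/// a whole-vector byte shift (VSHLDQ/VSRLDQ) on a v16i8 bitcast of the source,
/// e.g. a 16-bit left shift becomes a 2-byte VSHLDQ.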
6413 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6414 SelectionDAG &DAG, const TargetLowering &TLI,
6416 assert(VT.is128BitVector() && "Unknown type for VShift");
6417 MVT ShVT = MVT::v16i8;
6418 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6419 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6420 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6421 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6422 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6423 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6426 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6427 SelectionDAG &DAG) {
6429 // Check if the scalar load can be widened into a vector load. And if
6430 // the address is "base + cst" see if the cst can be "absorbed" into
6431 // the shuffle mask.
6432 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6433 SDValue Ptr = LD->getBasePtr();
6434 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6436 EVT PVT = LD->getValueType(0);
6437 if (PVT != MVT::i32 && PVT != MVT::f32)
6442 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6443 FI = FINode->getIndex();
6445 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6446 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6447 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6448 Offset = Ptr.getConstantOperandVal(1);
6449 Ptr = Ptr.getOperand(0);
6454 // FIXME: 256-bit vector instructions don't require a strict alignment,
6455 // improve this code to support it better.
6456 unsigned RequiredAlign = VT.getSizeInBits()/8;
6457 SDValue Chain = LD->getChain();
6458 // Make sure the stack object alignment is at least 16 or 32.
6459 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6460 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6461 if (MFI.isFixedObjectIndex(FI)) {
6462 // Can't change the alignment. FIXME: It's possible to compute
6463 // the exact stack offset and reference FI + adjust offset instead.
6464 // If someone *really* cares about this, that's the way to implement it.
6467 MFI.setObjectAlignment(FI, RequiredAlign);
6471 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6472 // Ptr + (Offset & ~15).
6475 if ((Offset % RequiredAlign) & 3)
6477 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6480 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6481 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6484 int EltNo = (Offset - StartOffset) >> 2;
6485 unsigned NumElems = VT.getVectorNumElements();
6487 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6488 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6489 LD->getPointerInfo().getWithOffset(StartOffset));
6491 SmallVector<int, 8> Mask(NumElems, EltNo);
6493 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6499 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6500 /// elements can be replaced by a single large load which has the same value as
6501 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6503 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6504 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6505 const SDLoc &DL, SelectionDAG &DAG,
6506 const X86Subtarget &Subtarget,
6507 bool isAfterLegalize) {
6508 unsigned NumElems = Elts.size();
6510 int LastLoadedElt = -1;
6511 SmallBitVector LoadMask(NumElems, false);
6512 SmallBitVector ZeroMask(NumElems, false);
6513 SmallBitVector UndefMask(NumElems, false);
6515 // For each element in the initializer, see if we've found a load, zero or an
6517 for (unsigned i = 0; i < NumElems; ++i) {
6518 SDValue Elt = peekThroughBitcasts(Elts[i]);
6523 UndefMask[i] = true;
6524 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6526 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6529 // Each loaded element must be the correct fractional portion of the
6530 // requested vector load.
6531 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6536 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6537 "Incomplete element masks");
6539 // Handle Special Cases - all undef or undef/zero.
6540 if (UndefMask.count() == NumElems)
6541 return DAG.getUNDEF(VT);
6543 // FIXME: Should we return this as a BUILD_VECTOR instead?
6544 if ((ZeroMask | UndefMask).count() == NumElems)
6545 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6546 : DAG.getConstantFP(0.0, DL, VT);
6548 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6549 int FirstLoadedElt = LoadMask.find_first();
6550 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6551 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6552 EVT LDBaseVT = EltBase.getValueType();
6554 // Consecutive loads can contain UNDEFS but not ZERO elements.
6555 // Consecutive loads with UNDEF and ZERO elements require an
6556 // additional shuffle stage to clear the ZERO elements.
6557 bool IsConsecutiveLoad = true;
6558 bool IsConsecutiveLoadWithZeros = true;
6559 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6561 SDValue Elt = peekThroughBitcasts(Elts[i]);
6562 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6563 if (!DAG.areNonVolatileConsecutiveLoads(
6564 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6565 i - FirstLoadedElt)) {
6566 IsConsecutiveLoad = false;
6567 IsConsecutiveLoadWithZeros = false;
6570 } else if (ZeroMask[i]) {
6571 IsConsecutiveLoad = false;
6575 SmallVector<LoadSDNode *, 8> Loads;
6576 for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
6578 Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
6580 auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
6581 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6582 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6583 "Cannot merge volatile loads.");
6585 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6586 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6587 for (auto *LD : Loads)
6588 DAG.makeEquivalentMemoryOrdering(LD, NewLd);
6592 // LOAD - all consecutive load/undefs (must start/end with a load).
6593 // If we have found an entire vector of loads and undefs, then return a large
6594 // load of the entire vector width starting at the base pointer.
6595 // If the vector contains zeros, then attempt to shuffle those elements.
6596 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6597 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6598 assert(LDBase && "Did not find base load for merging consecutive loads");
6599 EVT EltVT = LDBase->getValueType(0);
6600 // Ensure that the input vector size for the merged loads matches the
6601 // cumulative size of the input elements.
6602 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6605 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6608 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6609 // will lower to regular temporal loads and use the cache.
6610 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6611 VT.is256BitVector() && !Subtarget.hasInt256())
6614 if (IsConsecutiveLoad)
6615 return CreateLoad(VT, LDBase);
6617 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6618 // vector and a zero vector to clear out the zero elements.
6619 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6620 SmallVector<int, 4> ClearMask(NumElems, -1);
6621 for (unsigned i = 0; i < NumElems; ++i) {
6623 ClearMask[i] = i + NumElems;
6624 else if (LoadMask[i])
6627 SDValue V = CreateLoad(VT, LDBase);
6628 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6629 : DAG.getConstantFP(0.0, DL, VT);
6630 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6635 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6637 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
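// e.g. a v4i32 build_vector <load a, load a+4, zero, zero> can be lowered as a
// single 64-bit zero-extending vector load of 'a'.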
6638 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6639 (LoadSize == 32 || LoadSize == 64) &&
6640 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6641 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6642 : MVT::getIntegerVT(LoadSize);
6643 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6644 if (TLI.isTypeLegal(VecVT)) {
6645 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6646 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6648 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6649 LDBase->getPointerInfo(),
6650 LDBase->getAlignment(),
6651 false/*isVolatile*/, true/*ReadMem*/,
6653 for (auto *LD : Loads)
6654 DAG.makeEquivalentMemoryOrdering(LD, ResNode);
6655 return DAG.getBitcast(VT, ResNode);
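/// Split a repeating splat constant of SplatBitSize bits into VT's scalar type
/// and return it as an IR constant vector. Illustrative example: a 128-bit
/// repeating pattern for a v8i32 build_vector yields a <4 x i32> constant,
/// which can then be broadcast as a subvector.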
6662 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6663 unsigned SplatBitSize, LLVMContext &C) {
6664 unsigned ScalarSize = VT.getScalarSizeInBits();
6665 unsigned NumElm = SplatBitSize / ScalarSize;
6667 SmallVector<Constant *, 32> ConstantVec;
6668 for (unsigned i = 0; i < NumElm; i++) {
6669 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6671 if (VT.isFloatingPoint()) {
6672 if (ScalarSize == 32) {
6673 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6675 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6676 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6679 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6680 ConstantVec.push_back(Const);
6682 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6685 static bool isUseOfShuffle(SDNode *N) {
6686 for (auto *U : N->uses()) {
6687 if (isTargetShuffle(U->getOpcode()))
6689 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6690 return isUseOfShuffle(U);
6695 // Check if the current node of the build vector is a zero extended vector.
6696 // If so, return the value extended.
6697 // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
6698 // NumElt - return the number of zero extended identical values.
6699 // EltType - return the type of the value including the zero extend.
6700 static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
6701 unsigned &NumElt, MVT &EltType) {
6702 SDValue ExtValue = Op->getOperand(0);
6703 unsigned NumElts = Op->getNumOperands();
6704 unsigned Delta = NumElts;
6706 for (unsigned i = 1; i < NumElts; i++) {
6707 if (Op->getOperand(i) == ExtValue) {
6711 if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
6714 if (!isPowerOf2_32(Delta) || Delta == 1)
6717 for (unsigned i = Delta; i < NumElts; i++) {
6718 if (i % Delta == 0) {
6719 if (Op->getOperand(i) != ExtValue)
6721 } else if (!(isNullConstant(Op->getOperand(i)) ||
6722 Op->getOperand(i).isUndef()))
6725 unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
6726 unsigned ExtVTSize = EltSize * Delta;
6727 EltType = MVT::getIntegerVT(ExtVTSize);
6728 NumElt = NumElts / Delta;
6732 /// Attempt to use the vbroadcast instruction to generate a splat value
6733 /// from a splat BUILD_VECTOR which uses:
6734 /// a. A single scalar load, or a constant.
6735 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6737 /// The VBROADCAST node is returned when a pattern is found,
6738 /// or SDValue() otherwise.
6739 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6740 const X86Subtarget &Subtarget,
6741 SelectionDAG &DAG) {
6742 // VBROADCAST requires AVX.
6743 // TODO: Splats could be generated for non-AVX CPUs using SSE
6744 // instructions, but there's less potential gain for only 128-bit vectors.
6745 if (!Subtarget.hasAVX())
6748 MVT VT = BVOp->getSimpleValueType(0);
6751 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6752 "Unsupported vector type for broadcast.");
6754 BitVector UndefElements;
6755 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6757 // Attempt to use VBROADCASTM
6758 // From this pattern:
6759 // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
6760 // b. t1 = (build_vector t0 t0)
6762 // Create (VBROADCASTM v2i1 X)
6763 if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
6764 MVT EltType = VT.getScalarType();
6765 unsigned NumElts = VT.getVectorNumElements();
6767 SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
6768 if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
6769 (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
6770 Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
6772 BOperand = ZeroExtended.getOperand(0);
6774 BOperand = Ld.getOperand(0).getOperand(0);
6775 if (BOperand.getValueType().isVector() &&
6776 BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
6777 if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
6778 NumElts == 8)) || // for broadcastmb2q
6779 (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
6780 NumElts == 16))) { // for broadcastmw2d
6782 DAG.getNode(X86ISD::VBROADCASTM, dl,
6783 MVT::getVectorVT(EltType, NumElts), BOperand);
6784 return DAG.getBitcast(VT, Brdcst);
6790 // We need a splat of a single value to use broadcast, and it doesn't
6791 // make any sense if the value is only in one element of the vector.
6792 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6793 APInt SplatValue, Undef;
6794 unsigned SplatBitSize;
6796 // Check if this is a repeated constant pattern suitable for broadcasting.
6797 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6798 SplatBitSize > VT.getScalarSizeInBits() &&
6799 SplatBitSize < VT.getSizeInBits()) {
6800 // Avoid replacing with broadcast when it's a use of a shuffle
6801 // instruction to preserve the present custom lowering of shuffles.
6802 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6804 // replace BUILD_VECTOR with broadcast of the repeated constants.
6805 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6806 LLVMContext *Ctx = DAG.getContext();
6807 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6808 if (Subtarget.hasAVX()) {
6809 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6810 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6811 // Splatted value can fit in one INTEGER constant in constant pool.
6812 // Load the constant and broadcast it.
6813 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6814 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6815 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6816 SDValue CP = DAG.getConstantPool(C, PVT);
6817 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6819 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6821 CVT, dl, DAG.getEntryNode(), CP,
6822 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6824 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6825 MVT::getVectorVT(CVT, Repeat), Ld);
6826 return DAG.getBitcast(VT, Brdcst);
6827 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6828 // Splatted value can fit in one FLOAT constant in constant pool.
6829 // Load the constant and broadcast it.
6830 // AVX has support for 32 and 64 bit broadcasts for floats only.
6831 // There is no 64-bit integer broadcast on a 32-bit subtarget.
6832 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6833 // Lower the splat via APFloat directly, to avoid any conversion.
6836 ? ConstantFP::get(*Ctx,
6837 APFloat(APFloat::IEEEsingle(), SplatValue))
6838 : ConstantFP::get(*Ctx,
6839 APFloat(APFloat::IEEEdouble(), SplatValue));
6840 SDValue CP = DAG.getConstantPool(C, PVT);
6841 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6843 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6845 CVT, dl, DAG.getEntryNode(), CP,
6846 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6848 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6849 MVT::getVectorVT(CVT, Repeat), Ld);
6850 return DAG.getBitcast(VT, Brdcst);
6851 } else if (SplatBitSize > 64) {
6852 // Load the vector of constants and broadcast it.
6853 MVT CVT = VT.getScalarType();
6854 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6856 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6857 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6858 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6860 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6861 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6863 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6864 return DAG.getBitcast(VT, Brdcst);
6871 bool ConstSplatVal =
6872 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6874 // Make sure that all of the users of a non-constant load are from the
6875 // BUILD_VECTOR node.
6876 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6879 unsigned ScalarSize = Ld.getValueSizeInBits();
6880 bool IsGE256 = (VT.getSizeInBits() >= 256);
6882 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6883 // instruction to save 8 or more bytes of constant pool data.
6884 // TODO: If multiple splats are generated to load the same constant,
6885 // it may be detrimental to overall size. There needs to be a way to detect
6886 // that condition to know if this is truly a size win.
6887 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6889 // Handle broadcasting a single constant scalar from the constant pool
6891 // On Sandybridge (no AVX2), it is still better to load a constant vector
6892 // from the constant pool and not to broadcast it from a scalar.
6893 // But override that restriction when optimizing for size.
6894 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6895 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6896 EVT CVT = Ld.getValueType();
6897 assert(!CVT.isVector() && "Must not broadcast a vector type");
6899 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6900 // For size optimization, also splat v2f64 and v2i64, and for size opt
6901 // with AVX2, also splat i8 and i16.
6902 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6903 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6904 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6905 const Constant *C = nullptr;
6906 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6907 C = CI->getConstantIntValue();
6908 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6909 C = CF->getConstantFPValue();
6911 assert(C && "Invalid constant type");
6913 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6915 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6916 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6918 CVT, dl, DAG.getEntryNode(), CP,
6919 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6922 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6926 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6928 // Handle AVX2 in-register broadcasts.
6929 if (!IsLoad && Subtarget.hasInt256() &&
6930 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6931 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6933 // The scalar source must be a normal load.
6937 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6938 (Subtarget.hasVLX() && ScalarSize == 64))
6939 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6941 // The integer check is needed for the 64-bit into 128-bit case, so it doesn't
6942 // match double, since there is no vbroadcastsd xmm.
6943 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6944 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6945 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6948 // Unsupported broadcast.
6952 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6953 /// underlying vector and index.
6955 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6957 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6959 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6960 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6963 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6965 // (extract_vector_elt (v8f32 %1), Constant<6>)
6967 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6968 // (extract_subvector (v8f32 %0), Constant<4>),
6971 // In this case the vector is the extract_subvector expression and the index
6972 // is 2, as specified by the shuffle.
6973 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6974 SDValue ShuffleVec = SVOp->getOperand(0);
6975 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6976 assert(ShuffleVecVT.getVectorElementType() ==
6977 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6979 int ShuffleIdx = SVOp->getMaskElt(Idx);
6980 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6981 ExtractedFromVec = ShuffleVec;
6987 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6988 MVT VT = Op.getSimpleValueType();
6990 // Skip if insert_vec_elt is not supported.
6991 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6992 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6996 unsigned NumElems = Op.getNumOperands();
7000 SmallVector<unsigned, 4> InsertIndices;
7001 SmallVector<int, 8> Mask(NumElems, -1);
7003 for (unsigned i = 0; i != NumElems; ++i) {
7004 unsigned Opc = Op.getOperand(i).getOpcode();
7006 if (Opc == ISD::UNDEF)
7009 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
7010 // Quit if more than 1 element needs inserting.
7011 if (InsertIndices.size() > 1)
7014 InsertIndices.push_back(i);
7018 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
7019 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
7021 // Quit if non-constant index.
7022 if (!isa<ConstantSDNode>(ExtIdx))
7024 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
7026 // Quit if extracted from vector of different type.
7027 if (ExtractedFromVec.getValueType() != VT)
7030 if (!VecIn1.getNode())
7031 VecIn1 = ExtractedFromVec;
7032 else if (VecIn1 != ExtractedFromVec) {
7033 if (!VecIn2.getNode())
7034 VecIn2 = ExtractedFromVec;
7035 else if (VecIn2 != ExtractedFromVec)
7036 // Quit if more than 2 vectors to shuffle
7040 if (ExtractedFromVec == VecIn1)
7042 else if (ExtractedFromVec == VecIn2)
7043 Mask[i] = Idx + NumElems;
7046 if (!VecIn1.getNode())
7049 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
7050 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
7052 for (unsigned Idx : InsertIndices)
7053 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
7054 DAG.getIntPtrConstant(Idx, DL));
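/// Collapse a constant vXi1 build_vector into an integer bitmask constant,
/// where bit i of the result holds element i of the vector. Illustrative
/// example: v8i1 <1,0,1,1,0,0,0,1> becomes the i8 constant 0x8D.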
7059 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7060 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7061 Op.getScalarValueSizeInBits() == 1 &&
7062 "Can not convert non-constant vector");
7063 uint64_t Immediate = 0;
7064 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7065 SDValue In = Op.getOperand(idx);
7067 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7070 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7071 return DAG.getConstant(Immediate, dl, VT);
7073 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7075 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7077 MVT VT = Op.getSimpleValueType();
7078 assert((VT.getVectorElementType() == MVT::i1) &&
7079 "Unexpected type in LowerBUILD_VECTORvXi1!");
7082 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7083 return DAG.getTargetConstant(0, dl, VT);
7085 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7086 return DAG.getTargetConstant(1, dl, VT);
7088 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7089 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7090 // Split the pieces.
7092 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7094 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7095 // We have to manually lower both halves so getNode doesn't try to
7096 // reassemble the build_vector.
7097 Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
7098 Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
7099 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7101 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7102 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7103 return DAG.getBitcast(VT, Imm);
7104 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7105 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7106 DAG.getIntPtrConstant(0, dl));
7109 // Vector has one or more non-const elements
7110 uint64_t Immediate = 0;
7111 SmallVector<unsigned, 16> NonConstIdx;
7112 bool IsSplat = true;
7113 bool HasConstElts = false;
7115 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7116 SDValue In = Op.getOperand(idx);
7119 if (!isa<ConstantSDNode>(In))
7120 NonConstIdx.push_back(idx);
7122 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7123 HasConstElts = true;
7127 else if (In != Op.getOperand(SplatIdx))
7131 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7133 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7134 DAG.getConstant(1, dl, VT),
7135 DAG.getConstant(0, dl, VT));
7137 // insert elements one by one
7141 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7142 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7144 else if (HasConstElts)
7145 Imm = DAG.getConstant(0, dl, VT);
7147 Imm = DAG.getUNDEF(VT);
7148 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7149 DstVec = DAG.getBitcast(VT, Imm);
7151 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7152 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7153 DAG.getIntPtrConstant(0, dl));
7156 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7157 unsigned InsertIdx = NonConstIdx[i];
7158 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7159 Op.getOperand(InsertIdx),
7160 DAG.getIntPtrConstant(InsertIdx, dl));
7165 /// \brief Return true if \p N implements a horizontal binop and return the
7166 /// operands for the horizontal binop into V0 and V1.
7168 /// This is a helper function of LowerToHorizontalOp().
7169 /// This function checks that the build_vector \p N in input implements a
7170 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7171 /// operation to match.
7172 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7173 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7174 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7177 /// This function only analyzes elements of \p N whose indices are
7178 /// in range [BaseIdx, LastIdx).
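///
/// For example (illustrative), a v4f32 build_vector of
///   (fadd (extract A,0),(extract A,1)), (fadd (extract A,2),(extract A,3)),
///   (fadd (extract B,0),(extract B,1)), (fadd (extract B,2),(extract B,3))
/// matches with V0 = A and V1 = B, and can later be emitted as FHADD A, B.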
7179 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7181 unsigned BaseIdx, unsigned LastIdx,
7182 SDValue &V0, SDValue &V1) {
7183 EVT VT = N->getValueType(0);
7185 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7186 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7187 "Invalid Vector in input!");
7189 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7190 bool CanFold = true;
7191 unsigned ExpectedVExtractIdx = BaseIdx;
7192 unsigned NumElts = LastIdx - BaseIdx;
7193 V0 = DAG.getUNDEF(VT);
7194 V1 = DAG.getUNDEF(VT);
7196 // Check if N implements a horizontal binop.
7197 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7198 SDValue Op = N->getOperand(i + BaseIdx);
7201 if (Op->isUndef()) {
7202 // Update the expected vector extract index.
7203 if (i * 2 == NumElts)
7204 ExpectedVExtractIdx = BaseIdx;
7205 ExpectedVExtractIdx += 2;
7209 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7214 SDValue Op0 = Op.getOperand(0);
7215 SDValue Op1 = Op.getOperand(1);
7217 // Try to match the following pattern:
7218 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7219 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7220 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7221 Op0.getOperand(0) == Op1.getOperand(0) &&
7222 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7223 isa<ConstantSDNode>(Op1.getOperand(1)));
7227 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7228 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7230 if (i * 2 < NumElts) {
7232 V0 = Op0.getOperand(0);
7233 if (V0.getValueType() != VT)
7238 V1 = Op0.getOperand(0);
7239 if (V1.getValueType() != VT)
7242 if (i * 2 == NumElts)
7243 ExpectedVExtractIdx = BaseIdx;
7246 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7247 if (I0 == ExpectedVExtractIdx)
7248 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7249 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7250 // Try to match the following dag sequence:
7251 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7252 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7256 ExpectedVExtractIdx += 2;
7262 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7263 /// a concat_vector.
7265 /// This is a helper function of LowerToHorizontalOp().
7266 /// This function expects two 256-bit vectors called V0 and V1.
7267 /// At first, each vector is split into two separate 128-bit vectors.
7268 /// Then, the resulting 128-bit vectors are used to implement two
7269 /// horizontal binary operations.
7271 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7273 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7274 /// the two new horizontal binop.
7275 /// When Mode is set, the first horizontal binop dag node would take as input
7276 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7277 /// horizontal binop dag node would take as input the lower 128-bit of V1
7278 /// and the upper 128-bit of V1.
7280 /// HADD V0_LO, V0_HI
7281 /// HADD V1_LO, V1_HI
7283 /// Otherwise, the first horizontal binop dag node takes as input the lower
7284 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7285 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7287 /// HADD V0_LO, V1_LO
7288 /// HADD V0_HI, V1_HI
7290 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7291 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7292 /// the upper 128-bits of the result.
7293 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7294 const SDLoc &DL, SelectionDAG &DAG,
7295 unsigned X86Opcode, bool Mode,
7296 bool isUndefLO, bool isUndefHI) {
7297 MVT VT = V0.getSimpleValueType();
7298 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7299 "Invalid nodes in input!");
7301 unsigned NumElts = VT.getVectorNumElements();
7302 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7303 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7304 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7305 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7306 MVT NewVT = V0_LO.getSimpleValueType();
7308 SDValue LO = DAG.getUNDEF(NewVT);
7309 SDValue HI = DAG.getUNDEF(NewVT);
7312 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7313 if (!isUndefLO && !V0->isUndef())
7314 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7315 if (!isUndefHI && !V1->isUndef())
7316 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7318 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7319 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7320 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7322 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7323 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7326 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7329 /// Returns true iff \p BV builds a vector with the result equivalent to
7330 /// the result of ADDSUB operation.
7331 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7332 /// are written to the parameters \p Opnd0 and \p Opnd1.
7333 static bool isAddSub(const BuildVectorSDNode *BV,
7334 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7335 SDValue &Opnd0, SDValue &Opnd1) {
7337 MVT VT = BV->getSimpleValueType(0);
7338 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7339 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7340 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7343 unsigned NumElts = VT.getVectorNumElements();
7344 SDValue InVec0 = DAG.getUNDEF(VT);
7345 SDValue InVec1 = DAG.getUNDEF(VT);
7347 // Odd-numbered elements in the input build vector are obtained from
7348 // adding two integer/float elements.
7349 // Even-numbered elements in the input build vector are obtained from
7350 // subtracting two integer/float elements.
7351 unsigned ExpectedOpcode = ISD::FSUB;
7352 unsigned NextExpectedOpcode = ISD::FADD;
7353 bool AddFound = false;
7354 bool SubFound = false;
7356 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7357 SDValue Op = BV->getOperand(i);
7359 // Skip 'undef' values.
7360 unsigned Opcode = Op.getOpcode();
7361 if (Opcode == ISD::UNDEF) {
7362 std::swap(ExpectedOpcode, NextExpectedOpcode);
7366 // Early exit if we found an unexpected opcode.
7367 if (Opcode != ExpectedOpcode)
7370 SDValue Op0 = Op.getOperand(0);
7371 SDValue Op1 = Op.getOperand(1);
7373 // Try to match the following pattern:
7374 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7375 // Early exit if we cannot match that sequence.
7376 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7377 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7378 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7379 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7380 Op0.getOperand(1) != Op1.getOperand(1))
7383 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7387 // We found a valid add/sub node. Update the information accordingly.
7393 // Update InVec0 and InVec1.
7394 if (InVec0.isUndef()) {
7395 InVec0 = Op0.getOperand(0);
7396 if (InVec0.getSimpleValueType() != VT)
7399 if (InVec1.isUndef()) {
7400 InVec1 = Op1.getOperand(0);
7401 if (InVec1.getSimpleValueType() != VT)
7405 // Make sure that operands in input to each add/sub node always
7406 // come from the same pair of vectors.
7407 if (InVec0 != Op0.getOperand(0)) {
7408 if (ExpectedOpcode == ISD::FSUB)
7411 // FADD is commutable. Try to commute the operands
7412 // and then test again.
7413 std::swap(Op0, Op1);
7414 if (InVec0 != Op0.getOperand(0))
7418 if (InVec1 != Op1.getOperand(0))
7421 // Update the pair of expected opcodes.
7422 std::swap(ExpectedOpcode, NextExpectedOpcode);
7425 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7426 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7434 /// Returns true if is possible to fold MUL and an idiom that has already been
7435 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7436 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7437 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7439 /// Prior to calling this function it should be known that there is some
7440 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7441 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7442 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7443 /// of \p Opnd0 uses is expected to be equal to 2.
7444 /// For example, this function may be called for the following IR:
7445 /// %AB = fmul fast <2 x double> %A, %B
7446 /// %Sub = fsub fast <2 x double> %AB, %C
7447 /// %Add = fadd fast <2 x double> %AB, %C
7448 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7449 /// <2 x i32> <i32 0, i32 3>
7450 /// There is a def for %Addsub here, which potentially can be replaced by
7451 /// X86ISD::ADDSUB operation:
7452 /// %Addsub = X86ISD::ADDSUB %AB, %C
7453 /// and such ADDSUB can further be replaced with FMADDSUB:
7454 /// %Addsub = FMADDSUB %A, %B, %C.
7456 /// The main reason why this method is called before the replacement of the
7457 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7458 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
7460 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7461 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7462 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7463 !Subtarget.hasAnyFMA())
7466 // FIXME: These checks must match the similar ones in
7467 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7468 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7469 // or MUL + ADDSUB to FMADDSUB.
7470 const TargetOptions &Options = DAG.getTarget().Options;
7472 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7477 Opnd1 = Opnd0.getOperand(1);
7478 Opnd0 = Opnd0.getOperand(0);
7483 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7484 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7485 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7486 const X86Subtarget &Subtarget,
7487 SelectionDAG &DAG) {
7488 SDValue Opnd0, Opnd1;
7489 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7492 MVT VT = BV->getSimpleValueType(0);
7495 // Try to generate X86ISD::FMADDSUB node here.
7497 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7498 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7500 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7501 // the ADDSUB idiom has been successfully recognized. There are no known
7502 // X86 targets with 512-bit ADDSUB instructions!
7503 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7505 if (VT.is512BitVector())
7508 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7511 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7512 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7513 const X86Subtarget &Subtarget,
7514 SelectionDAG &DAG) {
7515 MVT VT = BV->getSimpleValueType(0);
7516 unsigned NumElts = VT.getVectorNumElements();
7517 unsigned NumUndefsLO = 0;
7518 unsigned NumUndefsHI = 0;
7519 unsigned Half = NumElts/2;
7521 // Count the number of UNDEF operands in the build_vector in input.
7522 for (unsigned i = 0, e = Half; i != e; ++i)
7523 if (BV->getOperand(i)->isUndef())
7526 for (unsigned i = Half, e = NumElts; i != e; ++i)
7527 if (BV->getOperand(i)->isUndef())
7530 // Early exit if this is either a build_vector of all UNDEFs or all the
7531 // operands but one are UNDEF.
7532 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7536 SDValue InVec0, InVec1;
7537 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7538 // Try to match an SSE3 float HADD/HSUB.
7539 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7540 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7542 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7543 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7544 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7545 // Try to match an SSSE3 integer HADD/HSUB.
7546 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7547 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7549 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7550 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7553 if (!Subtarget.hasAVX())
7556 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7557 // Try to match an AVX horizontal add/sub of packed single/double
7558 // precision floating point values from 256-bit vectors.
7559 SDValue InVec2, InVec3;
7560 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7561 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7562 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7563 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7564 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7566 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7567 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7568 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7569 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7570 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7571 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7572 // Try to match an AVX2 horizontal add/sub of signed integers.
7573 SDValue InVec2, InVec3;
7575 bool CanFold = true;
7577 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7578 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7579 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7580 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7581 X86Opcode = X86ISD::HADD;
7582 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7583 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7584 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7585 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7586 X86Opcode = X86ISD::HSUB;
7591 // Fold this build_vector into a single horizontal add/sub.
7592 // Do this only if the target has AVX2.
7593 if (Subtarget.hasAVX2())
7594 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7596 // Do not try to expand this build_vector into a pair of horizontal
7597 // add/sub if we can emit a pair of scalar add/sub.
7598 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7601 // Convert this build_vector into a pair of horizontal binops followed by a concat vector.
7603 bool isUndefLO = NumUndefsLO == Half;
7604 bool isUndefHI = NumUndefsHI == Half;
7605 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7606 isUndefLO, isUndefHI);
7610 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7611 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7613 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7614 X86Opcode = X86ISD::HADD;
7615 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7616 X86Opcode = X86ISD::HSUB;
7617 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7618 X86Opcode = X86ISD::FHADD;
7619 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7620 X86Opcode = X86ISD::FHSUB;
7624 // Don't try to expand this build_vector into a pair of horizontal add/sub
7625 // if we can simply emit a pair of scalar add/sub.
7626 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7629 // Convert this build_vector into two horizontal add/sub followed by a concat vector.
7631 bool isUndefLO = NumUndefsLO == Half;
7632 bool isUndefHI = NumUndefsHI == Half;
7633 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7634 isUndefLO, isUndefHI);
7640 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7641 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7642 /// just apply the bit to the vectors.
7643 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7644 /// from this, but enough scalar bit operations are created from the later
7645 /// legalization + scalarization stages to need basic support.
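/// For example (illustrative), (build_vector (xor a, 1), (xor b, 2)) is
/// lowered to (xor (build_vector a, b), (build_vector 1, 2)).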
7646 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7647 SelectionDAG &DAG) {
7649 MVT VT = Op->getSimpleValueType(0);
7650 unsigned NumElems = VT.getVectorNumElements();
7651 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7653 // Check that all elements have the same opcode.
7654 // TODO: Should we allow UNDEFS and if so how many?
7655 unsigned Opcode = Op->getOperand(0).getOpcode();
7656 for (unsigned i = 1; i < NumElems; ++i)
7657 if (Opcode != Op->getOperand(i).getOpcode())
7660 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7667 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7672 SmallVector<SDValue, 4> LHSElts, RHSElts;
7673 for (SDValue Elt : Op->ops()) {
7674 SDValue LHS = Elt.getOperand(0);
7675 SDValue RHS = Elt.getOperand(1);
7677 // We expect the canonicalized RHS operand to be the constant.
7678 if (!isa<ConstantSDNode>(RHS))
7680 LHSElts.push_back(LHS);
7681 RHSElts.push_back(RHS);
7684 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7685 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7686 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7689 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7690 /// functionality to do this, so it's all zeros, all ones, or some derivation
7691 /// that is cheap to calculate.
7692 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7693 const X86Subtarget &Subtarget) {
7695 MVT VT = Op.getSimpleValueType();
7697 // Vectors containing all zeros can be matched by pxor and xorps.
7698 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7699 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7700 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7701 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7704 return getZeroVector(VT, Subtarget, DAG, DL);
7707 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7708 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7709 // vpcmpeqd on 256-bit vectors.
7710 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7711 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7712 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7715 return getOnesVector(VT, DAG, DL);
7721 // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
7722 // reasoned to be a permutation of a vector by indices in a non-constant vector.
7723 // (build_vector (extract_elt V, (extract_elt I, 0)),
7724 // (extract_elt V, (extract_elt I, 1)),
7729 // TODO: Handle undefs
7730 // TODO: Utilize pshufb and zero mask blending to support more efficient
7731 // construction of vectors with constant-0 elements.
7732 // TODO: Use smaller-element vectors of the same width, and "interpolate" the
7733 // indices, when no native operation is available.
7735 LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
7736 const X86Subtarget &Subtarget) {
7737 // Look for VPERMV and PSHUFB opportunities.
7738 MVT VT = V.getSimpleValueType();
7739 switch (VT.SimpleTy) {
7743 if (!Subtarget.hasSSE3())
7748 if (!Subtarget.hasAVX2())
7753 if (!Subtarget.hasVLX())
7760 if (!Subtarget.hasAVX512())
7764 if (!Subtarget.hasBWI())
7769 if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
7773 if (!Subtarget.hasVBMI())
7777 if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
7781 SDValue SrcVec, IndicesVec;
7782 // Check for a match of the permute source vector and permute index elements.
7783 // This is done by checking that the i-th build_vector operand is of the form:
7784 // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
7785 for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
7786 SDValue Op = V.getOperand(Idx);
7787 if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7790 // If this is the first extract encountered in V, set the source vector,
7791 // otherwise verify the extract is from the previously defined source vector.
7794 SrcVec = Op.getOperand(0);
7795 else if (SrcVec != Op.getOperand(0))
7797 SDValue ExtractedIndex = Op->getOperand(1);
7798 // Peek through extends.
7799 if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
7800 ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
7801 ExtractedIndex = ExtractedIndex.getOperand(0);
7802 if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
7805 // If this is the first extract from the index vector candidate, set the
7806 // indices vector, otherwise verify the extract is from the previously
7807 // defined indices vector.
7809 IndicesVec = ExtractedIndex.getOperand(0);
7810 else if (IndicesVec != ExtractedIndex.getOperand(0))
7813 auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
7814 if (!PermIdx || PermIdx->getZExtValue() != Idx)
7818 if (VT.isFloatingPoint())
7819 IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
7820 VT.getVectorNumElements());
7821 IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
7822 return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
7823 SDLoc(V), VT, IndicesVec, SrcVec);
7827 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7830 MVT VT = Op.getSimpleValueType();
7831 MVT ExtVT = VT.getVectorElementType();
7832 unsigned NumElems = Op.getNumOperands();
7834 // Generate vectors for predicate vectors.
7835 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7836 return LowerBUILD_VECTORvXi1(Op, DAG);
7838 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7839 return VectorConstant;
7841 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7842 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7844 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7845 return HorizontalOp;
7846 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7848 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7851 unsigned EVTBits = ExtVT.getSizeInBits();
7853 unsigned NumZero = 0;
7854 unsigned NumNonZero = 0;
7855 uint64_t NonZeros = 0;
7856 bool IsAllConstants = true;
7857 SmallSet<SDValue, 8> Values;
7858 unsigned NumConstants = NumElems;
7859 for (unsigned i = 0; i < NumElems; ++i) {
7860 SDValue Elt = Op.getOperand(i);
7864 if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
7865 IsAllConstants = false;
7868 if (X86::isZeroNode(Elt))
7871 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7872 NonZeros |= ((uint64_t)1 << i);
7877 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7878 if (NumNonZero == 0)
7879 return DAG.getUNDEF(VT);
7881 // If we are inserting one variable into a vector of non-zero constants, try
7882 // to avoid loading each constant element as a scalar. Load the constants as a
7883 // vector and then insert the variable scalar element. If insertion is not
7884 // supported, we assume that we will fall back to a shuffle to get the scalar
7885 // blended with the constants. Insertion into a zero vector is handled as a
7886 // special-case somewhere below here.
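// For example, (build_vector x, 2.0, 3.0, 4.0) becomes a constant-pool load of
// <undef, 2.0, 3.0, 4.0> followed by an insert of the scalar x at index 0.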
7887 LLVMContext &Context = *DAG.getContext();
7888 if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
7889 (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
7890 isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
7891 // Create an all-constant vector. The variable element in the old
7892 // build vector is replaced by undef in the constant vector. Save the
7893 // variable scalar element and its index for use in the insertelement.
7894 Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
7895 SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
7898 for (unsigned i = 0; i != NumElems; ++i) {
7899 SDValue Elt = Op.getOperand(i);
7900 if (auto *C = dyn_cast<ConstantSDNode>(Elt))
7901 ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
7902 else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
7903 ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
7904 else if (!Elt.isUndef()) {
7905 assert(!VarElt.getNode() && !InsIndex.getNode() &&
7906 "Expected one variable element in this vector");
7908 InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
7911 Constant *CV = ConstantVector::get(ConstVecOps);
7912 SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
7914 // The constants we just created may not be legal (e.g., floating point). We
7915 // must lower the vector right here because we cannot guarantee that we'll
7916 // legalize it before loading it. This is also why we could not just create
7917 // a new build vector here. If the build vector contains illegal constants,
7918 // it could get split back up into a series of insert elements.
7919 // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
7920 SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
7921 MachineFunction &MF = DAG.getMachineFunction();
7922 MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
7923 SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
7924 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
7927 // Special case for single non-zero, non-undef, element.
7928 if (NumNonZero == 1) {
7929 unsigned Idx = countTrailingZeros(NonZeros);
7930 SDValue Item = Op.getOperand(Idx);
7932 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7933 // the value are obviously zero, truncate the value to i32 and do the
7934 // insertion that way. Only do this if the value is non-constant or if the
7935 // value is a constant being inserted into element 0. It is cheaper to do
7936 // a constant pool load than it is to do a movd + shuffle.
7937 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7938 (!IsAllConstants || Idx == 0)) {
7939 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7941 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7942 MVT VecVT = MVT::v4i32;
7944 // Truncate the value (which may itself be a constant) to i32, and
7945 // convert it to a vector with movd (S2V+shuffle to zero extend).
7946 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7947 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7948 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7949 Item, Idx * 2, true, Subtarget, DAG));
7953 // If we have a constant or non-constant insertion into the low element of
7954 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7955 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7956 // depending on what the source datatype is.
7959 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7961 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7962 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7963 assert((VT.is128BitVector() || VT.is256BitVector() ||
7964 VT.is512BitVector()) &&
7965 "Expected an SSE value type!");
7966 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7967 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7968 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7971 // We can't directly insert an i8 or i16 into a vector, so zero extend it to i32 first.
7973 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7974 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7975 if (VT.getSizeInBits() >= 256) {
7976 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7977 if (Subtarget.hasAVX()) {
7978 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7979 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7981 // Without AVX, we need to extend to a 128-bit vector and then
7982 // insert into the 256-bit vector.
7983 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7984 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7985 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7988 assert(VT.is128BitVector() && "Expected an SSE value type!");
7989 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7990 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7992 return DAG.getBitcast(VT, Item);
7996 // Is it a vector logical left shift?
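// e.g. a v2i64 (build_vector 0, x) can be built as (scalar_to_vector x) shifted
// left by 8 bytes with a whole-register byte shift (VSHLDQ).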
7997 if (NumElems == 2 && Idx == 1 &&
7998 X86::isZeroNode(Op.getOperand(0)) &&
7999 !X86::isZeroNode(Op.getOperand(1))) {
8000 unsigned NumBits = VT.getSizeInBits();
8001 return getVShift(true, VT,
8002 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
8003 VT, Op.getOperand(1)),
8004 NumBits/2, DAG, *this, dl);
8007 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
8010 // Otherwise, if this is a vector with i32 or f32 elements, and the element
8011 // is a non-constant being inserted into an element other than the low one,
8012 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
8013 // movd/movss) to move this into the low element, then shuffle it into place.
8015 if (EVTBits == 32) {
8016 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
8017 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
8021 // Splat is obviously ok. Let legalizer expand it to a shuffle.
8022 if (Values.size() == 1) {
8023 if (EVTBits == 32) {
8024 // Instead of a shuffle like this:
8025 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
8026 // Check if it's possible to issue this instead.
8027 // shuffle (vload ptr), undef, <1, 1, 1, 1>
8028 unsigned Idx = countTrailingZeros(NonZeros);
8029 SDValue Item = Op.getOperand(Idx);
8030 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
8031 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
8036 // A vector full of immediates; various special cases are already
8037 // handled, so this is best done with a single constant-pool load.
8041 if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
8044 // See if we can use a vector load to get all of the elements.
8045 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
8046 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
8048 EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
8052 // For AVX-length vectors, build the individual 128-bit pieces and use
8053 // shuffles to put them in place.
8054 if (VT.is256BitVector() || VT.is512BitVector()) {
8055 EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
8057 // Build both the lower and upper subvector.
8059 DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
8060 SDValue Upper = DAG.getBuildVector(
8061 HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2));
8063 // Recreate the wider vector with the lower and upper part.
8064 if (VT.is256BitVector())
8065 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8066 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
8069 // Let legalizer expand 2-wide build_vectors.
8070 if (EVTBits == 64) {
8071 if (NumNonZero == 1) {
8072 // One half is zero or undef.
8073 unsigned Idx = countTrailingZeros(NonZeros);
8074 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
8075 Op.getOperand(Idx));
8076 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
8081 // If element VT is < 32 bits, convert it to inserts into a zero vector.
8082 if (EVTBits == 8 && NumElems == 16)
8083 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
8087 if (EVTBits == 16 && NumElems == 8)
8088 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
8092 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
8093 if (EVTBits == 32 && NumElems == 4)
8094 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
8097 // If element VT is == 32 bits, turn it into a number of shuffles.
8098 if (NumElems == 4 && NumZero > 0) {
8099 SmallVector<SDValue, 8> Ops(NumElems);
8100 for (unsigned i = 0; i < 4; ++i) {
8101 bool isZero = !(NonZeros & (1ULL << i));
8103 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
8105 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8108 for (unsigned i = 0; i < 2; ++i) {
8109 switch ((NonZeros >> (i*2)) & 0x3) {
8110 default: llvm_unreachable("Unexpected NonZero count");
8112 Ops[i] = Ops[i*2]; // Must be a zero vector.
8115 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
8118 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8121 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
8126 bool Reverse1 = (NonZeros & 0x3) == 2;
8127 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
8131 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
8132 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
8134 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
8137 assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
8139 // Check for a build vector that is mostly a shuffle plus a few inserted elements.
8140 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
8143 // For SSE 4.1, use insertps to put the high elements into the low element.
8144 if (Subtarget.hasSSE41()) {
8146 if (!Op.getOperand(0).isUndef())
8147 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
8149 Result = DAG.getUNDEF(VT);
8151 for (unsigned i = 1; i < NumElems; ++i) {
8152 if (Op.getOperand(i).isUndef()) continue;
8153 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
8154 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
8159 // Otherwise, expand into a number of unpckl*; start by extending each of
8160 // our (non-undef) elements to the full vector width with the element in the
8161 // bottom slot of the vector (which generates no code for SSE).
8162 SmallVector<SDValue, 8> Ops(NumElems);
8163 for (unsigned i = 0; i < NumElems; ++i) {
8164 if (!Op.getOperand(i).isUndef())
8165 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
8167 Ops[i] = DAG.getUNDEF(VT);
8170 // Next, we iteratively mix elements, e.g. for v4f32:
8171 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
8172 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
8173 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
8174 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
8175 // Generate scaled UNPCKL shuffle mask.
8176 SmallVector<int, 16> Mask;
8177 for (unsigned i = 0; i != Scale; ++i)
8178 Mask.push_back(i);
8179 for (unsigned i = 0; i != Scale; ++i)
8180 Mask.push_back(NumElems+i);
8181 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
8183 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
8184 Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
8189 // 256-bit AVX can use the vinsertf128 instruction
8190 // to create 256-bit vectors from two other 128-bit ones.
8191 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
8193 MVT ResVT = Op.getSimpleValueType();
8195 assert((ResVT.is256BitVector() ||
8196 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
8198 SDValue V1 = Op.getOperand(0);
8199 SDValue V2 = Op.getOperand(1);
8200 unsigned NumElems = ResVT.getVectorNumElements();
8201 if (ResVT.is256BitVector())
8202 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8204 if (Op.getNumOperands() == 4) {
8205 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8206 ResVT.getVectorNumElements()/2);
8207 SDValue V3 = Op.getOperand(2);
8208 SDValue V4 = Op.getOperand(3);
8209 return concat256BitVectors(
8210 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
8211 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
8214 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8217 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8218 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8219 static bool isExpandWithZeros(const SDValue &Op) {
8220 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8221 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8223 for (unsigned i = 1; i < Op.getNumOperands(); i++)
8224 if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
8230 // Returns true if the given node is a type promotion (by concatenating i1
8231 // zeros) of the result of a node that already zeros all upper bits of the output register.
8233 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8234 unsigned Opc = Op.getOpcode();
8236 assert(Opc == ISD::CONCAT_VECTORS &&
8237 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8238 "Unexpected node to check for type promotion!");
8240 // As long as we are concatenating zeros to the upper part of a previous node
8241 // result, climb up the tree until a node with a different opcode is encountered.
8243 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8244 if (Opc == ISD::INSERT_SUBVECTOR) {
8245 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8246 Op.getConstantOperandVal(2) == 0)
8247 Op = Op.getOperand(1);
8250 } else { // Opc == ISD::CONCAT_VECTORS
8251 if (isExpandWithZeros(Op))
8252 Op = Op.getOperand(0);
8256 Opc = Op.getOpcode();
8259 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8260 // of a node that zeros the upper bits (its masked version).
8261 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8262 (Op.getOpcode() == ISD::AND &&
8263 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
8264 isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
8271 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8272 const X86Subtarget &Subtarget,
8273 SelectionDAG & DAG) {
8275 MVT ResVT = Op.getSimpleValueType();
8276 unsigned NumOfOperands = Op.getNumOperands();
8278 assert(isPowerOf2_32(NumOfOperands) &&
8279 "Unexpected number of operands in CONCAT_VECTORS");
8281 // If this node promotes - by concatenating zeroes - the type of the result
8282 // of a node with an instruction that zeroes all upper (irrelevant) bits of the
8283 // output register, mark it as legal and catch the pattern in instruction
8284 // selection to avoid emitting extra instructions (for zeroing upper bits).
8285 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8286 SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
8287 SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
8288 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
8292 SDValue Undef = DAG.getUNDEF(ResVT);
8293 if (NumOfOperands > 2) {
8294 // Specialize the cases when all, or all but one, of the operands are undef.
8295 unsigned NumOfDefinedOps = 0;
8297 for (unsigned i = 0; i < NumOfOperands; i++)
8298 if (!Op.getOperand(i).isUndef()) {
8302 if (NumOfDefinedOps == 0)
8304 if (NumOfDefinedOps == 1) {
8305 unsigned SubVecNumElts =
8306 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
8307 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
8308 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
8309 Op.getOperand(OpIdx), IdxVal);
8312 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8313 ResVT.getVectorNumElements()/2);
8314 SmallVector<SDValue, 2> Ops;
8315 for (unsigned i = 0; i < NumOfOperands/2; i++)
8316 Ops.push_back(Op.getOperand(i));
8317 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8319 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8320 Ops.push_back(Op.getOperand(i));
8321 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8322 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8326 SDValue V1 = Op.getOperand(0);
8327 SDValue V2 = Op.getOperand(1);
8328 unsigned NumElems = ResVT.getVectorNumElements();
8329 assert(V1.getValueType() == V2.getValueType() &&
8330 V1.getValueType().getVectorNumElements() == NumElems/2 &&
8331 "Unexpected operands in CONCAT_VECTORS");
8333 if (ResVT.getSizeInBits() >= 16)
8334 return Op; // The operation is legal with KUNPCK
8336 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8337 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8338 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
8339 if (IsZeroV1 && IsZeroV2)
8342 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
8344 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8346 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
8348 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
8350 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
8353 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8355 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8356 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8359 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8360 const X86Subtarget &Subtarget,
8361 SelectionDAG &DAG) {
8362 MVT VT = Op.getSimpleValueType();
8363 if (VT.getVectorElementType() == MVT::i1)
8364 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8366 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8367 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8368 Op.getNumOperands() == 4)));
8370 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8371 // from two other 128-bit ones.
8373 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
8374 return LowerAVXCONCAT_VECTORS(Op, DAG);
8377 //===----------------------------------------------------------------------===//
8378 // Vector shuffle lowering
8380 // This is an experimental code path for lowering vector shuffles on x86. It is
8381 // designed to handle arbitrary vector shuffles and blends, gracefully
8382 // degrading performance as necessary. It works hard to recognize idiomatic
8383 // shuffles and lower them to optimal instruction patterns without leaving
8384 // a framework that allows reasonably efficient handling of all vector shuffle
8386 //===----------------------------------------------------------------------===//
8388 /// \brief Tiny helper function to identify a no-op mask.
8390 /// This is a somewhat boring predicate function. It checks whether the mask
8391 /// array input, which is assumed to be a single-input shuffle mask of the kind
8392 /// used by the X86 shuffle instructions (not a fully general
8393 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8394 /// in-place shuffle are 'no-op's.
8395 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
8396 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8397 assert(Mask[i] >= -1 && "Out of bound mask element!");
8398 if (Mask[i] >= 0 && Mask[i] != i)
8404 /// \brief Test whether there are elements crossing 128-bit lanes in this
8407 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8408 /// and we routinely test for these.
8409 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8410 int LaneSize = 128 / VT.getScalarSizeInBits();
8411 int Size = Mask.size();
8412 for (int i = 0; i < Size; ++i)
8413 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
8418 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8420 /// This checks a shuffle mask to see if it is performing the same
8421 /// lane-relative shuffle in each sub-lane. This trivially implies
8422 /// that it is also not lane-crossing. It may however involve a blend from the
8423 /// same lane of a second vector.
8425 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8426 /// non-trivial to compute in the face of undef lanes. The representation is
8427 /// suitable for use with existing 128-bit shuffles as entries from the second
8428 /// vector have been remapped to [LaneSize, 2*LaneSize).
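/// For example, with 128-bit lanes the v8f32 mask <0, 8, 2, 10, 4, 12, 6, 14>
/// repeats in both lanes and yields the RepeatedMask <0, 4, 2, 6>.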
8429 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8431 SmallVectorImpl<int> &RepeatedMask) {
8432 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8433 RepeatedMask.assign(LaneSize, -1);
8434 int Size = Mask.size();
8435 for (int i = 0; i < Size; ++i) {
8436 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
8439 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8440 // This entry crosses lanes, so there is no way to model this shuffle.
8443 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8444 // Adjust second vector indices to start at LaneSize instead of Size.
8445 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8446 : Mask[i] % LaneSize + LaneSize;
8447 if (RepeatedMask[i % LaneSize] < 0)
8448 // This is the first non-undef entry in this slot of a 128-bit lane.
8449 RepeatedMask[i % LaneSize] = LocalM;
8450 else if (RepeatedMask[i % LaneSize] != LocalM)
8451 // Found a mismatch with the repeated mask.
8457 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
8459 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8460 SmallVectorImpl<int> &RepeatedMask) {
8461 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8464 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
8466 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8467 SmallVectorImpl<int> &RepeatedMask) {
8468 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8471 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8472 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8473 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8475 SmallVectorImpl<int> &RepeatedMask) {
8476 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8477 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8478 int Size = Mask.size();
8479 for (int i = 0; i < Size; ++i) {
8480 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
8481 if (Mask[i] == SM_SentinelUndef)
8483 if (Mask[i] == SM_SentinelZero) {
8484 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
8486 RepeatedMask[i % LaneSize] = SM_SentinelZero;
8489 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
8490 // This entry crosses lanes, so there is no way to model this shuffle.
8493 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8494 // Adjust second vector indices to start at LaneSize instead of Size.
8496 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8497 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8498 // This is the first non-undef entry in this slot of a 128-bit lane.
8499 RepeatedMask[i % LaneSize] = LocalM;
8500 else if (RepeatedMask[i % LaneSize] != LocalM)
8501 // Found a mismatch with the repeated mask.
8507 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of elements.
8510 /// This is a fast way to test a shuffle mask against a fixed pattern:
8512 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8514 /// It returns true if the mask is exactly as wide as the expected mask, and
8515 /// each element of the mask is either -1 (signifying undef) or the value given
8516 /// in the expected mask.
8517 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8518 ArrayRef<int> ExpectedMask) {
8519 if (Mask.size() != ExpectedMask.size())
8522 int Size = Mask.size();
8524 // If the values are build vectors, we can look through them to find
8525 // equivalent inputs that make the shuffles equivalent.
8526 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8527 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8529 for (int i = 0; i < Size; ++i) {
8530 assert(Mask[i] >= -1 && "Out of bound mask element!");
8531 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8532 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8533 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
8534 if (!MaskBV || !ExpectedBV ||
8535 MaskBV->getOperand(Mask[i] % Size) !=
8536 ExpectedBV->getOperand(ExpectedMask[i] % Size))
8544 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8546 /// The masks must be exactly the same width.
8548 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8549 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8551 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8552 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8553 ArrayRef<int> ExpectedMask) {
8554 int Size = Mask.size();
8555 if (Size != (int)ExpectedMask.size())
8558 for (int i = 0; i < Size; ++i)
8559 if (Mask[i] == SM_SentinelUndef)
8561 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
8563 else if (Mask[i] != ExpectedMask[i])
8569 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle mask.
8571 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8572 const APInt &Zeroable) {
8573 int NumElts = Mask.size();
8574 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8576 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8577 for (int i = 0; i != NumElts; ++i) {
8579 if (M == SM_SentinelUndef)
8581 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
8582 TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
8587 // Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd instructions.
8589 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8590 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8593 SmallVector<int, 8> Unpcklwd;
8594 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8595 /* Unary = */ false);
8596 SmallVector<int, 8> Unpckhwd;
8597 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8598 /* Unary = */ false);
8599 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8600 isTargetShuffleEquivalent(Mask, Unpckhwd));
8601 return IsUnpackwdMask;
8604 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8606 /// This helper function produces an 8-bit shuffle immediate corresponding to
8607 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
8608 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
8611 /// NB: We rely heavily on "undef" masks preserving the input lane.
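/// For example, the mask <3, 1, 2, 0> encodes as 0b00100111 (0x27): bits [1:0]
/// select source lane 3 for result element 0, bits [3:2] lane 1, bits [5:4]
/// lane 2 and bits [7:6] lane 0.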
8612 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8613 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8614 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8615 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8616 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8617 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8620 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8621 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8622 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8623 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8627 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8628 SelectionDAG &DAG) {
8629 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8632 /// \brief Compute whether each element of a shuffle is zeroable.
8634 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8635 /// Either it is an undef element in the shuffle mask, the element of the input
8636 /// referenced is undef, or the element of the input referenced is known to be
8637 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8638 /// as many lanes with this technique as possible to simplify the remaining shuffle.
8640 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8641 SDValue V1, SDValue V2) {
8642 APInt Zeroable(Mask.size(), 0);
8643 V1 = peekThroughBitcasts(V1);
8644 V2 = peekThroughBitcasts(V2);
8646 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8647 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8649 int VectorSizeInBits = V1.getValueSizeInBits();
8650 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8651 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8653 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8655 // Handle the easy cases.
8656 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8661 // Determine shuffle input and normalize the mask.
8662 SDValue V = M < Size ? V1 : V2;
8665 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8666 if (V.getOpcode() != ISD::BUILD_VECTOR)
8669 // If the BUILD_VECTOR has fewer elements than the mask, then the bitcasted
8670 // portion of the (larger) source element must be UNDEF/ZERO.
8671 if ((Size % V.getNumOperands()) == 0) {
8672 int Scale = Size / V->getNumOperands();
8673 SDValue Op = V.getOperand(M / Scale);
8674 if (Op.isUndef() || X86::isZeroNode(Op))
8676 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8677 APInt Val = Cst->getAPIntValue();
8678 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8679 Val = Val.getLoBits(ScalarSizeInBits);
8682 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8683 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8684 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8685 Val = Val.getLoBits(ScalarSizeInBits);
8692 // If the BUILD_VECTOR has more elements than the mask, then all the (smaller)
8693 // source elements must be UNDEF or ZERO.
8694 if ((V.getNumOperands() % Size) == 0) {
8695 int Scale = V->getNumOperands() / Size;
8696 bool AllZeroable = true;
8697 for (int j = 0; j < Scale; ++j) {
8698 SDValue Op = V.getOperand((M * Scale) + j);
8699 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8710 // The shuffle result is of the form:
8711 //   0* a[0] 0* a[1] ... 0* a[n], n >= 0 ('0*' is zero or more zero elements;
8712 //   the a[i] appear in ascending order). Each element of Zeroable corresponds
8713 //   to a particular element of Mask, as described in computeZeroableShuffleElements.
8715 // The function looks for a sub-mask whose non-zero elements appear in
8716 // increasing order; if such a sub-mask exists, the function returns true.
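// For example, the mask <0, Z, 1, Z> (Z denoting a zeroable element) has its
// non-zero elements 0, 1 in increasing order, so the function returns true.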
8717 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8718 ArrayRef<int> Mask, const EVT &VectorType,
8719 bool &IsZeroSideLeft) {
8720 int NextElement = -1;
8721 // Check if the Mask's nonzero elements are in increasing order.
8722 for (int i = 0, e = Mask.size(); i < e; i++) {
8723 // Check that the mask's zero elements are fed only by zeroable inputs.
8724 assert(Mask[i] >= -1 && "Out of bound mask element!");
8729 // Find the lowest non-zero element.
8730 if (NextElement < 0) {
8731 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8732 IsZeroSideLeft = NextElement != 0;
8734 // Exit if the mask's non-zero elements are not in increasing order.
8735 if (NextElement != Mask[i])
8742 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8743 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8744 ArrayRef<int> Mask, SDValue V1,
8746 const APInt &Zeroable,
8747 const X86Subtarget &Subtarget,
8748 SelectionDAG &DAG) {
8749 int Size = Mask.size();
8750 int LaneSize = 128 / VT.getScalarSizeInBits();
8751 const int NumBytes = VT.getSizeInBits() / 8;
8752 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8754 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8755 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8756 (Subtarget.hasBWI() && VT.is512BitVector()));
8758 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8759 // Sign bit set in i8 mask means zero element.
8760 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8763 for (int i = 0; i < NumBytes; ++i) {
8764 int M = Mask[i / NumEltBytes];
8766 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8769 if (Zeroable[i / NumEltBytes]) {
8770 PSHUFBMask[i] = ZeroMask;
8774 // We can only use a single input of V1 or V2.
8775 SDValue SrcV = (M >= Size ? V2 : V1);
8781 // PSHUFB can't cross lanes, ensure this doesn't happen.
8782 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8786 M = M * NumEltBytes + (i % NumEltBytes);
8787 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8789 assert(V && "Failed to find a source input");
8791 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8792 return DAG.getBitcast(
8793 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8794 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8797 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8798 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8801 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
8802 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8803 const APInt &Zeroable,
8804 ArrayRef<int> Mask, SDValue &V1,
8805 SDValue &V2, SelectionDAG &DAG,
8806 const X86Subtarget &Subtarget) {
8807 bool IsLeftZeroSide = true;
8808 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8811 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
8813 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8814 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8815 unsigned NumElts = VT.getVectorNumElements();
8816 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8817 "Unexpected number of vector elements");
8818 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8819 Subtarget, DAG, DL);
8820 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8821 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8822 return DAG.getSelect(DL, VT, VMask,
8823 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8827 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8828 unsigned &UnpackOpcode, bool IsUnary,
8829 ArrayRef<int> TargetMask, SDLoc &DL,
8831 const X86Subtarget &Subtarget) {
8832 int NumElts = VT.getVectorNumElements();
8834 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8835 for (int i = 0; i != NumElts; i += 2) {
8836 int M1 = TargetMask[i + 0];
8837 int M2 = TargetMask[i + 1];
8838 Undef1 &= (SM_SentinelUndef == M1);
8839 Undef2 &= (SM_SentinelUndef == M2);
8840 Zero1 &= isUndefOrZero(M1);
8841 Zero2 &= isUndefOrZero(M2);
8843 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8844 "Zeroable shuffle detected");
8846 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8847 SmallVector<int, 64> Unpckl, Unpckh;
8848 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8849 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8850 UnpackOpcode = X86ISD::UNPCKL;
8851 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8852 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8856 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8857 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8858 UnpackOpcode = X86ISD::UNPCKH;
8859 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8860 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8864 // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
8865 if (IsUnary && (Zero1 || Zero2)) {
8866 // Don't bother if we can blend instead.
8867 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8868 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8871 bool MatchLo = true, MatchHi = true;
8872 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8873 int M = TargetMask[i];
8875 // Ignore if the input is known to be zero or the index is undef.
8876 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8877 (M == SM_SentinelUndef))
8880 MatchLo &= (M == Unpckl[i]);
8881 MatchHi &= (M == Unpckh[i]);
8884 if (MatchLo || MatchHi) {
8885 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8886 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8887 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8892 // If a binary shuffle, commute and try again.
8894 ShuffleVectorSDNode::commuteMask(Unpckl);
8895 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8896 UnpackOpcode = X86ISD::UNPCKL;
8901 ShuffleVectorSDNode::commuteMask(Unpckh);
8902 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8903 UnpackOpcode = X86ISD::UNPCKH;
8912 // X86 has dedicated unpack instructions that can handle specific blend
8913 // operations: UNPCKH and UNPCKL.
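// For example, for v4i32 the UNPCKL blend mask is <0, 4, 1, 5> and the UNPCKH
// blend mask is <2, 6, 3, 7>.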
8914 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8915 ArrayRef<int> Mask, SDValue V1,
8916 SDValue V2, SelectionDAG &DAG) {
8917 SmallVector<int, 8> Unpckl;
8918 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8919 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8920 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8922 SmallVector<int, 8> Unpckh;
8923 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8924 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8925 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8927 // Commute and try again.
8928 ShuffleVectorSDNode::commuteMask(Unpckl);
8929 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8930 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8932 ShuffleVectorSDNode::commuteMask(Unpckh);
8933 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8934 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8939 // X86 has dedicated pack instructions that can handle specific truncation
8940 // operations: PACKSS and PACKUS.
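// For example, viewing two v8i16 inputs as a v16i8 shuffle, the binary pack
// pattern is <0, 2, 4, ..., 14, 16, 18, ..., 30>: the low byte of each word of
// V1 followed by the low byte of each word of V2.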
8941 static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
8942 SDValue &V2, unsigned &PackOpcode,
8943 ArrayRef<int> TargetMask,
8945 const X86Subtarget &Subtarget) {
8946 unsigned NumElts = VT.getVectorNumElements();
8947 unsigned BitSize = VT.getScalarSizeInBits();
8948 MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
8949 MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
8951 auto MatchPACK = [&](SDValue N1, SDValue N2) {
8952 SDValue VV1 = DAG.getBitcast(PackVT, N1);
8953 SDValue VV2 = DAG.getBitcast(PackVT, N2);
8954 if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
8955 (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
8959 PackOpcode = X86ISD::PACKSS;
8963 if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
8964 APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
8965 if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
8966 (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
8970 PackOpcode = X86ISD::PACKUS;
8978 // Try binary shuffle.
8979 SmallVector<int, 32> BinaryMask;
8980 createPackShuffleMask(VT, BinaryMask, false);
8981 if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
8982 if (MatchPACK(V1, V2))
8985 // Try unary shuffle.
8986 SmallVector<int, 32> UnaryMask;
8987 createPackShuffleMask(VT, UnaryMask, true);
8988 if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
8989 if (MatchPACK(V1, V1))
8995 static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
8996 ArrayRef<int> Mask, SDValue V1,
8997 SDValue V2, SelectionDAG &DAG,
8998 const X86Subtarget &Subtarget) {
9000 unsigned PackOpcode;
9001 if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
9003 return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
9004 DAG.getBitcast(PackVT, V2));
9009 /// \brief Try to emit a bitmask instruction for a shuffle.
9011 /// This handles cases where we can model a blend exactly as a bitmask due to
9012 /// one of the inputs being zeroable.
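/// For example, a v4i32 shuffle of V1 with an all-zeros V2 and the mask
/// <0, 5, 2, 7> (elements 1 and 3 are zeroable) can be emitted as
/// V1 & <-1, 0, -1, 0>.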
9013 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
9014 SDValue V2, ArrayRef<int> Mask,
9015 const APInt &Zeroable,
9016 SelectionDAG &DAG) {
9017 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
9018 MVT EltVT = VT.getVectorElementType();
9019 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9020 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9021 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
9023 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9026 if (Mask[i] % Size != i)
9027 return SDValue(); // Not a blend.
9029 V = Mask[i] < Size ? V1 : V2;
9030 else if (V != (Mask[i] < Size ? V1 : V2))
9031 return SDValue(); // Can only let one input through the mask.
9033 VMaskOps[i] = AllOnes;
9036 return SDValue(); // No non-zeroable elements!
9038 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
9039 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
9042 /// \brief Try to emit a blend instruction for a shuffle using bit math.
9044 /// This is used as a fallback approach when first class blend instructions are
9045 /// unavailable. Currently it is only suitable for integer vectors, but could
9046 /// be generalized for floating point vectors if desirable.
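/// In other words, the result is computed as (V1 & M) | (~M & V2), where M has
/// all-ones elements wherever the mask selects from V1.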
9047 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
9048 SDValue V2, ArrayRef<int> Mask,
9049 SelectionDAG &DAG) {
9050 assert(VT.isInteger() && "Only supports integer vector types!");
9051 MVT EltVT = VT.getVectorElementType();
9052 SDValue Zero = DAG.getConstant(0, DL, EltVT);
9053 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
9054 SmallVector<SDValue, 16> MaskOps;
9055 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9056 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
9057 return SDValue(); // Shuffled input!
9058 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
9061 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
9062 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
9063 // We have to cast V2 around.
9064 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
9065 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
9066 DAG.getBitcast(MaskVT, V1Mask),
9067 DAG.getBitcast(MaskVT, V2)));
9068 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
9071 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
9072 SDValue PreservedSrc,
9073 const X86Subtarget &Subtarget,
9076 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
9077 MutableArrayRef<int> TargetMask,
9078 bool &ForceV1Zero, bool &ForceV2Zero,
9079 uint64_t &BlendMask) {
9080 bool V1IsZeroOrUndef =
9081 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
9082 bool V2IsZeroOrUndef =
9083 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
9086 ForceV1Zero = false, ForceV2Zero = false;
9087 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
9089 // Attempt to generate the binary blend mask. If an input is zero then
9090 // we can use any lane.
9091 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
9092 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
9093 int M = TargetMask[i];
9094 if (M == SM_SentinelUndef)
9098 if (M == i + Size) {
9099 BlendMask |= 1ull << i;
9102 if (M == SM_SentinelZero) {
9103 if (V1IsZeroOrUndef) {
9108 if (V2IsZeroOrUndef) {
9110 BlendMask |= 1ull << i;
9111 TargetMask[i] = i + Size;
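// Helper that widens a blend mask when each lane is split into Scale lanes,
// e.g. the 4-bit blend mask 0b0101 scaled by 2 becomes the 8-bit mask 0b00110011.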
9120 static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
9122 uint64_t ScaledMask = 0;
9123 for (int i = 0; i != Size; ++i)
9124 if (BlendMask & (1ull << i))
9125 ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
9129 /// \brief Try to emit a blend instruction for a shuffle.
9131 /// This doesn't do any checks for the availability of instructions for blending
9132 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
9133 /// be matched in the backend with the type given. What it does check for is
9134 /// that the shuffle mask is a blend, or convertible into a blend with zero.
9135 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
9136 SDValue V2, ArrayRef<int> Original,
9137 const APInt &Zeroable,
9138 const X86Subtarget &Subtarget,
9139 SelectionDAG &DAG) {
9140 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
9142 uint64_t BlendMask = 0;
9143 bool ForceV1Zero = false, ForceV2Zero = false;
9144 if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
9148 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
9150 V1 = getZeroVector(VT, Subtarget, DAG, DL);
9152 V2 = getZeroVector(VT, Subtarget, DAG, DL);
9154 switch (VT.SimpleTy) {
9159 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
9160 DAG.getConstant(BlendMask, DL, MVT::i8));
9164 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9168 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
9169 // that instruction.
9170 if (Subtarget.hasAVX2()) {
9171 // Scale the blend by the number of 32-bit dwords per element.
9172 int Scale = VT.getScalarSizeInBits() / 32;
9173 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9174 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
9175 V1 = DAG.getBitcast(BlendVT, V1);
9176 V2 = DAG.getBitcast(BlendVT, V2);
9177 return DAG.getBitcast(
9178 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
9179 DAG.getConstant(BlendMask, DL, MVT::i8)));
9183 // For integer shuffles we need to expand the mask and cast the inputs to
9184 // v8i16s prior to blending.
9185 int Scale = 8 / VT.getVectorNumElements();
9186 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
9187 V1 = DAG.getBitcast(MVT::v8i16, V1);
9188 V2 = DAG.getBitcast(MVT::v8i16, V2);
9189 return DAG.getBitcast(VT,
9190 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
9191 DAG.getConstant(BlendMask, DL, MVT::i8)));
9195 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
9196 SmallVector<int, 8> RepeatedMask;
9197 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
9198 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
9199 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
9201 for (int i = 0; i < 8; ++i)
9202 if (RepeatedMask[i] >= 8)
9203 BlendMask |= 1ull << i;
9204 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
9205 DAG.getConstant(BlendMask, DL, MVT::i8));
9211 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
9212 "256-bit byte-blends require AVX2 support!");
9214 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
9216 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9217 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9218 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9221 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
9222 if (SDValue Masked =
9223 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
9226 // Scale the blend by the number of bytes per element.
9227 int Scale = VT.getScalarSizeInBits() / 8;
9229 // This form of blend is always done on bytes. Compute the byte vector type.
9231 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9233 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
9234 // mix of LLVM's code generator and the x86 backend. We tell the code
9235 // generator that boolean values in the elements of an x86 vector register
9236 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
9237 // mapping a select to operand #1, and 'false' mapping to operand #2. The
9238 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
9239 // of the element (the remaining are ignored) and 0 in that high bit would
9240 // mean operand #1 while 1 in the high bit would mean operand #2. So while
9241 // the LLVM model for boolean values in vector elements gets the relevant
9242 // bit set, it is set backwards and over-constrained relative to x86's model.
9244 SmallVector<SDValue, 32> VSELECTMask;
9245 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9246 for (int j = 0; j < Scale; ++j)
9247 VSELECTMask.push_back(
9248 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
9249 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
9252 V1 = DAG.getBitcast(BlendVT, V1);
9253 V2 = DAG.getBitcast(BlendVT, V2);
9254 return DAG.getBitcast(
9256 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
9266 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
9267 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
9268 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
9271 llvm_unreachable("Not a supported integer vector type!");
9275 /// \brief Try to lower as a blend of elements from two inputs followed by
9276 /// a single-input permutation.
9278 /// This matches the pattern where we can blend elements from two inputs and
9279 /// then reduce the shuffle to a single-input permutation.
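/// For example, the v4i32 mask <5, 0, 7, 2> can be lowered as a blend with mask
/// <0, 5, 2, 7> followed by a single-input permute with mask <1, 0, 3, 2>.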
9280 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
9281 SDValue V1, SDValue V2,
9283 SelectionDAG &DAG) {
9284 // We build up the blend mask while checking whether a blend is a viable way
9285 // to reduce the shuffle.
9286 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9287 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9289 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
9293 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9295 if (BlendMask[Mask[i] % Size] < 0)
9296 BlendMask[Mask[i] % Size] = Mask[i];
9297 else if (BlendMask[Mask[i] % Size] != Mask[i])
9298 return SDValue(); // Can't blend in the needed input!
9300 PermuteMask[i] = Mask[i] % Size;
9303 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9304 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9307 /// \brief Generic routine to decompose a shuffle and blend into independent
9308 /// blends and permutes.
9310 /// This matches the extremely common pattern for handling combined
9311 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
9312 /// operations. It will try to pick the best arrangement of shuffles and blends.
9314 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9318 SelectionDAG &DAG) {
9319 // Shuffle the input elements into the desired positions in V1 and V2 and
9320 // blend them together.
9321 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9322 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9323 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9324 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9325 if (Mask[i] >= 0 && Mask[i] < Size) {
9326 V1Mask[i] = Mask[i];
9328 } else if (Mask[i] >= Size) {
9329 V2Mask[i] = Mask[i] - Size;
9330 BlendMask[i] = i + Size;
9333 // Try to lower with the simpler initial blend strategy unless one of the
9334 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9335 // shuffle may be able to fold with a load or other benefit. However, when
9336 // we'll have to do 2x as many shuffles in order to achieve this, blending
9337 // first is a better strategy.
9338 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9339 if (SDValue BlendPerm =
9340 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
9343 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9344 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9345 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9348 /// \brief Try to lower a vector shuffle as a rotation.
9350 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9351 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9352 ArrayRef<int> Mask) {
9353 int NumElts = Mask.size();
9355 // We need to detect various ways of spelling a rotation:
9356 // [11, 12, 13, 14, 15, 0, 1, 2]
9357 // [-1, 12, 13, 14, -1, -1, 1, -1]
9358 // [-1, -1, -1, -1, -1, -1, 1, 2]
9359 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9360 // [-1, 4, 5, 6, -1, -1, 9, -1]
9361 // [-1, 4, 5, 6, -1, -1, -1, -1]
9364 for (int i = 0; i < NumElts; ++i) {
9366 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
9367 "Unexpected mask index.");
9371 // Determine where a rotated vector would have started.
9372 int StartIdx = i - (M % NumElts);
9374 // The identity rotation isn't interesting, stop.
9377 // If we found the tail of a vector the rotation must be the missing
9378 // front. If we found the head of a vector, it must be how much of the
9380 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
9383 Rotation = CandidateRotation;
9384 else if (Rotation != CandidateRotation)
9385 // The rotations don't match, so we can't match this mask.
9388 // Compute which value this mask is pointing at.
9389 SDValue MaskV = M < NumElts ? V1 : V2;
9391 // Compute which of the two target values this index should be assigned
9392 // to. This reflects whether the high elements are remaining or the low
9393 // elements are remaining.
9394 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9396 // Either set up this value if we've not encountered it before, or check
9397 // that it remains consistent.
9400 else if (TargetV != MaskV)
9401 // This may be a rotation, but it pulls from the inputs in some
9402 // unsupported interleaving.
9406 // Check that we successfully analyzed the mask, and normalize the results.
9407 assert(Rotation != 0 && "Failed to locate a viable rotation!");
9408 assert((Lo || Hi) && "Failed to find a rotated input vector!");
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
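  // For example, a rotation of 3 elements in a v8i16 lane (2 bytes per
  // element) becomes a PALIGNR byte rotation of 3 * 2 == 6.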
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vectors of the correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation.
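  // Emulate PALIGNR: the low bytes of the result come from Hi shifted right
  // by the rotation, the wrap-around bytes come from Lo shifted left by the
  // remaining distance, and the two halves are ORed together.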
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
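  // For example, the v8i16 mask [zz, 0, 1, 2, 3, 4, 5, 6] matches at
  // Scale == 8 and Shift == 1: the whole 128-bit lane moves left by one
  // 16-bit element, which MatchShift emits as a 2-byte VSHLDQ (PSLLDQ).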
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // No match.
  return -1;
}
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
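  // EXTRQ's bit-length and bit-index immediates are 6-bit fields, and a full
  // 64-bit length is encoded as 0, so masking with 0x3f here encodes every
  // valid extraction.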
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
// INSERTQ: Extract lowest Len elements from lower half of second source and
// insert over first source, starting at Idx.
// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
                                        ArrayRef<int> Mask, uint64_t &BitLen,
                                        uint64_t &BitIdx) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  for (int Idx = 0; Idx != HalfSize; ++Idx) {
    SDValue Base;

    // Attempt to match first source from mask before insertion point.
    if (isUndefInRange(Mask, 0, Idx)) {
      /* EMPTY */
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
      Base = V1;
    } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
      Base = V2;
    } else {
      continue;
    }

    // Extend the extraction length looking to match both the insertion of
    // the second source and the remaining elements of the first.
    for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
      SDValue Insert;
      int Len = Hi - Idx;

      // Match insertion.
      if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
        Insert = V1;
      } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
        Insert = V2;
      } else {
        continue;
      }

      // Match the remaining elements of the lower half.
      if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
        /* EMPTY */
      } else if ((!Base || (Base == V1)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
        Base = V1;
      } else if ((!Base || (Base == V2)) &&
                 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
                                            Size + Hi)) {
        Base = V2;
      } else {
        continue;
      }

      BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
      BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
      V1 = Base;
      V2 = Insert;
      return true;
    }
  }

  return false;
}
/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  uint64_t BitLen, BitIdx;
  if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
    return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
    return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
                       V2 ? V2 : DAG.getUNDEF(VT),
                       DAG.getConstant(BitLen, DL, MVT::i8),
                       DAG.getConstant(BitIdx, DL, MVT::i8));

  return SDValue();
}
/// \brief Lower a vector shuffle as a zero or any extension.
///
/// Given a specific number of elements, element bit width, and extension
/// stride, produce either a zero or any extension based on the available
/// features of the subtarget. The extended elements are consecutive and
/// can start from an offset element index in the input; to avoid excess
/// shuffling the offset must either be in the bottom lane or at the start
/// of a higher lane. All extended elements must come from a single input.
static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
    ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(Scale > 1 && "Need a scale to extend.");
  int EltBits = VT.getScalarSizeInBits();
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = 128 / EltBits;
  int OffsetLane = Offset / NumEltsPerLane;
  assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
         "Only 8, 16, and 32 bit elements can be extended.");
  assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
  assert(0 <= Offset && "Extension offset must be positive.");
  assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
         "Extension offset must be in the first lane or start an upper lane.");

  // Check that an index is in same lane as the base offset.
  auto SafeOffset = [&](int Idx) {
    return OffsetLane == (Idx / NumEltsPerLane);
  };

  // Shift along an input so that the offset base moves to the first element.
  auto ShuffleOffset = [&](SDValue V) {
    if (!Offset)
      return V;

    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = 0; i * Scale < NumElements; ++i) {
      int SrcIdx = i + Offset;
      ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
    }
    return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
  };

  // Found a valid zext mask! Try various lowering strategies based on the
  // input type and available ISA extensions.
  if (Subtarget.hasSSE41()) {
    // Not worth offsetting 128-bit vectors if scale == 2; a pattern using
    // PUNPCK will catch this in a later shuffle match.
    if (Offset && Scale == 2 && VT.is128BitVector())
      return SDValue();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                 NumElements / Scale);
    InputV = ShuffleOffset(InputV);
    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
    return DAG.getBitcast(VT, InputV);
  }

  assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");

  // For any extends we can cheat for larger element sizes and use shuffle
  // instructions that can fold with a load and/or copy.
  if (AnyExt && EltBits == 32) {
    int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
                         -1};
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                        DAG.getBitcast(MVT::v4i32, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
  }
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64-bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(LoIdx, DL, MVT::i8)));

    if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
        !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getConstant(EltBits, DL, MVT::i8),
                                DAG.getConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
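  // For example, zero-extending v16i8 to v2i64 (Scale == 8) would take three
  // rounds of unpacking but is a single PSHUFB; a control byte with the high
  // bit set (0x80) makes PSHUFB write a zero into that result byte.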
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      PSHUFBMask[i] = DAG.getConstant(
          (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
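  // Each unpack interleaves InputV with zeros (or undef for an any-extend),
  // doubling the element width; log2(Scale) rounds produce the full extension.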
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}
/// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering, it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
/// masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We no longer are in the anyext case.
        AnyExt = false;
        continue;
      }

      // Each of the base elements needs to be consecutive indices into the
      // same input vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input, we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
        DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
  };

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
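  // For a 128-bit vector of i8 this tries Scale == 8 (extend to v2i64), then
  // Scale == 4 (v4i32), and finally Scale == 2 (v8i16).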
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
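  // That zero-extending copy is exactly what X86ISD::VZEXT_MOVL denotes, so
  // a successful match below lowers to a single instruction.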
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}
/// \brief Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts shift the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}
/// \brief Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  V = peekThroughBitcasts(V);
  return ISD::isNON_EXTLoad(V.getNode());
}
/// \brief Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern for which we have especially efficient lowering
/// patterns across all subtarget feature sets.
static SDValue lowerVectorShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || EltVT == MVT::i16) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions.
      if (!IsV1Zeroable)
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't support integer vectors or non-zero targets cheaply, and
    // the V1 elements can't be permuted in any way.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD or MOVSS.
    assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
           "Only two types of floating point element types to handle!");
    return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
                       ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the inputs are zero.
    if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
                          DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
                              DAG.getDataLayout(), VT)));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}
/// Try to lower broadcast of a single - truncated - integer element,
/// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
                                                  SDValue V0, int BroadcastIdx,
                                                  const X86Subtarget &Subtarget,
                                                  SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  EVT EltVT = VT.getVectorElementType();
  EVT V0VT = V0.getValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  EVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL,
                                         Scalar.getValueType()));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}
/// \brief Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
                                             SDValue V1, SDValue V2,
                                             ArrayRef<int> Mask,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumElts = Mask.size();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = -1;
  for (int i = 0; i != (int)NumElts; ++i) {
    SmallVector<int, 8> BroadcastMask(NumElts, i);
    if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
      BroadcastIdx = i;
      break;
    }
  }

  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      SDValue VSrc = V.getOperand(0);
      MVT SrcVT = VSrc.getSimpleValueType();
      if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
        break;
      V = VSrc;
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OperandSize = Mask.size() / V.getNumOperands();
      V = V.getOperand(BroadcastIdx / OperandSize);
      BroadcastIdx %= OperandSize;
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
      if (!ConstantIdx)
        break;

      int BeginIdx = (int)ConstantIdx->getZExtValue();
      int EndIdx =
          BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
      if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
        BroadcastIdx -= BeginIdx;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // First, look through bitcast: if the original value has a larger element
  // type than the shuffle, the broadcast element is in essence truncated.
  // Make that explicit to ease folding.
  if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
    if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
            DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  MVT BroadcastVT = VT;

  // Peek through any bitcast (only useful for loads).
  SDValue BC = peekThroughBitcasts(V);

  // Also check the simpler case, where we can directly reuse the scalar.
  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
    // 32-bit targets need to load i64 as a f64 and then bitcast the result.
    if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
      BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
      Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
                   ? X86ISD::MOVDDUP
                   : Opcode;
    }

    // If we are broadcasting a load that is only used by the shuffle
    // then we can reduce the vector load to the broadcasted scalar load.
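    // Narrowing the load is safe here because MayFoldLoad requires the load
    // to have a single use, and makeEquivalentMemoryOrdering below keeps the
    // rest of the memory chain ordered against the new scalar load.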
    LoadSDNode *Ld = cast<LoadSDNode>(BC);
    SDValue BaseAddr = Ld->getOperand(1);
    EVT SVT = BroadcastVT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BroadcastIdx != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    unsigned EltSize = VT.getScalarSizeInBits();
    if (((BroadcastIdx * EltSize) % 128) != 0)
      return SDValue();

    // The shuffle input might have been a bitcast we looked through; look at
    // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
    // later bitcast it to BroadcastVT.
    MVT SrcVT = V.getSimpleValueType();
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
           "Unexpected vector size");

    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
    V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                    DAG.getIntPtrConstant(BroadcastIdx, DL));
  }

  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                    DAG.getBitcast(MVT::f64, V));

  // Bitcast back to the same scalar type as BroadcastVT.
  MVT SrcVT = V.getSimpleValueType();
  if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
           "Unexpected vector element size");
    if (SrcVT.isVector()) {
      unsigned NumSrcElts = SrcVT.getVectorNumElements();
      SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
    } else {
      SrcVT = BroadcastVT.getScalarType();
    }
    V = DAG.getBitcast(SrcVT, V);
  }

  // 32-bit targets need to load i64 as a f64 and then bitcast the result.
  if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
    V = DAG.getBitcast(MVT::f64, V);
    unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
    BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128-bits, removing as many bitcasts as possible.
  if (SrcVT.getSizeInBits() > 128)
    V = extract128BitVector(V, 0, DAG, DL);

  return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can just always use two SHUFPS instructions which
// are much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                         unsigned &InsertPSMask,
                                         const APInt &Zeroable,
                                         ArrayRef<int> Mask,
                                         SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion - so remove V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
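    // INSERTPS's immediate encodes the source element in bits [7:6], the
    // destination element in bits [5:4], and the zero mask in bits [3:0].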
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}
static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            const APInt &Zeroable,
                                            SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask;
  if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
/// \brief Try to lower a shuffle as a permute of the inputs followed by an
/// UNPCK instruction.
///
/// This specifically targets cases where we end up with alternating between
/// the two inputs, and so can permute them into something that feeds a single
/// UNPCK instruction. Note that this routine only targets integer vectors
/// because for floating point vectors we have a generalized SHUFPS lowering
/// strategy that handles everything that doesn't *exactly* match an unpack,
/// making this clever lowering unnecessary.
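///
/// For example, the v4i32 mask [0, 6, 1, 7] is handled by shuffling V2's
/// elements 2 and 3 down into its low half and then issuing a single
/// PUNPCKLDQ to interleave them with V1's elements 0 and 1.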
static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
                                                    SDValue V1, SDValue V2,
                                                    ArrayRef<int> Mask,
                                                    SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() &&
         "This routine only supports integer vectors.");
  assert(VT.is128BitVector() &&
         "This routine only works on 128-bit vectors.");
  assert(!V2.isUndef() &&
         "This routine should only be used when blending two inputs.");
  assert(Mask.size() >= 2 && "Single element masks are invalid.");

  int Size = Mask.size();

  int NumLoInputs =
      count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
  int NumHiInputs =
      count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });

  bool UnpackLo = NumLoInputs >= NumHiInputs;

  auto TryUnpack = [&](int ScalarSize, int Scale) {
    SmallVector<int, 16> V1Mask((unsigned)Size, -1);
    SmallVector<int, 16> V2Mask((unsigned)Size, -1);

    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      // Each element of the unpack contains Scale elements from this mask.
      int UnpackIdx = i / Scale;

      // We only handle the case where V1 feeds the first slots of the unpack.
      // We rely on canonicalization to ensure this is the case.
      if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
        return SDValue();

      // Setup the mask for this input. The indexing is tricky as we have to
      // handle the unpack stride.
      SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
      VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
          Mask[i] % Size;
    }

    // If we will have to shuffle both inputs to use the unpack, check whether
    // we can just unpack first and shuffle the result. If so, skip this unpack.
    if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
        !isNoopShuffleMask(V2Mask))
      return SDValue();

    // Shuffle the inputs into place.
    V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
    V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);

    // Cast the inputs to the type we will use to unpack them.
    MVT UnpackVT =
        MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
    V1 = DAG.getBitcast(UnpackVT, V1);
    V2 = DAG.getBitcast(UnpackVT, V2);

    // Unpack the inputs and cast the result back to the desired type.
    return DAG.getBitcast(
        VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                        UnpackVT, V1, V2));
  };

  // We try each unpack from the largest to the smallest to try and find one
  // that fits this mask.
  int OrigScalarSize = VT.getScalarSizeInBits();
  for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
    if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
      return Unpack;

  // If none of the unpack-rooted lowerings worked (or were profitable) try an
  // initial unpack.
  if (NumLoInputs == 0 || NumHiInputs == 0) {
    assert((NumLoInputs > 0 || NumHiInputs > 0) &&
           "We have to have *some* inputs!");
    int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;

    // FIXME: We could consider the total complexity of the permute of each
    // possible unpacking. Or at the least we should consider how many
    // half-crossings are created.
    // FIXME: We could consider commuting the unpacks.

    SmallVector<int, 32> PermMask((unsigned)Size, -1);
    for (int i = 0; i < Size; ++i) {
      if (Mask[i] < 0)
        continue;

      assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");

      PermMask[i] =
          2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
    }
    return DAG.getVectorShuffle(
        VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
                            DL, VT, V1, V2),
        DAG.getUNDEF(VT), PermMask);
  }

  return SDValue();
}
/// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though so
/// it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. Simulate this by using the
    // single input as both of the "inputs" to this instruction.
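    // SHUFPD's immediate uses bit 0 to pick element 0 or 1 of the first
    // operand for the low result element and bit 1 to pick element 0 or 1 of
    // the second operand for the high result element.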
    unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);

    if (Subtarget.hasAVX()) {
      // If we have AVX, we can use VPERMILPS which will allow folding a load
      // into the shuffle.
      return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
                         DAG.getConstant(SHUFPDMask, DL, MVT::i8));
    }

    return DAG.getNode(
        X86ISD::SHUFP, DL, MVT::v2f64,
        Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
        DAG.getConstant(SHUFPDMask, DL, MVT::i8));
  }
  assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
  assert(Mask[1] >= 2 && "Non-canonicalized blend!");

  // If we have a single input, insert that into V1 if we can do so cheaply.
  if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return Insertion;
    // Try inverting the insertion since for v2 masks it is easy to do and we
    // can't reliably sort the mask one way or the other.
    int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
                          Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
    if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
      return Insertion;
  }

  // Try to use one of the special instruction patterns to handle two common
  // blend patterns if a zero-blend above didn't work.
  if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
      isShuffleEquivalent(V1, V2, Mask, {1, 3}))
    if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
      // We can either use a special instruction to load over the low double or
      // to move just the low double.
      return DAG.getNode(
          isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
          DL, MVT::v2f64, V2,
          DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));

  if (Subtarget.hasSSE41())
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
    return V;

  unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
  return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
                     DAG.getConstant(SHUFPDMask, DL, MVT::i8));
}
/// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
/// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
/// the integer unit to minimize domain crossing penalties. However, for blends
/// it falls back to the floating point shuffle operation with appropriate bit
/// masking.
static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");

  if (V2.isUndef()) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Straight shuffle of a single input vector. For everything from SSE2
    // onward this has a single fast instruction with no scary immediates.
    // We have to map the mask as it is actually a v4i32 shuffle instruction.
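    // Each 64-bit lane becomes a pair of 32-bit lanes; e.g. the v2i64 mask
    // [1, 0] widens to the v4i32 PSHUFD mask [2, 3, 0, 1].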
    V1 = DAG.getBitcast(MVT::v4i32, V1);
    int WidenedMask[4] = {
        std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
        std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
    return DAG.getBitcast(
        MVT::v2i64,
        DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
                    getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
  }
  assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
  assert(Mask[0] < 2 && "We sort V1 to be the first input.");
  assert(Mask[1] >= 2 && "We sort V2 to be the second input.");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // When loading a scalar and then shuffling it into a vector we can often do
  // the insertion cheaply.
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return Insertion;
  // Try inverting the insertion since for v2 masks it is easy to do and we
  // can't reliably sort the mask one way or the other.
  int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
  if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
          DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
    return Insertion;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
  if (Subtarget.hasSSSE3()) {
    if (Subtarget.hasVLX())
      if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Rotate;

    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
            DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
      return Rotate;
  }

  // If we have direct support for blends, we should lower by decomposing into
  // a permute. That will be faster than the domain cross.
  if (IsBlendSupported)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
                                                      Mask, DAG);

  // We implement this with SHUFPD which is pretty lame because it will likely
  // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
  // However, all the alternatives are still more cycles and newer chips don't
  // have this problem. It would be really nice if x86 had better shuffles here.
  V1 = DAG.getBitcast(MVT::v2f64, V1);
  V2 = DAG.getBitcast(MVT::v2f64, V2);
  return DAG.getBitcast(MVT::v2i64,
                        DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
}
/// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
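  // For example, [0, 2, 4, 6] qualifies (low half from V1, high half from
  // V2) but [0, 4, 1, 5] does not, since its low half mixes both inputs.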
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}
10886 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10888 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10889 /// It makes no assumptions about whether this is the *best* lowering, it simply
10891 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10892 ArrayRef<int> Mask, SDValue V1,
10893 SDValue V2, SelectionDAG &DAG) {
10894 SDValue LowV = V1, HighV = V2;
10895 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10897 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10899 if (NumV2Elements == 1) {
10900 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10902 // Compute the index adjacent to V2Index and in the same half by toggling
10904 int V2AdjIndex = V2Index ^ 1;
10906 if (Mask[V2AdjIndex] < 0) {
10907 // Handles all the cases where we have a single V2 element and an undef.
10908 // This will only ever happen in the high lanes because we commute the
10909 // vector otherwise.
10911 std::swap(LowV, HighV);
10912 NewMask[V2Index] -= 4;
10914 // Handle the case where the V2 element ends up adjacent to a V1 element.
10915 // To make this work, blend them together as the first step.
10916 int V1Index = V2AdjIndex;
10917 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10918 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10919 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10921 // Now proceed to reconstruct the final blend as we have the necessary
10922 // high or low half formed.
10929 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10930 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10932 } else if (NumV2Elements == 2) {
10933 if (Mask[0] < 4 && Mask[1] < 4) {
10934 // Handle the easy case where we have V1 in the low lanes and V2 in the
10938 } else if (Mask[2] < 4 && Mask[3] < 4) {
10939 // We also handle the reversed case because this utility may get called
10940 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10941 // arrange things in the right direction.
10947 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10948 // trying to place elements directly, just blend them and set up the final
10949 // shuffle to place them.
10951 // The first two blend mask elements are for V1, the second two are for
10953 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10954 Mask[2] < 4 ? Mask[2] : Mask[3],
10955 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10956 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10957 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10958 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10960 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10961 // the shuffle.
10962 LowV = HighV = V1;
10963 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10964 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10965 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10966 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10967 }
10968 }
10969 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10970 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10971 }
10973 /// \brief Lower 4-lane 32-bit floating point shuffles.
10975 /// Uses instructions exclusively from the floating point unit to minimize
10976 /// domain crossing penalties, as these are sufficient to implement all v4f32
10977 /// shuffles.
10978 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10979 const APInt &Zeroable,
10980 SDValue V1, SDValue V2,
10981 const X86Subtarget &Subtarget,
10982 SelectionDAG &DAG) {
10983 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10984 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10985 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10987 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10989 if (NumV2Elements == 0) {
10990 // Check for being able to broadcast a single element.
10991 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10992 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10995 // Use even/odd duplicate instructions for masks that match their pattern.
10996 if (Subtarget.hasSSE3()) {
10997 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10998 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10999 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
11000 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
11003 if (Subtarget.hasAVX()) {
11004 // If we have AVX, we can use VPERMILPS which will allow folding a load
11005 // into the shuffle.
11006 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
11007 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11010 // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
11011 // in SSE1 because otherwise they are widened to v2f64 and never get here.
11012 if (!Subtarget.hasSSE2()) {
11013 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
11014 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
11015 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
11016 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
11019 // Otherwise, use a straight shuffle of a single input vector. We pass the
11020 // input vector to both operands to simulate this with a SHUFPS.
11021 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
11022 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11025 // There are special ways we can lower some single-element blends. However, we
11026 // have custom ways we can lower more complex single-element blends below that
11027 // we defer to if both this and BLENDPS fail to match, so restrict this to
11028 // when the V2 input is targeting element 0 of the mask -- that is the fast
11029 // case here.
11030 if (NumV2Elements == 1 && Mask[0] >= 4)
11031 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11032 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11035 if (Subtarget.hasSSE41()) {
11036 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
11037 Zeroable, Subtarget, DAG))
11040 // Use INSERTPS if we can complete the shuffle efficiently.
11042 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
11045 if (!isSingleSHUFPSMask(Mask))
11046 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
11047 DL, MVT::v4f32, V1, V2, Mask, DAG))
11051 // Use low/high mov instructions. These are only valid in SSE1 because
11052 // otherwise they are widened to v2f64 and never get here.
11053 if (!Subtarget.hasSSE2()) {
11054 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
11055 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
11056 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
11057 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
11060 // Use dedicated unpack instructions for masks that match their pattern.
11062 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
11065 // Otherwise fall back to a SHUFPS lowering strategy.
11066 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
11067 }
11069 /// \brief Lower 4-lane i32 vector shuffles.
11071 /// We try to handle these with integer-domain shuffles where we can, but for
11072 /// blends we use the floating point domain blend instructions.
11073 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11074 const APInt &Zeroable,
11075 SDValue V1, SDValue V2,
11076 const X86Subtarget &Subtarget,
11077 SelectionDAG &DAG) {
11078 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11079 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
11080 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
11082 // Whenever we can lower this as a zext, that instruction is strictly faster
11083 // than any alternative. It also allows us to fold memory operands into the
11084 // shuffle in many cases.
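// For instance, a v4i32 mask of the form <0, Z, 1, Z> (Z being a zeroable
// lane) is really a zero-extension of the low two elements; the zext path
// below can emit that as a single PMOVZXDQ-style node on SSE4.1, or as an
// unpack against zero on older subtargets.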
11085 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11086 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11089 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
11091 if (NumV2Elements == 0) {
11092 // Check for being able to broadcast a single element.
11093 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11094 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11097 // Straight shuffle of a single input vector. For everything from SSE2
11098 // onward this has a single fast instruction with no scary immediates.
11099 // We coerce the shuffle pattern to be compatible with UNPCK instructions
11100 // but we aren't actually going to use the UNPCK instruction because doing
11101 // so prevents folding a load into this instruction or making a copy.
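// For example, the mask <0, 0, 1, 1> is exactly what UNPCKLDQ would produce
// from (V1, V1), but emitting it as a PSHUFD keeps the operation single-source
// and load-foldable.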
11102 const int UnpackLoMask[] = {0, 0, 1, 1};
11103 const int UnpackHiMask[] = {2, 2, 3, 3};
11104 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
11105 Mask = UnpackLoMask;
11106 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
11107 Mask = UnpackHiMask;
11109 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
11110 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
11113 // Try to use shift instructions.
11114 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
11115 Zeroable, Subtarget, DAG))
11118 // There are special ways we can lower some single-element blends.
11119 if (NumV2Elements == 1)
11120 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11121 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
11124 // We have different paths for blend lowering, but they all must use the
11125 // *exact* same predicate.
11126 bool IsBlendSupported = Subtarget.hasSSE41();
11127 if (IsBlendSupported)
11128 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
11129 Zeroable, Subtarget, DAG))
11132 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
11136 // Use dedicated unpack instructions for masks that match their pattern.
11138 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
11141 // Try to use byte rotation instructions.
11142 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
11143 if (Subtarget.hasSSSE3()) {
11144 if (Subtarget.hasVLX())
11145 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
11146 Mask, Subtarget, DAG))
11149 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11150 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
11154 // Assume that a single SHUFPS is faster than an alternative sequence of
11155 // multiple instructions (even if the CPU has a domain penalty).
11156 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
11157 if (!isSingleSHUFPSMask(Mask)) {
11158 // If we have direct support for blends, we should lower by decomposing into
11159 // a permute. That will be faster than the domain cross.
11160 if (IsBlendSupported)
11161 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
11162 Mask, DAG);
11164 // Try to lower by permuting the inputs into an unpack instruction.
11165 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11166 DL, MVT::v4i32, V1, V2, Mask, DAG))
11167 return Unpack;
11168 }
11170 // We implement this with SHUFPS because it can blend from two vectors.
11171 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
11172 // up the inputs, bypassing domain shift penalties that we would incur if we
11173 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
11174 // relevant.
11175 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
11176 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
11177 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
11178 return DAG.getBitcast(MVT::v4i32, ShufPS);
11179 }
11181 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
11182 /// shuffle lowering, and the most complex part.
11184 /// The lowering strategy is to try to form pairs of input lanes which are
11185 /// targeted at the same half of the final vector, and then use a dword shuffle
11186 /// to place them onto the right half, and finally unpack the paired lanes into
11187 /// their final position.
11189 /// The exact breakdown of how to form these dword pairs and align them on the
11190 /// correct sides is really tricky. See the comments within the function for
11191 /// more of the details.
11193 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
11194 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
11195 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
11196 /// vector, form the analogous 128-bit 8-element Mask.
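///
/// As a rough illustration, the half-swapping mask <4, 5, 6, 7, 0, 1, 2, 3>
/// needs no word shuffles at all and collapses to a single PSHUFD with the
/// dword mask <2, 3, 0, 1>, while more tangled masks additionally need
/// PSHUFLW/PSHUFHW fix-ups before and after the dword move.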
11197 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
11198 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
11199 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11200 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
11201 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
11203 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
11204 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
11205 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
11207 SmallVector<int, 4> LoInputs;
11208 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
11209 std::sort(LoInputs.begin(), LoInputs.end());
11210 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
11211 SmallVector<int, 4> HiInputs;
11212 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
11213 std::sort(HiInputs.begin(), HiInputs.end());
11214 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
11215 int NumLToL =
11216 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
11217 int NumHToL = LoInputs.size() - NumLToL;
11218 int NumLToH =
11219 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
11220 int NumHToH = HiInputs.size() - NumLToH;
11221 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
11222 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
11223 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
11224 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
11226 // If we are splatting two values from one half - one to each half, then
11227 // we can shuffle that half so each is splatted to a dword, then splat those
11228 // to their respective halves.
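// For example (illustrative only), the mask <0, 0, 0, 0, 3, 3, 3, 3> first
// becomes the word pattern <0, 0, 3, 3, ...> via PSHUFLW and is then spread
// into both halves by splatting dwords 0 and 1 with PSHUFD.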
11229 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
11231 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
11232 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
11233 V = DAG.getNode(ShufWOp, DL, VT, V,
11234 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11235 V = DAG.getBitcast(PSHUFDVT, V);
11236 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
11237 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
11238 return DAG.getBitcast(VT, V);
11241 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
11242 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
11243 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
11244 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
11246 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
11247 // such inputs we can swap two of the dwords across the half mark and end up
11248 // with <=2 inputs to each half in each half. Once there, we can fall through
11249 // to the generic code below. For example:
11251 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11252 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
11254 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
11255 // and an existing 2-into-2 on the other half. In this case we may have to
11256 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
11257 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
11258 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
11259 // because any other situation (including a 3-into-1 or 1-into-3 in the other
11260 // half than the one we target for fixing) will be fixed when we re-enter this
11261 // path. We will also combine away any resulting sequence of PSHUFD
11262 // instructions into a single instruction. Here is an example of the tricky case:
11264 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
11265 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
11267 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
11269 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
11270 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
11272 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
11273 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
11275 // The result is fine to be handled by the generic logic.
11276 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
11277 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
11278 int AOffset, int BOffset) {
11279 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
11280 "Must call this with A having 3 or 1 inputs from the A half.");
11281 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
11282 "Must call this with B having 1 or 3 inputs from the B half.");
11283 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
11284 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
11286 bool ThreeAInputs = AToAInputs.size() == 3;
11288 // Compute the index of dword with only one word among the three inputs in
11289 // a half by taking the sum of the half with three inputs and subtracting
11290 // the sum of the actual three inputs. The difference is the remaining
11291 // non-input word.
11292 int ADWord, BDWord;
11293 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11294 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11295 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11296 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11297 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11298 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11299 int TripleNonInputIdx =
11300 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11301 TripleDWord = TripleNonInputIdx / 2;
11303 // We use xor with one to compute the adjacent DWord to whichever one the
11304 // single input is in.
11305 OneInputDWord = (OneInput / 2) ^ 1;
11307 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11308 // and BToA inputs. If there is also such a problem with the BToB and AToB
11309 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11310 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11311 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11312 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11313 // Compute how many inputs will be flipped by swapping these DWords. We need
11315 // to balance this to ensure we don't form a 3-1 shuffle in the other
11316 // half.
11317 int NumFlippedAToBInputs =
11318 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11319 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11320 int NumFlippedBToBInputs =
11321 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11322 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11323 if ((NumFlippedAToBInputs == 1 &&
11324 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11325 (NumFlippedBToBInputs == 1 &&
11326 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11327 // We choose whether to fix the A half or B half based on whether that
11328 // half has zero flipped inputs. At zero, we may not be able to fix it
11329 // with that half. We also bias towards fixing the B half because that
11330 // will more commonly be the high half, and we have to bias one way.
11331 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11332 ArrayRef<int> Inputs) {
11333 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11334 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11335 // Determine whether the free index is in the flipped dword or the
11336 // unflipped dword based on where the pinned index is. We use this bit
11337 // in an xor to conditionally select the adjacent dword.
11338 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11339 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11340 if (IsFixIdxInput == IsFixFreeIdxInput)
11341 FixFreeIdx += 1;
11342 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11343 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11344 "We need to be changing the number of flipped inputs!");
11345 int PSHUFHalfMask[] = {0, 1, 2, 3};
11346 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11347 V = DAG.getNode(
11348 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11349 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11350 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11352 for (int &M : Mask)
11353 if (M >= 0 && M == FixIdx)
11354 M = FixFreeIdx;
11355 else if (M >= 0 && M == FixFreeIdx)
11356 M = FixIdx;
11357 };
11358 if (NumFlippedBToBInputs != 0) {
11359 int BPinnedIdx =
11360 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11361 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11362 } else {
11363 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11364 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11365 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11366 }
11367 }
11368 }
11370 int PSHUFDMask[] = {0, 1, 2, 3};
11371 PSHUFDMask[ADWord] = BDWord;
11372 PSHUFDMask[BDWord] = ADWord;
11373 V = DAG.getBitcast(
11374 VT,
11375 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11376 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11378 // Adjust the mask to match the new locations of A and B.
11379 for (int &M : Mask)
11380 if (M >= 0 && M/2 == ADWord)
11381 M = 2 * BDWord + M % 2;
11382 else if (M >= 0 && M/2 == BDWord)
11383 M = 2 * ADWord + M % 2;
11385 // Recurse back into this routine to re-compute state now that this isn't
11386 // a 3 and 1 problem.
11387 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11388 DAG);
11389 };
11390 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11391 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11392 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11393 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11395 // At this point there are at most two inputs to the low and high halves from
11396 // each half. That means the inputs can always be grouped into dwords and
11397 // those dwords can then be moved to the correct half with a dword shuffle.
11398 // We use at most one low and one high word shuffle to collect these paired
11399 // inputs into dwords, and finally a dword shuffle to place them.
11400 int PSHUFLMask[4] = {-1, -1, -1, -1};
11401 int PSHUFHMask[4] = {-1, -1, -1, -1};
11402 int PSHUFDMask[4] = {-1, -1, -1, -1};
11404 // First fix the masks for all the inputs that are staying in their
11405 // original halves. This will then dictate the targets of the cross-half
11407 auto fixInPlaceInputs =
11408 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11409 MutableArrayRef<int> SourceHalfMask,
11410 MutableArrayRef<int> HalfMask, int HalfOffset) {
11411 if (InPlaceInputs.empty())
11412 return;
11413 if (InPlaceInputs.size() == 1) {
11414 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11415 InPlaceInputs[0] - HalfOffset;
11416 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11417 return;
11418 }
11419 if (IncomingInputs.empty()) {
11420 // Just fix all of the in place inputs.
11421 for (int Input : InPlaceInputs) {
11422 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11423 PSHUFDMask[Input / 2] = Input / 2;
11424 }
11425 return;
11426 }
11428 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11429 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11430 InPlaceInputs[0] - HalfOffset;
11431 // Put the second input next to the first so that they are packed into
11432 // a dword. We find the adjacent index by toggling the low bit.
11433 int AdjIndex = InPlaceInputs[0] ^ 1;
11434 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11435 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11436 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11437 };
11438 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11439 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11441 // Now gather the cross-half inputs and place them into a free dword of
11442 // their target half.
11443 // FIXME: This operation could almost certainly be simplified dramatically to
11444 // look more like the 3-1 fixing operation.
11445 auto moveInputsToRightHalf = [&PSHUFDMask](
11446 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11447 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11448 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11450 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11451 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11453 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11455 int LowWord = Word & ~1;
11456 int HighWord = Word | 1;
11457 return isWordClobbered(SourceHalfMask, LowWord) ||
11458 isWordClobbered(SourceHalfMask, HighWord);
11461 if (IncomingInputs.empty())
11464 if (ExistingInputs.empty()) {
11465 // Map any dwords with inputs from them into the right half.
11466 for (int Input : IncomingInputs) {
11467 // If the source half mask maps over the inputs, turn those into
11468 // swaps and use the swapped lane.
11469 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11470 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11471 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11472 Input - SourceOffset;
11473 // We have to swap the uses in our half mask in one sweep.
11474 for (int &M : HalfMask)
11475 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11477 else if (M == Input)
11478 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11480 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11481 Input - SourceOffset &&
11482 "Previous placement doesn't match!");
11484 // Note that this correctly re-maps both when we do a swap and when
11485 // we observe the other side of the swap above. We rely on that to
11486 // avoid swapping the members of the input list directly.
11487 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11490 // Map the input's dword into the correct half.
11491 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11492 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11494 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11496 "Previous placement doesn't match!");
11499 // And just directly shift any other-half mask elements to be same-half
11500 // as we will have mirrored the dword containing the element into the
11501 // same position within that half.
11502 for (int &M : HalfMask)
11503 if (M >= SourceOffset && M < SourceOffset + 4) {
11504 M = M - SourceOffset + DestOffset;
11505 assert(M >= 0 && "This should never wrap below zero!");
11510 // Ensure we have the input in a viable dword of its current half. This
11511 // is particularly tricky because the original position may be clobbered
11512 // by inputs being moved and *staying* in that half.
11513 if (IncomingInputs.size() == 1) {
11514 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11515 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11517 SourceHalfMask[InputFixed - SourceOffset] =
11518 IncomingInputs[0] - SourceOffset;
11519 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11521 IncomingInputs[0] = InputFixed;
11523 } else if (IncomingInputs.size() == 2) {
11524 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11525 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11526 // We have two non-adjacent or clobbered inputs we need to extract from
11527 // the source half. To do this, we need to map them into some adjacent
11528 // dword slot in the source mask.
11529 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11530 IncomingInputs[1] - SourceOffset};
11532 // If there is a free slot in the source half mask adjacent to one of
11533 // the inputs, place the other input in it. We use (Index XOR 1) to
11534 // compute an adjacent index.
11535 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11536 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11537 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11538 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11539 InputsFixed[1] = InputsFixed[0] ^ 1;
11540 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11541 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11542 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11543 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11544 InputsFixed[0] = InputsFixed[1] ^ 1;
11545 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11546 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11547 // The two inputs are in the same DWord but it is clobbered and the
11548 // adjacent DWord isn't used at all. Move both inputs to the free
11550 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11551 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11552 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11553 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11555 // The only way we hit this point is if there is no clobbering
11556 // (because there are no off-half inputs to this half) and there is no
11557 // free slot adjacent to one of the inputs. In this case, we have to
11558 // swap an input with a non-input.
11559 for (int i = 0; i < 4; ++i)
11560 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11561 "We can't handle any clobbers here!");
11562 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11563 "Cannot have adjacent inputs here!");
11565 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11566 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11568 // We also have to update the final source mask in this case because
11569 // it may need to undo the above swap.
11570 for (int &M : FinalSourceHalfMask)
11571 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11572 M = InputsFixed[1] + SourceOffset;
11573 else if (M == InputsFixed[1] + SourceOffset)
11574 M = (InputsFixed[0] ^ 1) + SourceOffset;
11576 InputsFixed[1] = InputsFixed[0] ^ 1;
11579 // Point everything at the fixed inputs.
11580 for (int &M : HalfMask)
11581 if (M == IncomingInputs[0])
11582 M = InputsFixed[0] + SourceOffset;
11583 else if (M == IncomingInputs[1])
11584 M = InputsFixed[1] + SourceOffset;
11586 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11587 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11590 llvm_unreachable("Unhandled input size!");
11593 // Now hoist the DWord down to the right half.
11594 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11595 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11596 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11597 for (int &M : HalfMask)
11598 for (int Input : IncomingInputs)
11599 if (M == Input)
11600 M = FreeDWord * 2 + Input % 2;
11601 };
11602 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11603 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11604 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11605 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11607 // Now enact all the shuffles we've computed to move the inputs into their
11609 if (!isNoopShuffleMask(PSHUFLMask))
11610 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11611 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11612 if (!isNoopShuffleMask(PSHUFHMask))
11613 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11614 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11615 if (!isNoopShuffleMask(PSHUFDMask))
11616 V = DAG.getBitcast(
11617 VT,
11618 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11619 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11621 // At this point, each half should contain all its inputs, and we can then
11622 // just shuffle them into their final position.
11623 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11624 "Failed to lift all the high half inputs to the low mask!");
11625 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11626 "Failed to lift all the low half inputs to the high mask!");
11628 // Do a half shuffle for the low mask.
11629 if (!isNoopShuffleMask(LoMask))
11630 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11631 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11633 // Do a half shuffle with the high mask after shifting its values down.
11634 for (int &M : HiMask)
11635 if (M >= 0)
11636 M -= 4;
11637 if (!isNoopShuffleMask(HiMask))
11638 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11639 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11641 return V;
11642 }
11644 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11645 /// blend if only one input is used.
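///
/// Roughly, each input is byte-shuffled so that its wanted bytes land in their
/// final positions while unwanted lanes are zeroed via the 0x80 index bit, and
/// the two partial results are then OR'd together when both inputs are live.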
11646 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11647 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11648 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11649 bool &V2InUse) {
11650 SDValue V1Mask[16];
11651 SDValue V2Mask[16];
11652 V1InUse = false;
11653 V2InUse = false;
11655 int Size = Mask.size();
11656 int Scale = 16 / Size;
11657 for (int i = 0; i < 16; ++i) {
11658 if (Mask[i / Scale] < 0) {
11659 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11660 } else {
11661 const int ZeroMask = 0x80;
11662 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11663 : ZeroMask;
11664 int V2Idx = Mask[i / Scale] < Size
11665 ? ZeroMask
11666 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11667 if (Zeroable[i / Scale])
11668 V1Idx = V2Idx = ZeroMask;
11669 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11670 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11671 V1InUse |= (ZeroMask != V1Idx);
11672 V2InUse |= (ZeroMask != V2Idx);
11673 }
11674 }
11676 if (V1InUse)
11677 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11678 DAG.getBitcast(MVT::v16i8, V1),
11679 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11680 if (V2InUse)
11681 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11682 DAG.getBitcast(MVT::v16i8, V2),
11683 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11685 // If we need shuffled inputs from both, blend the two.
11686 SDValue V;
11687 if (V1InUse && V2InUse)
11688 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11689 else
11690 V = V1InUse ? V1 : V2;
11692 // Cast the result back to the correct type.
11693 return DAG.getBitcast(VT, V);
11694 }
11696 /// \brief Generic lowering of 8-lane i16 shuffles.
11698 /// This handles both single-input shuffles and combined shuffle/blends with
11699 /// two inputs. The single input shuffles are immediately delegated to
11700 /// a dedicated lowering routine.
11702 /// The blends are lowered in one of three fundamental ways. If there are few
11703 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11704 /// of the input is significantly cheaper when lowered as an interleaving of
11705 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11706 /// halves of the inputs separately (making them have relatively few inputs)
11707 /// and then concatenate them.
11708 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11709 const APInt &Zeroable,
11710 SDValue V1, SDValue V2,
11711 const X86Subtarget &Subtarget,
11712 SelectionDAG &DAG) {
11713 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11714 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11715 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11717 // Whenever we can lower this as a zext, that instruction is strictly faster
11718 // than any alternative.
11719 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11720 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11723 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11725 if (NumV2Inputs == 0) {
11726 // Check for being able to broadcast a single element.
11727 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11728 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11731 // Try to use shift instructions.
11732 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11733 Zeroable, Subtarget, DAG))
11736 // Use dedicated unpack instructions for masks that match their pattern.
11738 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11741 // Use dedicated pack instructions for masks that match their pattern.
11742 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
11746 // Try to use byte rotation instructions.
11747 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11748 Mask, Subtarget, DAG))
11751 // Make a copy of the mask so it can be modified.
11752 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11753 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11754 MutableMask, Subtarget,
11755 DAG);
11756 }
11758 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11759 "All single-input shuffles should be canonicalized to be V1-input "
11762 // Try to use shift instructions.
11763 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11764 Zeroable, Subtarget, DAG))
11767 // See if we can use SSE4A Extraction / Insertion.
11768 if (Subtarget.hasSSE4A())
11769 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11773 // There are special ways we can lower some single-element blends.
11774 if (NumV2Inputs == 1)
11775 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11776 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11779 // We have different paths for blend lowering, but they all must use the
11780 // *exact* same predicate.
11781 bool IsBlendSupported = Subtarget.hasSSE41();
11782 if (IsBlendSupported)
11783 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11784 Zeroable, Subtarget, DAG))
11787 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11791 // Use dedicated unpack instructions for masks that match their pattern.
11793 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11796 // Use dedicated pack instructions for masks that match their pattern.
11797 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
11801 // Try to use byte rotation instructions.
11802 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11803 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11806 if (SDValue BitBlend =
11807 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11810 // Try to lower by permuting the inputs into an unpack instruction.
11811 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11815 // If we can't directly blend but can use PSHUFB, that will be better as it
11816 // can both shuffle and set up the inefficient blend.
11817 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11818 bool V1InUse, V2InUse;
11819 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11820 Zeroable, DAG, V1InUse, V2InUse);
11821 }
11823 // We can always bit-blend if we have to so the fallback strategy is to
11824 // decompose into single-input permutes and blends.
11825 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11826 Mask, DAG);
11827 }
11829 /// \brief Check whether a compaction lowering can be done by dropping even
11830 /// elements and compute how many times even elements must be dropped.
11832 /// This handles shuffles which take every (2^N)-th element, i.e. elements at a
11833 /// power-of-two stride. Example shuffle masks:
11835 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11836 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11837 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11838 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11839 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11840 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11842 /// Any of these lanes can of course be undef.
11844 /// This routine only supports N <= 3.
11845 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11848 /// \returns N above, or the number of times even elements must be dropped if
11849 /// there is such a number. Otherwise returns zero.
11850 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11851 bool IsSingleInput) {
11852 // The modulus for the shuffle vector entries is based on whether this is
11853 // a single input or not.
11854 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11855 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11856 "We should only be called with masks with a power-of-2 size!");
11858 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11860 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11861 // and 2^3 simultaneously. This is because we may have ambiguity with
11862 // partially undef inputs.
11863 bool ViableForN[3] = {true, true, true};
11865 for (int i = 0, e = Mask.size(); i < e; ++i) {
11866 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11867 // need.
11868 if (Mask[i] < 0)
11869 continue;
11871 bool IsAnyViable = false;
11872 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11873 if (ViableForN[j]) {
11874 uint64_t N = j + 1;
11876 // The shuffle mask must be equal to (i * 2^N) % M.
11877 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11878 IsAnyViable = true;
11879 else
11880 ViableForN[j] = false;
11881 }
11882 // Early exit if we exhaust the possible powers of two.
11883 if (!IsAnyViable)
11884 break;
11885 }
11887 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11888 if (ViableForN[j])
11889 return j + 1;
11891 // Return 0 as there is no viable power of two.
11892 return 0;
11893 }
11895 /// \brief Generic lowering of v16i8 shuffles.
11897 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11898 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11899 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11900 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11901 /// back together.
11902 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11903 const APInt &Zeroable,
11904 SDValue V1, SDValue V2,
11905 const X86Subtarget &Subtarget,
11906 SelectionDAG &DAG) {
11907 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11908 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11909 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11911 // Try to use shift instructions.
11912 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11913 Zeroable, Subtarget, DAG))
11916 // Try to use byte rotation instructions.
11917 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11918 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11921 // Use dedicated pack instructions for masks that match their pattern.
11922 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
11926 // Try to use a zext lowering.
11927 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11928 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11931 // See if we can use SSE4A Extraction / Insertion.
11932 if (Subtarget.hasSSE4A())
11933 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11937 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11939 // For single-input shuffles, there are some nicer lowering tricks we can use.
11940 if (NumV2Elements == 0) {
11941 // Check for being able to broadcast a single element.
11942 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11943 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11946 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11947 // Notably, this handles splat and partial-splat shuffles more efficiently.
11948 // However, it only makes sense if the pre-duplication shuffle simplifies
11949 // things significantly. Currently, this means we need to be able to
11950 // express the pre-duplication shuffle as an i16 shuffle.
11952 // FIXME: We should check for other patterns which can be widened into an
11953 // i16 shuffle as well.
11954 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11955 for (int i = 0; i < 16; i += 2)
11956 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11957 return false;
11959 return true;
11960 };
11961 auto tryToWidenViaDuplication = [&]() -> SDValue {
11962 if (!canWidenViaDuplication(Mask))
11963 return SDValue();
11964 SmallVector<int, 4> LoInputs;
11965 copy_if(Mask, std::back_inserter(LoInputs),
11966 [](int M) { return M >= 0 && M < 8; });
11967 std::sort(LoInputs.begin(), LoInputs.end());
11968 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11970 SmallVector<int, 4> HiInputs;
11971 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11972 std::sort(HiInputs.begin(), HiInputs.end());
11973 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11976 bool TargetLo = LoInputs.size() >= HiInputs.size();
11977 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11978 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11980 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11981 SmallDenseMap<int, int, 8> LaneMap;
11982 for (int I : InPlaceInputs) {
11983 PreDupI16Shuffle[I/2] = I/2;
11986 int j = TargetLo ? 0 : 4, je = j + 4;
11987 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11988 // Check if j is already a shuffle of this input. This happens when
11989 // there are two adjacent bytes after we move the low one.
11990 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11991 // If we haven't yet mapped the input, search for a slot into which
11993 while (j < je && PreDupI16Shuffle[j] >= 0)
11994 ++j;
11996 if (j == je)
11997 // We can't place the inputs into a single half with a simple i16
11997 // shuffle, so bail.
11998 return SDValue();
12000 // Map this input with the i16 shuffle.
12001 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
12004 // Update the lane map based on the mapping we ended up with.
12005 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
12007 V1 = DAG.getBitcast(
12009 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12010 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
12012 // Unpack the bytes to form the i16s that will be shuffled into place.
12013 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
12014 MVT::v16i8, V1, V1);
12016 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
12017 for (int i = 0; i < 16; ++i)
12018 if (Mask[i] >= 0) {
12019 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
12020 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
12021 if (PostDupI16Shuffle[i / 2] < 0)
12022 PostDupI16Shuffle[i / 2] = MappedMask;
12024 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
12025 "Conflicting entries in the original shuffle!");
12027 return DAG.getBitcast(
12029 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
12030 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
12032 if (SDValue V = tryToWidenViaDuplication())
12033 return V;
12034 }
12036 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
12040 // Use dedicated unpack instructions for masks that match their pattern.
12042 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
12045 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
12046 // with PSHUFB. It is important to do this before we attempt to generate any
12047 // blends but after all of the single-input lowerings. If the single input
12048 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
12049 // want to preserve that and we can DAG combine any longer sequences into
12050 // a PSHUFB in the end. But once we start blending from multiple inputs,
12051 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
12052 // and there are *very* few patterns that would actually be faster than the
12053 // PSHUFB approach because of its ability to zero lanes.
12055 // FIXME: The only exceptions to the above are blends which are exact
12056 // interleavings with direct instructions supporting them. We currently don't
12057 // handle those well here.
12058 if (Subtarget.hasSSSE3()) {
12059 bool V1InUse = false;
12060 bool V2InUse = false;
12062 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
12063 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
12065 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
12066 // do so. This avoids using them to handle blends-with-zero which is
12067 // important as a single pshufb is significantly faster for that.
12068 if (V1InUse && V2InUse) {
12069 if (Subtarget.hasSSE41())
12070 if (SDValue Blend = lowerVectorShuffleAsBlend(
12071 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12074 // We can use an unpack to do the blending rather than an or in some
12075 // cases. Even though the or may be (very minorly) more efficient, we
12076 // prefer this lowering because there are common cases where part of
12077 // the complexity of the shuffles goes away when we do the final blend as
12078 // an unpack.
12079 // FIXME: It might be worth trying to detect if the unpack-feeding
12080 // shuffles will both be pshufb, in which case we shouldn't bother with
12082 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
12083 DL, MVT::v16i8, V1, V2, Mask, DAG))
12090 // There are special ways we can lower some single-element blends.
12091 if (NumV2Elements == 1)
12092 if (SDValue V = lowerVectorShuffleAsElementInsertion(
12093 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12096 if (SDValue BitBlend =
12097 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
12100 // Check whether a compaction lowering can be done. This handles shuffles
12101 // which take every Nth element for some even N. See the helper function for
12104 // We special case these as they can be particularly efficiently handled with
12105 // the PACKUSWB instruction on x86 and they show up in common patterns of
12106 // rearranging bytes to truncate wide elements.
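// As a sketch of the N == 1 case: for a mask like <0, 2, 4, ..., 30> the
// 0x00FF word mask below clears every odd byte of both inputs, and a single
// PACKUSWB then emits the surviving even bytes of V1 followed by those of V2.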
12107 bool IsSingleInput = V2.isUndef();
12108 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
12109 // NumEvenDrops is the power of two stride of the elements. Another way of
12110 // thinking about it is that we need to drop the even elements this many
12111 // times to get the original input.
12113 // First we need to zero all the dropped bytes.
12114 assert(NumEvenDrops <= 3 &&
12115 "No support for dropping even elements more than 3 times.");
12116 // We use the mask type to pick which bytes are preserved based on how many
12117 // elements are dropped.
12118 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
12119 SDValue ByteClearMask = DAG.getBitcast(
12120 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
12121 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
12122 if (!IsSingleInput)
12123 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
12125 // Now pack things back together.
12126 V1 = DAG.getBitcast(MVT::v8i16, V1);
12127 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
12128 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
12129 for (int i = 1; i < NumEvenDrops; ++i) {
12130 Result = DAG.getBitcast(MVT::v8i16, Result);
12131 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
12132 }
12134 return Result;
12135 }
12137 // Handle multi-input cases by blending single-input shuffles.
12138 if (NumV2Elements > 0)
12139 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
12140 Mask, DAG);
12142 // The fallback path for single-input shuffles widens this into two v8i16
12143 // vectors with unpacks, shuffles those, and then pulls them back together
12144 // with a pack.
12145 SDValue V = V1;
12147 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12148 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
12149 for (int i = 0; i < 16; ++i)
12150 if (Mask[i] >= 0)
12151 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
12153 SDValue VLoHalf, VHiHalf;
12154 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
12155 // them out and avoid using UNPCK{L,H} to extract the elements of V as
12157 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
12158 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
12159 // Use a mask to drop the high bytes.
12160 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
12161 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
12162 DAG.getConstant(0x00FF, DL, MVT::v8i16));
12164 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
12165 VHiHalf = DAG.getUNDEF(MVT::v8i16);
12167 // Squash the masks to point directly into VLoHalf.
12168 for (int &M : LoBlendMask)
12169 if (M >= 0)
12170 M /= 2;
12171 for (int &M : HiBlendMask)
12172 if (M >= 0)
12173 M /= 2;
12174 } else {
12175 // Otherwise just unpack the low half of V into VLoHalf and the high half into
12176 // VHiHalf so that we can blend them as i16s.
12177 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
12179 VLoHalf = DAG.getBitcast(
12180 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
12181 VHiHalf = DAG.getBitcast(
12182 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
12183 }
12185 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
12186 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
12188 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
12189 }
12191 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
12193 /// This routine breaks down the specific type of 128-bit shuffle and
12194 /// dispatches to the lowering routines accordingly.
12195 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12196 MVT VT, SDValue V1, SDValue V2,
12197 const APInt &Zeroable,
12198 const X86Subtarget &Subtarget,
12199 SelectionDAG &DAG) {
12200 switch (VT.SimpleTy) {
12201 case MVT::v2i64:
12202 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12203 case MVT::v2f64:
12204 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12205 case MVT::v4i32:
12206 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12207 case MVT::v4f32:
12208 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12209 case MVT::v8i16:
12210 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12211 case MVT::v16i8:
12212 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12214 default:
12215 llvm_unreachable("Unimplemented!");
12216 }
12217 }
12219 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
12221 /// This routine just extracts two subvectors, shuffles them independently, and
12222 /// then concatenates them back together. This should work effectively with all
12223 /// AVX vector shuffle types.
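///
/// For example, a 256-bit v8i32 shuffle is rebuilt here as two independent
/// v4i32 shuffles of the extracted 128-bit halves, glued back together with a
/// CONCAT_VECTORS node.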
12224 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12225 SDValue V2, ArrayRef<int> Mask,
12226 SelectionDAG &DAG) {
12227 assert(VT.getSizeInBits() >= 256 &&
12228 "Only for 256-bit or wider vector shuffles!");
12229 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
12230 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
12232 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
12233 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
12235 int NumElements = VT.getVectorNumElements();
12236 int SplitNumElements = NumElements / 2;
12237 MVT ScalarVT = VT.getVectorElementType();
12238 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
12240 // Rather than splitting build-vectors, just build two narrower build
12241 // vectors. This helps shuffling with splats and zeros.
12242 auto SplitVector = [&](SDValue V) {
12243 V = peekThroughBitcasts(V);
12245 MVT OrigVT = V.getSimpleValueType();
12246 int OrigNumElements = OrigVT.getVectorNumElements();
12247 int OrigSplitNumElements = OrigNumElements / 2;
12248 MVT OrigScalarVT = OrigVT.getVectorElementType();
12249 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
12253 auto *BV = dyn_cast<BuildVectorSDNode>(V);
12255 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12256 DAG.getIntPtrConstant(0, DL));
12257 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
12258 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
12261 SmallVector<SDValue, 16> LoOps, HiOps;
12262 for (int i = 0; i < OrigSplitNumElements; ++i) {
12263 LoOps.push_back(BV->getOperand(i));
12264 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
12266 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
12267 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
12269 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
12270 DAG.getBitcast(SplitVT, HiV));
12273 SDValue LoV1, HiV1, LoV2, HiV2;
12274 std::tie(LoV1, HiV1) = SplitVector(V1);
12275 std::tie(LoV2, HiV2) = SplitVector(V2);
12277 // Now create two 4-way blends of these half-width vectors.
12278 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
12279 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
12280 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
12281 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
12282 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
12283 for (int i = 0; i < SplitNumElements; ++i) {
12284 int M = HalfMask[i];
12285 if (M >= NumElements) {
12286 if (M >= NumElements + SplitNumElements)
12287 UseHiV2 = true;
12288 else
12289 UseLoV2 = true;
12290 V2BlendMask[i] = M - NumElements;
12291 BlendMask[i] = SplitNumElements + i;
12292 } else if (M >= 0) {
12293 if (M >= SplitNumElements)
12294 UseHiV1 = true;
12295 else
12296 UseLoV1 = true;
12297 V1BlendMask[i] = M;
12298 BlendMask[i] = i;
12299 }
12300 }
12302 // Because the lowering happens after all combining takes place, we need to
12303 // manually combine these blend masks as much as possible so that we create
12304 // a minimal number of high-level vector shuffle nodes.
12306 // First try just blending the halves of V1 or V2.
12307 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12308 return DAG.getUNDEF(SplitVT);
12309 if (!UseLoV2 && !UseHiV2)
12310 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12311 if (!UseLoV1 && !UseHiV1)
12312 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12314 SDValue V1Blend, V2Blend;
12315 if (UseLoV1 && UseHiV1) {
12316 V1Blend =
12317 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12318 } else {
12319 // We only use half of V1 so map the usage down into the final blend mask.
12320 V1Blend = UseLoV1 ? LoV1 : HiV1;
12321 for (int i = 0; i < SplitNumElements; ++i)
12322 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12323 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12324 }
12325 if (UseLoV2 && UseHiV2) {
12326 V2Blend =
12327 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12328 } else {
12329 // We only use half of V2 so map the usage down into the final blend mask.
12330 V2Blend = UseLoV2 ? LoV2 : HiV2;
12331 for (int i = 0; i < SplitNumElements; ++i)
12332 if (BlendMask[i] >= SplitNumElements)
12333 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12334 }
12335 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12336 };
12337 SDValue Lo = HalfBlend(LoMask);
12338 SDValue Hi = HalfBlend(HiMask);
12339 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12342 /// \brief Either split a vector in halves or decompose the shuffles and the
12343 /// blend.
12345 /// This is provided as a good fallback for many lowerings of non-single-input
12346 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12347 /// between splitting the shuffle into 128-bit components and stitching those
12348 /// back together vs. extracting the single-input shuffles and blending those
12349 /// results.
12350 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12351 SDValue V1, SDValue V2,
12352 ArrayRef<int> Mask,
12353 SelectionDAG &DAG) {
12354 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12355 "shuffles as it could then recurse on itself.");
12356 int Size = Mask.size();
12358 // If this can be modeled as a broadcast of two elements followed by a blend,
12359 // prefer that lowering. This is especially important because broadcasts can
12360 // often fold with memory operands.
12361 auto DoBothBroadcast = [&] {
12362 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12363 for (int M : Mask)
12364 if (M >= Size) {
12365 if (V2BroadcastIdx < 0)
12366 V2BroadcastIdx = M - Size;
12367 else if (M - Size != V2BroadcastIdx)
12368 return false;
12369 } else if (M >= 0) {
12370 if (V1BroadcastIdx < 0)
12371 V1BroadcastIdx = M;
12372 else if (M != V1BroadcastIdx)
12373 return false;
12374 }
12375 return true;
12376 };
12377 if (DoBothBroadcast())
12378 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12379 DAG);
12381 // If the inputs all stem from a single 128-bit lane of each input, then we
12382 // split them rather than blending because the split will decompose to
12383 // unusually few instructions.
12384 int LaneCount = VT.getSizeInBits() / 128;
12385 int LaneSize = Size / LaneCount;
12386 SmallBitVector LaneInputs[2];
12387 LaneInputs[0].resize(LaneCount, false);
12388 LaneInputs[1].resize(LaneCount, false);
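// Record, for each input, which of its 128-bit lanes feed the result.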
12389 for (int i = 0; i < Size; ++i)
12390 if (Mask[i] >= 0)
12391 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12392 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12393 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12395 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12396 // that the decomposed single-input shuffles don't end up here.
12397 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12400 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12401 /// a permutation and blend of those lanes.
12403 /// This essentially blends the out-of-lane inputs to each lane into the lane
12404 /// from a permuted copy of the vector. This lowering strategy results in four
12405 /// instructions in the worst case for a single-input cross lane shuffle which
12406 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12407 /// of. Special cases for each particular shuffle pattern should be handled
12408 /// prior to trying this lowering.
12409 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12410 SDValue V1, SDValue V2,
12411 ArrayRef<int> Mask,
12413 const X86Subtarget &Subtarget) {
12414 // FIXME: This should probably be generalized for 512-bit vectors as well.
12415 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12416 int Size = Mask.size();
12417 int LaneSize = Size / 2;
12419 // If there are only inputs from one 128-bit lane, splitting will in fact be
12420 // less expensive. The flags track whether the given lane contains an element
12421 // that crosses to another lane.
12422 if (!Subtarget.hasAVX2()) {
12423 bool LaneCrossing[2] = {false, false};
12424 for (int i = 0; i < Size; ++i)
12425 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12426 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12427 if (!LaneCrossing[0] || !LaneCrossing[1])
12428 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12429 } else {
12430 bool LaneUsed[2] = {false, false};
12431 for (int i = 0; i < Size; ++i)
12432 if (Mask[i] >= 0)
12433 LaneUsed[(Mask[i] / LaneSize)] = true;
12434 if (!LaneUsed[0] || !LaneUsed[1])
12435 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12436 }
12438 assert(V2.isUndef() &&
12439 "This last part of this routine only works on single input shuffles");
12441 SmallVector<int, 32> FlippedBlendMask(Size);
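// Build a blend mask over V1 and a lane-flipped copy of V1: elements that
// are already in the correct lane keep their index, while cross-lane
// elements are taken from the flipped copy (indices offset by Size).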
12442 for (int i = 0; i < Size; ++i)
12443 FlippedBlendMask[i] =
12444 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12446 : Mask[i] % LaneSize +
12447 (i / LaneSize) * LaneSize + Size);
12449 // Flip the vector, and blend the results which should now be in-lane.
12450 MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
12451 SDValue Flipped = DAG.getBitcast(PVT, V1);
12452 Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
12453 {2, 3, 0, 1});
12454 Flipped = DAG.getBitcast(VT, Flipped);
12455 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12458 /// \brief Handle lowering 2-lane 128-bit shuffles.
12459 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12460 SDValue V2, ArrayRef<int> Mask,
12461 const APInt &Zeroable,
12462 const X86Subtarget &Subtarget,
12463 SelectionDAG &DAG) {
12464 // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
12465 if (Subtarget.hasAVX2() && V2.isUndef())
12468 SmallVector<int, 4> WidenedMask;
12469 if (!canWidenShuffleElements(Mask, WidenedMask))
12472 // TODO: If minimizing size and one of the inputs is a zero vector and the
12473 // zero vector has only one use, we could use a VPERM2X128 to save the
12474 // instruction bytes needed to explicitly generate the zero vector.
12476 // Blends are faster and handle all the non-lane-crossing cases.
12477 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12478 Zeroable, Subtarget, DAG))
12481 bool IsLowZero = (Zeroable & 0x3) == 0x3;
12482 bool IsHighZero = (Zeroable & 0xc) == 0xc;
12484 // If either input operand is a zero vector, use VPERM2X128 because its mask
12485 // allows us to replace the zero input with an implicit zero.
12486 if (!IsLowZero && !IsHighZero) {
12487 // Check for patterns which can be matched with a single insert of a 128-bit
12488 // subvector.
12489 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12490 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12492 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12493 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12494 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12495 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12496 VT.getVectorNumElements() / 2);
12497 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12498 DAG.getIntPtrConstant(0, DL));
12499 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12500 OnlyUsesV1 ? V1 : V2,
12501 DAG.getIntPtrConstant(0, DL));
12502 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12503 }
12504 }
12506 // Try to use SHUF128 if possible.
12507 if (Subtarget.hasVLX()) {
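// The two-source SHUF128 takes its low 128-bit half from the first operand
// and its high half from the second, so it only applies when the low
// widened element selects from V1 and the high one selects from V2.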
12508 if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
12509 unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
12510 ((WidenedMask[1] % 2) << 1);
12511 return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
12512 DAG.getConstant(PermMask, DL, MVT::i8));
12513 }
12514 }
12515 }
12517 // Otherwise form a 128-bit permutation. After accounting for undefs,
12518 // convert the 64-bit shuffle mask selection values into 128-bit
12519 // selection bits by dividing the indexes by 2 and shifting into positions
12520 // defined by a vperm2*128 instruction's immediate control byte.
12522 // The immediate permute control byte looks like this:
12523 // [1:0] - select 128 bits from sources for low half of destination
12524 // [2] - ignore
12525 // [3] - zero low half of destination
12526 // [5:4] - select 128 bits from sources for high half of destination
12527 // [6] - ignore
12528 // [7] - zero high half of destination
12530 assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
12532 unsigned PermMask = 0;
12533 PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
12534 PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
12536 // Check the immediate mask and replace unused sources with undef.
12537 if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
12538 V1 = DAG.getUNDEF(VT);
12539 if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
12540 V2 = DAG.getUNDEF(VT);
12542 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12543 DAG.getConstant(PermMask, DL, MVT::i8));
12546 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12547 /// shuffling each lane.
12549 /// This will only succeed when the result of fixing the 128-bit lanes results
12550 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12551 /// each 128-bit lanes. This handles many cases where we can quickly blend away
12552 /// each 128-bit lane. This handles many cases where we can quickly blend away
12554 /// FIXME: It might be worthwhile at some point to support this without
12555 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12556 /// in x86 only floating point has interesting non-repeating shuffles, and even
12557 /// those are still *marginally* more expensive.
12558 static SDValue lowerVectorShuffleByMerging128BitLanes(
12559 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12560 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12561 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12563 int Size = Mask.size();
12564 int LaneSize = 128 / VT.getScalarSizeInBits();
12565 int NumLanes = Size / LaneSize;
12566 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12568 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12569 // check whether the in-128-bit lane shuffles share a repeating pattern.
12570 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12571 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12572 for (int i = 0; i < Size; ++i) {
12576 int j = i / LaneSize;
12578 if (Lanes[j] < 0) {
12579 // First entry we've seen for this lane.
12580 Lanes[j] = Mask[i] / LaneSize;
12581 } else if (Lanes[j] != Mask[i] / LaneSize) {
12582 // This doesn't match the lane selected previously!
12586 // Check that within each lane we have a consistent shuffle mask.
12587 int k = i % LaneSize;
12588 if (InLaneMask[k] < 0) {
12589 InLaneMask[k] = Mask[i] % LaneSize;
12590 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12591 // This doesn't fit a repeating in-lane mask.
12596 // First shuffle the lanes into place.
12597 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12598 VT.getSizeInBits() / 64);
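// Each 128-bit lane is a pair of 64-bit elements in LaneVT, so destination
// lane i maps to elements 2*Lanes[i] and 2*Lanes[i]+1 of the source.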
12599 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12600 for (int i = 0; i < NumLanes; ++i)
12601 if (Lanes[i] >= 0) {
12602 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12603 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12606 V1 = DAG.getBitcast(LaneVT, V1);
12607 V2 = DAG.getBitcast(LaneVT, V2);
12608 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12610 // Cast it back to the type we actually want.
12611 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12613 // Now do a simple shuffle that isn't lane crossing.
12614 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12615 for (int i = 0; i < Size; ++i)
12617 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12618 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12619 "Must not introduce lane crosses at this point!");
12621 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12624 /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF.
12625 /// This allows for fast cases such as subvector extraction/insertion
12626 /// or shuffling smaller vector types which can lower more efficiently.
12627 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12628 SDValue V1, SDValue V2,
12629 ArrayRef<int> Mask,
12630 const X86Subtarget &Subtarget,
12631 SelectionDAG &DAG) {
12632 assert((VT.is256BitVector() || VT.is512BitVector()) &&
12633 "Expected 256-bit or 512-bit vector");
12635 unsigned NumElts = VT.getVectorNumElements();
12636 unsigned HalfNumElts = NumElts / 2;
12637 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12639 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12640 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12641 if (!UndefLower && !UndefUpper)
12644 // Upper half is undef and lower half is whole upper subvector.
12645 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12646 if (UndefUpper &&
12647 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12648 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12649 DAG.getIntPtrConstant(HalfNumElts, DL));
12650 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12651 DAG.getIntPtrConstant(0, DL));
12654 // Lower half is undef and upper half is whole lower subvector.
12655 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12656 if (UndefLower &&
12657 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12658 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12659 DAG.getIntPtrConstant(0, DL));
12660 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12661 DAG.getIntPtrConstant(HalfNumElts, DL));
12664 // If the shuffle only uses two of the four halves of the input operands,
12665 // then extract them and perform the 'half' shuffle at half width.
12666 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12667 int HalfIdx1 = -1, HalfIdx2 = -1;
12668 SmallVector<int, 8> HalfMask(HalfNumElts);
12669 unsigned Offset = UndefLower ? HalfNumElts : 0;
12670 for (unsigned i = 0; i != HalfNumElts; ++i) {
12671 int M = Mask[i + Offset];
12677 // Determine which of the 4 half vectors this element is from.
12678 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12679 int HalfIdx = M / HalfNumElts;
12681 // Determine the element index into its half vector source.
12682 int HalfElt = M % HalfNumElts;
12684 // We can shuffle with up to 2 half vectors, set the new 'half'
12685 // shuffle mask accordingly.
12686 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12687 HalfMask[i] = HalfElt;
12688 HalfIdx1 = HalfIdx;
12691 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12692 HalfMask[i] = HalfElt + HalfNumElts;
12693 HalfIdx2 = HalfIdx;
12697 // Too many half vectors referenced.
12700 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12702 // Only shuffle the halves of the inputs when useful.
12703 int NumLowerHalves =
12704 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12705 int NumUpperHalves =
12706 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12708 // uuuuXXXX - don't extract uppers just to insert again.
12709 if (UndefLower && NumUpperHalves != 0)
12712 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12713 if (UndefUpper && NumUpperHalves == 2)
12716 // AVX2 - XXXXuuuu - always extract lowers.
12717 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12718 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12719 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12721 // AVX2 supports variable 32-bit element cross-lane shuffles.
12722 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12723 // XXXXuuuu - don't extract lowers and uppers.
12724 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12729 // AVX512 - XXXXuuuu - always extract lowers.
12730 if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
12733 auto GetHalfVector = [&](int HalfIdx) {
12735 return DAG.getUNDEF(HalfVT);
12736 SDValue V = (HalfIdx < 2 ? V1 : V2);
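// HalfIdx 0 and 2 name the lower halves and 1 and 3 the upper halves;
// convert it into an element offset within the chosen source vector.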
12737 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12738 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12739 DAG.getIntPtrConstant(HalfIdx, DL));
12742 SDValue Half1 = GetHalfVector(HalfIdx1);
12743 SDValue Half2 = GetHalfVector(HalfIdx2);
12744 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12745 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12746 DAG.getIntPtrConstant(Offset, DL));
12749 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12750 /// given mask.
12751 ///
12752 /// This returns true if the elements from a particular input are already in the
12753 /// slot required by the given mask and require no permutation.
12754 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12755 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12756 int Size = Mask.size();
12757 for (int i = 0; i < Size; ++i)
12758 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12764 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12765 /// every lane can be represented as the same repeating mask - allowing us to
12766 /// shuffle the sources with the repeating shuffle and then permute the result
12767 /// to the destination lanes.
12768 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12769 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12770 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12771 int NumElts = VT.getVectorNumElements();
12772 int NumLanes = VT.getSizeInBits() / 128;
12773 int NumLaneElts = NumElts / NumLanes;
12775 // On AVX2 we may be able to just shuffle the lowest elements and then
12776 // broadcast the result.
12777 if (Subtarget.hasAVX2()) {
12778 for (unsigned BroadcastSize : {16, 32, 64}) {
12779 if (BroadcastSize <= VT.getScalarSizeInBits())
12781 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12783 // Attempt to match a repeating pattern every NumBroadcastElts,
12784 // accounting for UNDEFs but only referencing the lowest 128-bit
12785 // lane of the inputs.
12786 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12787 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12788 for (int j = 0; j != NumBroadcastElts; ++j) {
12789 int M = Mask[i + j];
12792 int &R = RepeatMask[j];
12793 if (0 != ((M % NumElts) / NumLaneElts))
12795 if (0 <= R && R != M)
12802 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12803 if (!FindRepeatingBroadcastMask(RepeatMask))
12806 // Shuffle the (lowest) repeated elements in place for broadcast.
12807 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12809 // Shuffle the actual broadcast.
12810 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12811 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12812 for (int j = 0; j != NumBroadcastElts; ++j)
12813 BroadcastMask[i + j] = j;
12814 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12815 BroadcastMask);
12816 }
12817 }
12819 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12820 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12823 // Bail if we already have a repeated lane shuffle mask.
12824 SmallVector<int, 8> RepeatedShuffleMask;
12825 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12828 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12829 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12830 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12831 int NumSubLanes = NumLanes * SubLaneScale;
12832 int NumSubLaneElts = NumLaneElts / SubLaneScale;
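// For example, v8f32 on AVX2 gives NumLanes == 2 and SubLaneScale == 2, so
// we track four 64-bit sub-lanes of two float elements each.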
12834 // Check that all the sources are coming from the same lane and see if we can
12835 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12836 // determine the source sub-lane for each destination sub-lane.
12837 int TopSrcSubLane = -1;
12838 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12839 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12840 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12841 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12843 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12844 // Extract the sub-lane mask, check that it all comes from the same lane
12845 // and normalize the mask entries to come from the first lane.
12847 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12848 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12849 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12852 int Lane = (M % NumElts) / NumLaneElts;
12853 if ((0 <= SrcLane) && (SrcLane != Lane))
12856 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12857 SubLaneMask[Elt] = LocalM;
12860 // Whole sub-lane is UNDEF.
12864 // Attempt to match against the candidate repeated sub-lane masks.
12865 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12866 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12867 for (int i = 0; i != NumSubLaneElts; ++i) {
12868 if (M1[i] < 0 || M2[i] < 0)
12870 if (M1[i] != M2[i])
12876 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12877 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12880 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12881 for (int i = 0; i != NumSubLaneElts; ++i) {
12882 int M = SubLaneMask[i];
12885 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12886 "Unexpected mask element");
12887 RepeatedSubLaneMask[i] = M;
12890 // Track the top most source sub-lane - by setting the remaining to UNDEF
12891 // we can greatly simplify shuffle matching.
12892 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12893 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12894 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12898 // Bail if we failed to find a matching repeated sub-lane mask.
12899 if (Dst2SrcSubLanes[DstSubLane] < 0)
12902 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12903 "Unexpected source lane");
12905 // Create a repeating shuffle mask for the entire vector.
12906 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12907 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12908 int Lane = SubLane / SubLaneScale;
12909 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12910 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12911 int M = RepeatedSubLaneMask[Elt];
12914 int Idx = (SubLane * NumSubLaneElts) + Elt;
12915 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12918 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12920 // Shuffle each source sub-lane to its destination.
12921 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12922 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12923 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12924 if (SrcSubLane < 0)
12926 for (int j = 0; j != NumSubLaneElts; ++j)
12927 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12930 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12931 SubLaneMask);
12932 }
12934 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12935 unsigned &ShuffleImm,
12936 ArrayRef<int> Mask) {
12937 int NumElts = VT.getVectorNumElements();
12938 assert(VT.getScalarSizeInBits() == 64 &&
12939 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12940 "Unexpected data type for VSHUFPD");
12942 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12943 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
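// Result element i comes from V1 when i is even and from V2 when i is odd,
// and may select either double of its 128-bit pair, so the allowed indices
// are Val and Val + 1 (computed below). CommutVal describes the same
// pattern with the roles of V1 and V2 swapped.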
12945 bool ShufpdMask = true;
12946 bool CommutableMask = true;
12947 for (int i = 0; i < NumElts; ++i) {
12948 if (Mask[i] == SM_SentinelUndef)
12949 continue;
12950 if (Mask[i] < 0)
12951 return false;
12952 int Val = (i & 6) + NumElts * (i & 1);
12953 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12954 if (Mask[i] < Val || Mask[i] > Val + 1)
12955 ShufpdMask = false;
12956 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12957 CommutableMask = false;
12958 ShuffleImm |= (Mask[i] % 2) << i;
12959 }
12961 if (ShufpdMask)
12962 return true;
12963 if (CommutableMask) {
12964 std::swap(V1, V2);
12965 return true;
12966 }
12968 return false;
12969 }
12971 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12972 ArrayRef<int> Mask, SDValue V1,
12973 SDValue V2, SelectionDAG &DAG) {
12974 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
12975 "Unexpected data type for VSHUFPD");
12977 unsigned Immediate = 0;
12978 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12981 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12982 DAG.getConstant(Immediate, DL, MVT::i8));
12985 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12986 ArrayRef<int> Mask, SDValue V1,
12987 SDValue V2, SelectionDAG &DAG) {
12988 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12989 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12991 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12993 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12995 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12998 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
13000 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
13001 /// isn't available.
13002 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13003 const APInt &Zeroable,
13004 SDValue V1, SDValue V2,
13005 const X86Subtarget &Subtarget,
13006 SelectionDAG &DAG) {
13007 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13008 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
13009 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13011 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
13012 Zeroable, Subtarget, DAG))
13015 if (V2.isUndef()) {
13016 // Check for being able to broadcast a single element.
13017 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
13018 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13021 // Use low duplicate instructions for masks that match their pattern.
13022 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
13023 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
13025 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
13026 // Non-half-crossing single input shuffles can be lowered with an
13027 // interleaved permutation.
13028 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13029 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
13030 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
13031 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13034 // With AVX2 we have direct support for this permutation.
13035 if (Subtarget.hasAVX2())
13036 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
13037 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13039 // Try to create an in-lane repeating shuffle mask and then shuffle the
13040 // results into the target lanes.
13041 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13042 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13045 // Otherwise, fall back.
13046 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
13047 DAG, Subtarget);
13048 }
13050 // Use dedicated unpack instructions for masks that match their pattern.
13052 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
13055 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
13056 Zeroable, Subtarget, DAG))
13059 // Check if the blend happens to exactly fit that of SHUFPD.
13061 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
13064 // Try to create an in-lane repeating shuffle mask and then shuffle the
13065 // results into the target lanes.
13066 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13067 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13070 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13071 // shuffle. However, if we have AVX2 and either input is already in place,
13072 // we will be able to shuffle the other input even across lanes in a single
13073 // instruction, so skip this pattern.
13074 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
13075 isShuffleMaskInputInPlace(1, Mask))))
13076 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13077 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
13079 // If we have VLX support, we can use VEXPAND.
13080 if (Subtarget.hasVLX())
13081 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
13082 V1, V2, DAG, Subtarget))
13085 // If we have AVX2 then we always want to lower with a blend because at v4 we
13086 // can fully permute the elements.
13087 if (Subtarget.hasAVX2())
13088 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
13089 Mask, DAG);
13091 // Otherwise fall back on generic lowering.
13092 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
13095 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
13097 /// This routine is only called when we have AVX2 and thus a reasonable
13098 /// instruction set for v4i64 shuffling.
13099 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13100 const APInt &Zeroable,
13101 SDValue V1, SDValue V2,
13102 const X86Subtarget &Subtarget,
13103 SelectionDAG &DAG) {
13104 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13105 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
13106 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
13107 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
13109 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
13110 Zeroable, Subtarget, DAG))
13113 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
13114 Zeroable, Subtarget, DAG))
13117 // Check for being able to broadcast a single element.
13118 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
13119 Mask, Subtarget, DAG))
13122 if (V2.isUndef()) {
13123 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13124 // can use lower latency instructions that will operate on both lanes.
13125 SmallVector<int, 2> RepeatedMask;
13126 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
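// Widen the repeated 2 x i64 mask into an equivalent 4 x i32 PSHUFD mask:
// each 64-bit element becomes a pair of adjacent 32-bit elements.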
13127 SmallVector<int, 4> PSHUFDMask;
13128 scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
13129 return DAG.getBitcast(
13131 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
13132 DAG.getBitcast(MVT::v8i32, V1),
13133 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13136 // AVX2 provides a direct instruction for permuting a single input across
13137 // lanes.
13138 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
13139 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
13142 // Try to use shift instructions.
13143 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
13144 Zeroable, Subtarget, DAG))
13147 // If we have VLX support, we can use VALIGN or VEXPAND.
13148 if (Subtarget.hasVLX()) {
13149 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
13150 Mask, Subtarget, DAG))
13153 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
13154 V1, V2, DAG, Subtarget))
13158 // Try to use PALIGNR.
13159 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
13160 Mask, Subtarget, DAG))
13163 // Use dedicated unpack instructions for masks that match their pattern.
13165 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
13168 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13169 // shuffle. However, if we have AVX2 and either input is already in place,
13170 // we will be able to shuffle the other input even across lanes in a single
13171 // instruction, so skip this pattern.
13172 if (!isShuffleMaskInputInPlace(0, Mask) &&
13173 !isShuffleMaskInputInPlace(1, Mask))
13174 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13175 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
13178 // Otherwise fall back on generic blend lowering.
13179 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
13180 Mask, DAG);
13181 }
13183 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
13185 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
13186 /// isn't available.
13187 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13188 const APInt &Zeroable,
13189 SDValue V1, SDValue V2,
13190 const X86Subtarget &Subtarget,
13191 SelectionDAG &DAG) {
13192 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13193 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
13194 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13196 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
13197 Zeroable, Subtarget, DAG))
13200 // Check for being able to broadcast a single element.
13201 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
13202 Mask, Subtarget, DAG))
13205 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13206 // options to efficiently lower the shuffle.
13207 SmallVector<int, 4> RepeatedMask;
13208 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
13209 assert(RepeatedMask.size() == 4 &&
13210 "Repeated masks must be half the mask width!");
13212 // Use even/odd duplicate instructions for masks that match their pattern.
13213 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13214 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
13215 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13216 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
13219 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
13220 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13222 // Use dedicated unpack instructions for masks that match their pattern.
13224 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
13227 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
13228 // have already handled any direct blends.
13229 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
13232 // Try to create an in-lane repeating shuffle mask and then shuffle the
13233 // results into the target lanes.
13234 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13235 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13238 // If we have a single input shuffle with different shuffle patterns in the
13239 // two 128-bit lanes, use a variable mask with VPERMILPS.
13240 if (V2.isUndef()) {
13241 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13242 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
13243 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
13245 if (Subtarget.hasAVX2())
13246 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
13248 // Otherwise, fall back.
13249 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
13250 DAG, Subtarget);
13251 }
13253 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13255 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13256 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
13258 // If we have VLX support, we can use VEXPAND.
13259 if (Subtarget.hasVLX())
13260 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
13261 V1, V2, DAG, Subtarget))
13264 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13265 // since after split we get a more efficient code using vpunpcklwd and
13266 // vpunpckhwd instrs than vblend.
13267 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
13268 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
13272 // If we have AVX2 then we always want to lower with a blend because at v8 we
13273 // can fully permute the elements.
13274 if (Subtarget.hasAVX2())
13275 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
13276 Mask, DAG);
13278 // Otherwise fall back on generic lowering.
13279 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
13282 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
13284 /// This routine is only called when we have AVX2 and thus a reasonable
13285 /// instruction set for v8i32 shuffling.
13286 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13287 const APInt &Zeroable,
13288 SDValue V1, SDValue V2,
13289 const X86Subtarget &Subtarget,
13290 SelectionDAG &DAG) {
13291 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13292 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
13293 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13294 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
13296 // Whenever we can lower this as a zext, that instruction is strictly faster
13297 // than any alternative. It also allows us to fold memory operands into the
13298 // shuffle in many cases.
13299 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13300 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13303 // For non-AVX512 if the Mask is of 16bit elements in lane then try to split
13304 // since after split we get a more efficient code than vblend by using
13305 // vpunpcklwd and vpunpckhwd instrs.
13306 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
13307 !Subtarget.hasAVX512())
13309 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
13312 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
13313 Zeroable, Subtarget, DAG))
13316 // Check for being able to broadcast a single element.
13317 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13318 Mask, Subtarget, DAG))
13321 // If the shuffle mask is repeated in each 128-bit lane we can use more
13322 // efficient instructions that mirror the shuffles across the two 128-bit
13323 // lanes.
13324 SmallVector<int, 4> RepeatedMask;
13325 bool Is128BitLaneRepeatedShuffle =
13326 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13327 if (Is128BitLaneRepeatedShuffle) {
13328 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13330 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13331 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13333 // Use dedicated unpack instructions for masks that match their pattern.
13335 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13339 // Try to use shift instructions.
13340 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13341 Zeroable, Subtarget, DAG))
13344 // If we have VLX support, we can use VALIGN or EXPAND.
13345 if (Subtarget.hasVLX()) {
13346 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13347 Mask, Subtarget, DAG))
13350 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13351 V1, V2, DAG, Subtarget))
13355 // Try to use byte rotation instructions.
13356 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13357 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13360 // Try to create an in-lane repeating shuffle mask and then shuffle the
13361 // results into the target lanes.
13362 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13363 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13366 // If the shuffle patterns aren't repeated but it is a single input, directly
13367 // generate a cross-lane VPERMD instruction.
13368 if (V2.isUndef()) {
13369 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13370 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13373 // Assume that a single SHUFPS is faster than an alternative sequence of
13374 // multiple instructions (even if the CPU has a domain penalty).
13375 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13376 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13377 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13378 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13379 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13380 CastV1, CastV2, DAG);
13381 return DAG.getBitcast(MVT::v8i32, ShufPS);
13384 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13386 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13387 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13390 // Otherwise fall back on generic blend lowering.
13391 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13392 Mask, DAG);
13393 }
13395 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13397 /// This routine is only called when we have AVX2 and thus a reasonable
13398 /// instruction set for v16i16 shuffling.
13399 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13400 const APInt &Zeroable,
13401 SDValue V1, SDValue V2,
13402 const X86Subtarget &Subtarget,
13403 SelectionDAG &DAG) {
13404 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13405 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13406 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13407 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13409 // Whenever we can lower this as a zext, that instruction is strictly faster
13410 // than any alternative. It also allows us to fold memory operands into the
13411 // shuffle in many cases.
13412 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13413 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13416 // Check for being able to broadcast a single element.
13417 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13418 Mask, Subtarget, DAG))
13421 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13422 Zeroable, Subtarget, DAG))
13425 // Use dedicated unpack instructions for masks that match their pattern.
13427 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13430 // Use dedicated pack instructions for masks that match their pattern.
13431 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
13435 // Try to use shift instructions.
13436 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13437 Zeroable, Subtarget, DAG))
13440 // Try to use byte rotation instructions.
13441 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13442 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13445 // Try to create an in-lane repeating shuffle mask and then shuffle the
13446 // results into the target lanes.
13447 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13448 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13451 if (V2.isUndef()) {
13452 // There are no generalized cross-lane shuffle operations available on i16
13453 // element types.
13454 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13455 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13456 Mask, DAG, Subtarget);
13458 SmallVector<int, 8> RepeatedMask;
13459 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13460 // As this is a single-input shuffle, the repeated mask should be
13461 // a strictly valid v8i16 mask that we can pass through to the v8i16
13462 // lowering to handle even the v16 case.
13463 return lowerV8I16GeneralSingleInputVectorShuffle(
13464 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13468 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13469 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13472 // AVX512BWVL can lower to VPERMW.
13473 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13474 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13476 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13478 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13479 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13482 // Otherwise fall back on generic lowering.
13483 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13486 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13488 /// This routine is only called when we have AVX2 and thus a reasonable
13489 /// instruction set for v32i8 shuffling.
13490 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13491 const APInt &Zeroable,
13492 SDValue V1, SDValue V2,
13493 const X86Subtarget &Subtarget,
13494 SelectionDAG &DAG) {
13495 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13496 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13497 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13498 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13500 // Whenever we can lower this as a zext, that instruction is strictly faster
13501 // than any alternative. It also allows us to fold memory operands into the
13502 // shuffle in many cases.
13503 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13504 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13507 // Check for being able to broadcast a single element.
13508 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13509 Mask, Subtarget, DAG))
13512 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13513 Zeroable, Subtarget, DAG))
13516 // Use dedicated unpack instructions for masks that match their pattern.
13518 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13521 // Use dedicated pack instructions for masks that match their pattern.
13522 if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
13526 // Try to use shift instructions.
13527 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13528 Zeroable, Subtarget, DAG))
13531 // Try to use byte rotation instructions.
13532 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13533 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13536 // Try to create an in-lane repeating shuffle mask and then shuffle the
13537 // results into the target lanes.
13538 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13539 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13542 // There are no generalized cross-lane shuffle operations available on i8
13543 // element types.
13544 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13545 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13546 DAG, Subtarget);
13548 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13549 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13552 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13554 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13555 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13558 // Otherwise fall back on generic lowering.
13559 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13562 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13564 /// This routine either breaks down the specific type of a 256-bit x86 vector
13565 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13566 /// together based on the available instructions.
13567 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13568 MVT VT, SDValue V1, SDValue V2,
13569 const APInt &Zeroable,
13570 const X86Subtarget &Subtarget,
13571 SelectionDAG &DAG) {
13572 // If we have a single input to the zero element, insert that into V1 if we
13573 // can do so cheaply.
13574 int NumElts = VT.getVectorNumElements();
13575 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13577 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13578 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13579 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13582 // Handle special cases where the lower or upper half is UNDEF.
13584 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13587 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13588 // can check for those subtargets here and avoid much of the subtarget
13589 // querying in the per-vector-type lowering routines. With AVX1 we have
13590 // essentially *zero* ability to manipulate a 256-bit vector with integer
13591 // types. Since we'll use floating point types there eventually, just
13592 // immediately cast everything to a float and operate entirely in that domain.
13593 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13594 int ElementBits = VT.getScalarSizeInBits();
13595 if (ElementBits < 32) {
13596 // No floating point type available, if we can't use the bit operations
13597 // for masking/blending then decompose into 128-bit vectors.
13599 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13601 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13603 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13606 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13607 VT.getVectorNumElements());
13608 V1 = DAG.getBitcast(FpVT, V1);
13609 V2 = DAG.getBitcast(FpVT, V2);
13610 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13613 switch (VT.SimpleTy) {
13615 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13617 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13619 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13621 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13623 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13625 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13628 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13632 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13633 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13634 ArrayRef<int> Mask, SDValue V1,
13635 SDValue V2, SelectionDAG &DAG) {
13636 assert(VT.getScalarSizeInBits() == 64 &&
13637 "Unexpected element type size for 128bit shuffle.");
13639 // Handling a 256-bit vector requires VLX, and in that case
13640 // lowerV2X128VectorShuffle() is most probably the better solution.
13641 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13643 SmallVector<int, 4> WidenedMask;
13644 if (!canWidenShuffleElements(Mask, WidenedMask))
13647 // Check for patterns which can be matched with a single insert of a 256-bit
13648 // subvector.
13649 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13650 {0, 1, 2, 3, 0, 1, 2, 3});
13651 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13652 {0, 1, 2, 3, 8, 9, 10, 11})) {
13653 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13654 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13655 DAG.getIntPtrConstant(0, DL));
13656 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13657 OnlyUsesV1 ? V1 : V2,
13658 DAG.getIntPtrConstant(0, DL));
13659 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13662 assert(WidenedMask.size() == 4);
13664 // See if this is an insertion of the lower 128-bits of V2 into V1.
13665 bool IsInsert = true;
13667 for (int i = 0; i < 4; ++i) {
13668 assert(WidenedMask[i] >= -1);
13669 if (WidenedMask[i] < 0)
13672 // Make sure all V1 subvectors are in place.
13673 if (WidenedMask[i] < 4) {
13674 if (WidenedMask[i] != i) {
13679 // Make sure we only have a single V2 index and its the lowest 128-bits.
13680 if (V2Index >= 0 || WidenedMask[i] != 4) {
13687 if (IsInsert && V2Index >= 0) {
13688 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13689 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13690 DAG.getIntPtrConstant(0, DL));
13691 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13694 // Try to lower to vshuf64x2/vshuf32x4.
13695 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13696 unsigned PermMask = 0;
13697 // Ensure elements come from the same Op.
13698 for (int i = 0; i < 4; ++i) {
13699 assert(WidenedMask[i] >= -1);
13700 if (WidenedMask[i] < 0)
13703 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13704 unsigned OpIndex = i / 2;
13705 if (Ops[OpIndex].isUndef())
13707 else if (Ops[OpIndex] != Op)
13710 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13711 // bits defined by a vshuf64x2 instruction's immediate control byte.
13712 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13713 }
13715 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13716 DAG.getConstant(PermMask, DL, MVT::i8));
13719 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13720 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13721 const APInt &Zeroable,
13722 SDValue V1, SDValue V2,
13723 const X86Subtarget &Subtarget,
13724 SelectionDAG &DAG) {
13725 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13726 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13727 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13729 if (V2.isUndef()) {
13730 // Use low duplicate instructions for masks that match their pattern.
13731 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13732 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13734 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13735 // Non-half-crossing single input shuffles can be lowered with an
13736 // interleaved permutation.
13737 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13738 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13739 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13740 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13741 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13742 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13745 SmallVector<int, 4> RepeatedMask;
13746 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13747 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13748 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13751 if (SDValue Shuf128 =
13752 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13755 if (SDValue Unpck =
13756 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13759 // Check if the blend happens to exactly fit that of SHUFPD.
13761 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13764 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13765 V2, DAG, Subtarget))
13768 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13769 Zeroable, Subtarget, DAG))
13772 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13775 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13776 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13777 const APInt &Zeroable,
13778 SDValue V1, SDValue V2,
13779 const X86Subtarget &Subtarget,
13780 SelectionDAG &DAG) {
13781 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13782 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13783 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13785 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13786 // options to efficiently lower the shuffle.
13787 SmallVector<int, 4> RepeatedMask;
13788 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13789 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13791 // Use even/odd duplicate instructions for masks that match their pattern.
13792 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13793 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13794 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13795 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13798 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13799 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13801 // Use dedicated unpack instructions for masks that match their pattern.
13802 if (SDValue Unpck =
13803 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13806 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13807 Zeroable, Subtarget, DAG))
13810 // Otherwise, fall back to a SHUFPS sequence.
13811 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13814 // If we have a single input shuffle with different shuffle patterns in the
13815 // 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
13816 if (V2.isUndef() &&
13817 !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
13818 SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
13819 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
13822 // If we have AVX512F support, we can use VEXPAND.
13823 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13824 V1, V2, DAG, Subtarget))
13827 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13830 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13831 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13832 const APInt &Zeroable,
13833 SDValue V1, SDValue V2,
13834 const X86Subtarget &Subtarget,
13835 SelectionDAG &DAG) {
13836 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13837 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13838 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13840 if (V2.isUndef()) {
13841 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13842 // can use lower latency instructions that will operate on all four
13843 // 128-bit lanes.
13844 SmallVector<int, 2> Repeated128Mask;
13845 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13846 SmallVector<int, 4> PSHUFDMask;
13847 scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
13848 return DAG.getBitcast(
13849 MVT::v8i64,
13850 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13851 DAG.getBitcast(MVT::v16i32, V1),
13852 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13853 }
13855 SmallVector<int, 4> Repeated256Mask;
13856 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13857 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13858 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13859 }
13861 if (SDValue Shuf128 =
13862 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13863 return Shuf128;
13865 // Try to use shift instructions.
13866 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13867 Zeroable, Subtarget, DAG))
13868 return Shift;
13870 // Try to use VALIGN.
13871 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13872 Mask, Subtarget, DAG))
13873 return Rotate;
13875 // Try to use PALIGNR.
13876 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13877 Mask, Subtarget, DAG))
13878 return Rotate;
13880 if (SDValue Unpck =
13881 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13882 return Unpck;
13883 // If we have AVX512F support, we can use VEXPAND.
13884 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13885 V2, DAG, Subtarget))
13886 return V;
13888 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13889 Zeroable, Subtarget, DAG))
13890 return Blend;
13892 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13893 }
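// Illustrative example: a single-input v8i64 mask such as <1,0,3,2,5,4,7,6>
// repeats <1,0> in every 128-bit lane, so the repeated-mask path above
// bitcasts to v16i32 and emits a single PSHUFD with the scaled mask <2,3,0,1>.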
13895 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13896 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13897 const APInt &Zeroable,
13898 SDValue V1, SDValue V2,
13899 const X86Subtarget &Subtarget,
13900 SelectionDAG &DAG) {
13901 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13902 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13903 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13905 // Whenever we can lower this as a zext, that instruction is strictly faster
13906 // than any alternative. It also allows us to fold memory operands into the
13907 // shuffle in many cases.
13908 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13909 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13910 return ZExt;
13912 // If the shuffle mask is repeated in each 128-bit lane we can use more
13913 // efficient instructions that mirror the shuffles across the four 128-bit
13914 // lanes.
13915 SmallVector<int, 4> RepeatedMask;
13916 bool Is128BitLaneRepeatedShuffle =
13917 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13918 if (Is128BitLaneRepeatedShuffle) {
13919 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13920 if (V2.isUndef())
13921 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13922 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13924 // Use dedicated unpack instructions for masks that match their pattern.
13925 if (SDValue V =
13926 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13927 return V;
13928 }
13930 // Try to use shift instructions.
13931 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13932 Zeroable, Subtarget, DAG))
13933 return Shift;
13935 // Try to use VALIGN.
13936 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13937 Mask, Subtarget, DAG))
13938 return Rotate;
13940 // Try to use byte rotation instructions.
13941 if (Subtarget.hasBWI())
13942 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13943 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13944 return Rotate;
13946 // Assume that a single SHUFPS is faster than using a permv shuffle.
13947 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13948 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13949 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13950 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13951 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13952 CastV1, CastV2, DAG);
13953 return DAG.getBitcast(MVT::v16i32, ShufPS);
13954 }
13955 // If we have AVX512F support, we can use VEXPAND.
13956 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13957 V1, V2, DAG, Subtarget))
13958 return V;
13960 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13961 Zeroable, Subtarget, DAG))
13962 return Blend;
13963 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13964 }
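// Illustrative example: a two-input v16i32 shuffle whose 128-bit-lane repeated
// mask is <0,1,4,5> (low two elements from V1, high two from V2) satisfies
// isSingleSHUFPSMask, so the path above bitcasts both operands to v16f32 and
// emits a single SHUFPS instead of a variable permute.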
13966 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13967 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13968 const APInt &Zeroable,
13969 SDValue V1, SDValue V2,
13970 const X86Subtarget &Subtarget,
13971 SelectionDAG &DAG) {
13972 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13973 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13974 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13975 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13977 // Whenever we can lower this as a zext, that instruction is strictly faster
13978 // than any alternative. It also allows us to fold memory operands into the
13979 // shuffle in many cases.
13980 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13981 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13982 return ZExt;
13984 // Use dedicated unpack instructions for masks that match their pattern.
13985 if (SDValue V =
13986 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13987 return V;
13989 // Try to use shift instructions.
13990 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13991 Zeroable, Subtarget, DAG))
13992 return Shift;
13994 // Try to use byte rotation instructions.
13995 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13996 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13997 return Rotate;
13999 if (V2.isUndef()) {
14000 SmallVector<int, 8> RepeatedMask;
14001 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
14002 // As this is a single-input shuffle, the repeated mask should be
14003 // a strictly valid v8i16 mask that we can pass through to the v8i16
14004 // lowering to handle even the v32 case.
14005 return lowerV8I16GeneralSingleInputVectorShuffle(
14006 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
14007 }
14008 }
14010 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
14011 Zeroable, Subtarget, DAG))
14012 return Blend;
14014 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
14015 }
14017 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
14018 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14019 const APInt &Zeroable,
14020 SDValue V1, SDValue V2,
14021 const X86Subtarget &Subtarget,
14022 SelectionDAG &DAG) {
14023 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14024 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
14025 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
14026 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
14028 // Whenever we can lower this as a zext, that instruction is strictly faster
14029 // than any alternative. It also allows us to fold memory operands into the
14030 // shuffle in many cases.
14031 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
14032 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
14033 return ZExt;
14035 // Use dedicated unpack instructions for masks that match their pattern.
14036 if (SDValue V =
14037 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
14038 return V;
14040 // Try to use shift instructions.
14041 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
14042 Zeroable, Subtarget, DAG))
14043 return Shift;
14045 // Try to use byte rotation instructions.
14046 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
14047 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14048 return Rotate;
14050 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
14051 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
14052 return PSHUFB;
14054 // VBMI can use VPERMV/VPERMV3 byte shuffles.
14055 if (Subtarget.hasVBMI())
14056 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
14058 // Try to create an in-lane repeating shuffle mask and then shuffle the
14059 // results into the target lanes.
14060 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
14061 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
14062 return V;
14064 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
14065 Zeroable, Subtarget, DAG))
14066 return Blend;
14068 // FIXME: Implement direct support for this type!
14069 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
14070 }
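// Illustrative example: without AVX512VBMI, an in-lane, single-input v64i8
// shuffle (e.g. reversing the bytes within each 128-bit lane) is handled by
// the PSHUFB path above; a fully general byte shuffle only becomes a single
// VPERMV/VPERMV3 once Subtarget.hasVBMI() is true, and anything else falls
// back to splitting into two 256-bit shuffles.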
14072 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
14074 /// This routine either breaks down the specific type of a 512-bit x86 vector
14075 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
14076 /// together based on the available instructions.
14077 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14078 MVT VT, SDValue V1, SDValue V2,
14079 const APInt &Zeroable,
14080 const X86Subtarget &Subtarget,
14081 SelectionDAG &DAG) {
14082 assert(Subtarget.hasAVX512() &&
14083 "Cannot lower 512-bit vectors w/ basic ISA!");
14085 // If we have a single input to the zero element, insert that into V1 if we
14086 // can do so cheaply.
14087 int NumElts = Mask.size();
14088 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
14090 if (NumV2Elements == 1 && Mask[0] >= NumElts)
14091 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
14092 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
14093 return Insertion;
14095 // Handle special cases where the lower or upper half is UNDEF.
14096 if (SDValue V =
14097 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
14098 return V;
14100 // Check for being able to broadcast a single element.
14101 if (SDValue Broadcast =
14102 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
14103 return Broadcast;
14105 // Dispatch to each element type for lowering. If we don't have support for
14106 // specific element type shuffles at 512 bits, immediately split them and
14107 // lower them. Each lowering routine of a given type is allowed to assume that
14108 // the requisite ISA extensions for that element type are available.
14109 switch (VT.SimpleTy) {
14110 case MVT::v8f64:
14111 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14112 case MVT::v16f32:
14113 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14114 case MVT::v8i64:
14115 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14116 case MVT::v16i32:
14117 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14118 case MVT::v32i16:
14119 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14120 case MVT::v64i8:
14121 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
14123 default:
14124 llvm_unreachable("Not a valid 512-bit x86 vector type!");
14125 }
14126 }
14128 // Lower vXi1 vector shuffles.
14129 // There is no dedicated instruction on AVX-512 that shuffles the masks.
14130 // The only way to shuffle bits is to sign-extend the mask vector to SIMD
14131 // vector, shuffle and then truncate it back.
14132 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
14133 MVT VT, SDValue V1, SDValue V2,
14134 const X86Subtarget &Subtarget,
14135 SelectionDAG &DAG) {
14136 assert(Subtarget.hasAVX512() &&
14137 "Cannot lower 512-bit vectors w/o basic ISA!");
14138 MVT ExtVT;
14139 switch (VT.SimpleTy) {
14140 default:
14141 llvm_unreachable("Expected a vector of i1 elements");
14142 case MVT::v2i1:
14143 ExtVT = MVT::v2i64;
14144 break;
14145 case MVT::v4i1:
14146 ExtVT = MVT::v4i32;
14147 break;
14148 case MVT::v8i1:
14149 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
14150 break;
14151 case MVT::v16i1:
14152 ExtVT = MVT::v16i32;
14153 break;
14154 case MVT::v32i1:
14155 ExtVT = MVT::v32i16;
14156 break;
14157 case MVT::v64i1:
14158 ExtVT = MVT::v64i8;
14159 break;
14160 }
14162 if (ISD::isBuildVectorAllZeros(V1.getNode()))
14163 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14164 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
14165 V1 = getOnesVector(ExtVT, DAG, DL);
14166 else
14167 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
14169 if (V2.isUndef())
14170 V2 = DAG.getUNDEF(ExtVT);
14171 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
14172 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
14173 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
14174 V2 = getOnesVector(ExtVT, DAG, DL);
14175 else
14176 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
14178 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
14179 // Since i1 was sign extended, we can use X86ISD::CVT2MASK.
14180 int NumElems = VT.getVectorNumElements();
14181 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
14182 (Subtarget.hasDQI() && (NumElems < 32)))
14183 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
14185 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
14186 }
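// Illustrative example: a v16i1 shuffle is widened to v16i32, performed as an
// ordinary integer shuffle, and then converted back to a mask register: with
// DQI (fewer than 32 elements) via X86ISD::CVT2MASK, otherwise with a plain
// ISD::TRUNCATE back to the vXi1 type.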
14188 /// Helper function that returns true if the shuffle mask should be
14189 /// commuted to improve canonicalization.
14190 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
14191 int NumElements = Mask.size();
14193 int NumV1Elements = 0, NumV2Elements = 0;
14194 for (int M : Mask)
14195 if (M < 0)
14196 continue;
14197 else if (M < NumElements)
14198 ++NumV1Elements;
14199 else
14200 ++NumV2Elements;
14202 // Commute the shuffle as needed such that more elements come from V1 than
14203 // V2. This allows us to match the shuffle pattern strictly on how many
14204 // elements come from V1 without handling the symmetric cases.
14205 if (NumV2Elements > NumV1Elements)
14206 return true;
14208 assert(NumV1Elements > 0 && "No V1 indices");
14210 if (NumV2Elements == 0)
14211 return false;
14213 // When the number of V1 and V2 elements are the same, try to minimize the
14214 // number of uses of V2 in the low half of the vector. When that is tied,
14215 // ensure that the sum of indices for V1 is equal to or lower than the sum
14216 // indices for V2. When those are equal, try to ensure that the number of odd
14217 // indices for V1 is lower than the number of odd indices for V2.
14218 if (NumV1Elements == NumV2Elements) {
14219 int LowV1Elements = 0, LowV2Elements = 0;
14220 for (int M : Mask.slice(0, NumElements / 2))
14221 if (M >= NumElements)
14222 ++LowV2Elements;
14223 else if (M >= 0)
14224 ++LowV1Elements;
14225 if (LowV2Elements > LowV1Elements)
14226 return true;
14227 if (LowV2Elements == LowV1Elements) {
14228 int SumV1Indices = 0, SumV2Indices = 0;
14229 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14230 if (Mask[i] >= NumElements)
14231 SumV2Indices += i;
14232 else if (Mask[i] >= 0)
14233 SumV1Indices += i;
14234 if (SumV2Indices < SumV1Indices)
14235 return true;
14236 if (SumV2Indices == SumV1Indices) {
14237 int NumV1OddIndices = 0, NumV2OddIndices = 0;
14238 for (int i = 0, Size = Mask.size(); i < Size; ++i)
14239 if (Mask[i] >= NumElements)
14240 NumV2OddIndices += i % 2;
14241 else if (Mask[i] >= 0)
14242 NumV1OddIndices += i % 2;
14243 if (NumV2OddIndices < NumV1OddIndices)
14244 return true;
14245 }
14246 }
14247 }
14249 return false;
14250 }
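// Illustrative example: for a v4i32 mask <4,5,0,1>, two elements come from
// each input, but both V2 elements land in the low half, so LowV2Elements (2)
// exceeds LowV1Elements (0) and the function returns true, commuting the
// operands before the per-width lowering routines run.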
14252 /// \brief Top-level lowering for x86 vector shuffles.
14254 /// This handles decomposition, canonicalization, and lowering of all x86
14255 /// vector shuffles. Most of the specific lowering strategies are encapsulated
14256 /// above in helper routines. The canonicalization attempts to widen shuffles
14257 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
14258 /// s.t. only one of the two inputs needs to be tested, etc.
14259 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
14260 SelectionDAG &DAG) {
14261 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
14262 ArrayRef<int> Mask = SVOp->getMask();
14263 SDValue V1 = Op.getOperand(0);
14264 SDValue V2 = Op.getOperand(1);
14265 MVT VT = Op.getSimpleValueType();
14266 int NumElements = VT.getVectorNumElements();
14267 SDLoc DL(Op);
14268 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
14270 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
14271 "Can't lower MMX shuffles");
14273 bool V1IsUndef = V1.isUndef();
14274 bool V2IsUndef = V2.isUndef();
14275 if (V1IsUndef && V2IsUndef)
14276 return DAG.getUNDEF(VT);
14278 // When we create a shuffle node we put the UNDEF node to second operand,
14279 // but in some cases the first operand may be transformed to UNDEF.
14280 // In this case we should just commute the node.
14281 if (V1IsUndef)
14282 return DAG.getCommutedVectorShuffle(*SVOp);
14284 // Check for non-undef masks pointing at an undef vector and make the masks
14285 // undef as well. This makes it easier to match the shuffle based solely on
14286 // the mask.
14287 if (V2IsUndef)
14288 for (int M : Mask)
14289 if (M >= NumElements) {
14290 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
14291 for (int &M : NewMask)
14292 if (M >= NumElements)
14293 M = -1;
14294 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
14295 }
14297 // Check for illegal shuffle mask element index values.
14298 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
14299 assert(llvm::all_of(Mask,
14300 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
14301 "Out of bounds shuffle index");
14303 // We actually see shuffles that are entirely re-arrangements of a set of
14304 // zero inputs. This mostly happens while decomposing complex shuffles into
14305 // simple ones. Directly lower these as a buildvector of zeros.
14306 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
14307 if (Zeroable.isAllOnesValue())
14308 return getZeroVector(VT, Subtarget, DAG, DL);
14310 // Try to collapse shuffles into using a vector type with fewer elements but
14311 // wider element types. We cap this to not form integers or floating point
14312 // elements wider than 64 bits, but it might be interesting to form i128
14313 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
14314 SmallVector<int, 16> WidenedMask;
14315 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
14316 canWidenShuffleElements(Mask, WidenedMask)) {
14317 MVT NewEltVT = VT.isFloatingPoint()
14318 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
14319 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
14320 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
14321 // Make sure that the new vector type is legal. For example, v2f64 isn't
14322 // legal on SSE1.
14323 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
14324 V1 = DAG.getBitcast(NewVT, V1);
14325 V2 = DAG.getBitcast(NewVT, V2);
14326 return DAG.getBitcast(
14327 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
14328 }
14329 }
14331 // Commute the shuffle if it will improve canonicalization.
14332 if (canonicalizeShuffleMaskWithCommute(Mask))
14333 return DAG.getCommutedVectorShuffle(*SVOp);
14335 // For each vector width, delegate to a specialized lowering routine.
14336 if (VT.is128BitVector())
14337 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14338 DAG);
14340 if (VT.is256BitVector())
14341 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14342 DAG);
14344 if (VT.is512BitVector())
14345 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14346 DAG);
14348 if (Is1BitVector)
14349 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14351 llvm_unreachable("Unimplemented!");
14352 }
14354 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14355 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14356 const X86Subtarget &Subtarget,
14357 SelectionDAG &DAG) {
14358 SDValue Cond = Op.getOperand(0);
14359 SDValue LHS = Op.getOperand(1);
14360 SDValue RHS = Op.getOperand(2);
14361 SDLoc dl(Op);
14362 MVT VT = Op.getSimpleValueType();
14364 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14365 return SDValue();
14366 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14368 // Only non-legal VSELECTs reach this lowering, convert those into generic
14369 // shuffles and re-use the shuffle lowering path for blends.
14370 SmallVector<int, 32> Mask;
14371 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14372 SDValue CondElt = CondBV->getOperand(i);
14373 Mask.push_back(
14374 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14375 : -1);
14376 }
14377 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14378 }
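// Illustrative example: a v4i32 VSELECT with the constant condition
// <-1,0,-1,0> becomes the shuffle mask <0,5,2,7>: true lanes pick from LHS
// (indices 0..3) and false lanes from RHS (indices 4..7), after which the
// regular blend lowering takes over.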
14380 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14381 // A vselect where all conditions and data are constants can be optimized into
14382 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14383 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14384 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14385 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14386 return SDValue();
14388 // Try to lower this to a blend-style vector shuffle. This can handle all
14389 // constant condition cases.
14390 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14391 return BlendOp;
14393 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14394 // with patterns on the mask registers on AVX-512.
14395 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14396 return Op;
14398 // Variable blends are only legal from SSE4.1 onward.
14399 if (!Subtarget.hasSSE41())
14400 return SDValue();
14402 SDLoc dl(Op);
14403 MVT VT = Op.getSimpleValueType();
14405 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14406 // into an i1 condition so that we can use the mask-based 512-bit blend
14407 // instructions.
14408 if (VT.getSizeInBits() == 512) {
14409 SDValue Cond = Op.getOperand(0);
14410 // The vNi1 condition case should be handled above as it can be trivially
14411 // lowered.
14412 assert(Cond.getValueType().getScalarSizeInBits() ==
14413 VT.getScalarSizeInBits() &&
14414 "Should have a size-matched integer condition!");
14415 // Build a mask by testing the condition against itself (tests for zero).
14416 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14417 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14418 // Now return a new VSELECT using the mask.
14419 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14422 // Only some types will be legal on some subtargets. If we can emit a legal
14423 // VSELECT-matching blend, return Op, and but if we need to expand, return
14425 switch (VT.SimpleTy) {
14427 // Most of the vector types have blends past SSE4.1.
14431 // The byte blends for AVX vectors were introduced only in AVX2.
14432 if (Subtarget.hasAVX2())
14439 // FIXME: We should custom lower this by fixing the condition and using i8
14445 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14446 MVT VT = Op.getSimpleValueType();
14447 SDLoc dl(Op);
14449 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14450 return SDValue();
14452 if (VT.getSizeInBits() == 8) {
14453 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14454 Op.getOperand(0), Op.getOperand(1));
14455 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14458 if (VT == MVT::f32) {
14459 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14460 // the result back to FR32 register. It's only worth matching if the
14461 // result has a single use which is a store or a bitcast to i32. And in
14462 // the case of a store, it's not worth it if the index is a constant 0,
14463 // because a MOVSSmr can be used instead, which is smaller and faster.
14464 if (!Op.hasOneUse())
14465 return SDValue();
14466 SDNode *User = *Op.getNode()->use_begin();
14467 if ((User->getOpcode() != ISD::STORE ||
14468 isNullConstant(Op.getOperand(1))) &&
14469 (User->getOpcode() != ISD::BITCAST ||
14470 User->getValueType(0) != MVT::i32))
14471 return SDValue();
14472 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14473 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14474 Op.getOperand(1));
14475 return DAG.getBitcast(MVT::f32, Extract);
14476 }
14478 if (VT == MVT::i32 || VT == MVT::i64) {
14479 // ExtractPS/pextrq works with constant index.
14480 if (isa<ConstantSDNode>(Op.getOperand(1)))
14481 return Op;
14482 }
14484 return SDValue();
14485 }
14487 /// Extract one bit from mask vector, like v16i1 or v8i1.
14488 /// AVX-512 feature.
14490 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14491 SDValue Vec = Op.getOperand(0);
14492 SDLoc dl(Op);
14493 MVT VecVT = Vec.getSimpleValueType();
14494 SDValue Idx = Op.getOperand(1);
14495 MVT EltVT = Op.getSimpleValueType();
14497 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14498 "Unexpected vector type in ExtractBitFromMaskVector");
14500 // variable index can't be handled in mask registers,
14501 // extend vector to VR512/128
14502 if (!isa<ConstantSDNode>(Idx)) {
14503 unsigned NumElts = VecVT.getVectorNumElements();
14504 // Extending v8i1/v16i1 to 512-bit get better performance on KNL
14505 // than extending to 128/256bit.
14506 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14507 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14508 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14509 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14510 ExtVT.getVectorElementType(), Ext, Idx);
14511 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14514 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14515 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14516 (VecVT.getVectorNumElements() < 8)) {
14517 // Use kshiftlw/rw instruction.
14518 VecVT = MVT::v16i1;
14519 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14520 DAG.getUNDEF(VecVT),
14521 Vec,
14522 DAG.getIntPtrConstant(0, dl));
14523 }
14524 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14525 if (MaxSift - IdxVal)
14526 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14527 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14528 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14529 DAG.getConstant(MaxSift, dl, MVT::i8));
14530 return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14531 DAG.getIntPtrConstant(0, dl));
14532 }
14534 SDValue
14535 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14536 SelectionDAG &DAG) const {
14537 SDLoc dl(Op);
14538 SDValue Vec = Op.getOperand(0);
14539 MVT VecVT = Vec.getSimpleValueType();
14540 SDValue Idx = Op.getOperand(1);
14542 if (VecVT.getVectorElementType() == MVT::i1)
14543 return ExtractBitFromMaskVector(Op, DAG);
14545 if (!isa<ConstantSDNode>(Idx)) {
14546 // It's more profitable to go through memory (1 cycle throughput)
14547 // than using VMOVD + VPERMV/PSHUFB sequence ( 2/3 cycles throughput)
14548 // IACA tool was used to get performance estimation
14549 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14551 // example : extractelement <16 x i8> %a, i32 %i
14553 // Block Throughput: 3.00 Cycles
14554 // Throughput Bottleneck: Port5
14556 // | Num Of | Ports pressure in cycles | |
14557 // | Uops | 0 - DV | 5 | 6 | 7 | |
14558 // ---------------------------------------------
14559 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14560 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14561 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14562 // Total Num Of Uops: 4
14565 // Block Throughput: 1.00 Cycles
14566 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14568 // | | Ports pressure in cycles | |
14569 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14570 // ---------------------------------------------------------
14571 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14572 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14573 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14574 // Total Num Of Uops: 4
14576 return SDValue();
14577 }
14579 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14581 // If this is a 256-bit vector result, first extract the 128-bit vector and
14582 // then extract the element from the 128-bit vector.
14583 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14584 // Get the 128-bit vector.
14585 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14586 MVT EltVT = VecVT.getVectorElementType();
14588 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14589 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14591 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14592 // this can be done with a mask.
14593 IdxVal &= ElemsPerChunk - 1;
14594 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14595 DAG.getConstant(IdxVal, dl, MVT::i32));
14598 assert(VecVT.is128BitVector() && "Unexpected vector length");
14600 MVT VT = Op.getSimpleValueType();
14602 if (VT.getSizeInBits() == 16) {
14603 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14604 // we're going to zero extend the register or fold the store (SSE41 only).
14605 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14606 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14607 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14608 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14609 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14611 // Transform it so it matches pextrw, which produces a 32-bit result.
14612 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14613 Op.getOperand(0), Op.getOperand(1));
14614 return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
14615 }
14617 if (Subtarget.hasSSE41())
14618 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14619 return Res;
14621 // TODO: We only extract a single element from v16i8, we can probably afford
14622 // to be more aggressive here before using the default approach of spilling to
14623 // stack.
14624 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14625 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14626 int DWordIdx = IdxVal / 4;
14627 if (DWordIdx == 0) {
14628 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14629 DAG.getBitcast(MVT::v4i32, Vec),
14630 DAG.getIntPtrConstant(DWordIdx, dl));
14631 int ShiftVal = (IdxVal % 4) * 8;
14632 if (ShiftVal != 0)
14633 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14634 DAG.getConstant(ShiftVal, dl, MVT::i32));
14635 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14638 int WordIdx = IdxVal / 2;
14639 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14640 DAG.getBitcast(MVT::v8i16, Vec),
14641 DAG.getIntPtrConstant(WordIdx, dl));
14642 int ShiftVal = (IdxVal % 2) * 8;
14643 if (ShiftVal != 0)
14644 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14645 DAG.getConstant(ShiftVal, dl, MVT::i16));
14646 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14649 if (VT.getSizeInBits() == 32) {
14650 if (IdxVal == 0)
14651 return Op;
14653 // SHUFPS the element to the lowest double word, then movss.
14654 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14655 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14656 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14657 DAG.getIntPtrConstant(0, dl));
14660 if (VT.getSizeInBits() == 64) {
14661 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14662 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14663 // to match extract_elt for f64.
14664 if (IdxVal == 0)
14665 return Op;
14667 // UNPCKHPD the element to the lowest double word, then movsd.
14668 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14669 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14670 int Mask[2] = { 1, -1 };
14671 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14672 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14673 DAG.getIntPtrConstant(0, dl));
14674 }
14676 return SDValue();
14677 }
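// Illustrative example: on a target without SSE4.1 (so no PEXTRB), and
// assuming the vector has no other uses, extractelement <16 x i8> %v, i32 13
// takes the WordIdx path above: PEXTRW of i16 element 6, a shift right by 8
// because the byte index is odd, and a truncate back to i8.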
14679 /// Insert one bit to mask vector, like v16i1 or v8i1.
14680 /// AVX-512 feature.
14682 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14683 SDLoc dl(Op);
14684 SDValue Vec = Op.getOperand(0);
14685 SDValue Elt = Op.getOperand(1);
14686 SDValue Idx = Op.getOperand(2);
14687 MVT VecVT = Vec.getSimpleValueType();
14689 if (!isa<ConstantSDNode>(Idx)) {
14690 // Non constant index. Extend source and destination,
14691 // insert element and then truncate the result.
14692 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14693 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14694 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14695 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14696 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14697 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14700 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14701 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14702 unsigned NumElems = VecVT.getVectorNumElements();
14704 if (Vec.isUndef()) {
14705 if (IdxVal)
14706 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14707 DAG.getConstant(IdxVal, dl, MVT::i8));
14708 return EltInVec;
14709 }
14711 // Insertion of one bit into first position
14712 if (IdxVal == 0 ) {
14713 // Clean top bits of vector.
14714 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14715 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14716 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14717 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14718 // Clean the first bit in source vector.
14719 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14720 DAG.getConstant(1 , dl, MVT::i8));
14721 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14722 DAG.getConstant(1, dl, MVT::i8));
14724 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14726 // Insertion of one bit into last position
14727 if (IdxVal == NumElems -1) {
14728 // Move the bit to the last position inside the vector.
14729 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14730 DAG.getConstant(IdxVal, dl, MVT::i8));
14731 // Clean the last bit in the source vector.
14732 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14733 DAG.getConstant(1, dl, MVT::i8));
14734 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14735 DAG.getConstant(1 , dl, MVT::i8));
14737 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14740 // Use shuffle to insert element.
14741 SmallVector<int, 64> MaskVec(NumElems);
14742 for (unsigned i = 0; i != NumElems; ++i)
14743 MaskVec[i] = (i == IdxVal) ? NumElems : i;
14745 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14746 }
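// Illustrative example: inserting a bit at index 0 of a v8i1 mask uses the
// IdxVal == 0 path above: KSHIFTL/KSHIFTR by NumElems-1 isolates the new bit,
// KSHIFTR/KSHIFTL by 1 clears bit 0 of the old mask, and a final OR merges
// the two.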
14748 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14749 SelectionDAG &DAG) const {
14750 MVT VT = Op.getSimpleValueType();
14751 MVT EltVT = VT.getVectorElementType();
14752 unsigned NumElts = VT.getVectorNumElements();
14754 if (EltVT == MVT::i1)
14755 return InsertBitToMaskVector(Op, DAG);
14757 SDLoc dl(Op);
14758 SDValue N0 = Op.getOperand(0);
14759 SDValue N1 = Op.getOperand(1);
14760 SDValue N2 = Op.getOperand(2);
14761 if (!isa<ConstantSDNode>(N2))
14762 return SDValue();
14763 auto *N2C = cast<ConstantSDNode>(N2);
14764 unsigned IdxVal = N2C->getZExtValue();
14766 bool IsZeroElt = X86::isZeroNode(N1);
14767 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14769 // If we are inserting an element, see if we can do this more efficiently with
14770 // a blend shuffle with a rematerializable vector than a costly integer
14771 // insertion.
14772 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14773 16 <= EltVT.getSizeInBits()) {
14774 SmallVector<int, 8> BlendMask;
14775 for (unsigned i = 0; i != NumElts; ++i)
14776 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14777 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14778 : getOnesVector(VT, DAG, dl);
14779 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14782 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14783 // into that, and then insert the subvector back into the result.
14784 if (VT.is256BitVector() || VT.is512BitVector()) {
14785 // With a 256-bit vector, we can insert into the zero element efficiently
14786 // using a blend if we have AVX or AVX2 and the right data type.
14787 if (VT.is256BitVector() && IdxVal == 0) {
14788 // TODO: It is worthwhile to cast integer to floating point and back
14789 // and incur a domain crossing penalty if that's what we'll end up
14790 // doing anyway after extracting to a 128-bit vector.
14791 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14792 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14793 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14794 N2 = DAG.getIntPtrConstant(1, dl);
14795 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14796 }
14797 }
14799 // Get the desired 128-bit vector chunk.
14800 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14802 // Insert the element into the desired chunk.
14803 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14804 assert(isPowerOf2_32(NumEltsIn128));
14805 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14806 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14808 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14809 DAG.getConstant(IdxIn128, dl, MVT::i32));
14811 // Insert the changed part back into the bigger vector
14812 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14814 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14816 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
14817 // argument. SSE41 required for pinsrb.
14818 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14819 unsigned Opc;
14820 if (VT == MVT::v8i16) {
14821 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14822 Opc = X86ISD::PINSRW;
14823 } else {
14824 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14825 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14826 Opc = X86ISD::PINSRB;
14829 if (N1.getValueType() != MVT::i32)
14830 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14831 if (N2.getValueType() != MVT::i32)
14832 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14833 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14836 if (Subtarget.hasSSE41()) {
14837 if (EltVT == MVT::f32) {
14838 // Bits [7:6] of the constant are the source select. This will always be
14839 // zero here. The DAG Combiner may combine an extract_elt index into
14840 // these bits. For example (insert (extract, 3), 2) could be matched by
14841 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14842 // Bits [5:4] of the constant are the destination select. This is the
14843 // value of the incoming immediate.
14844 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14845 // combine either bitwise AND or insert of float 0.0 to set these bits.
14847 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14848 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14849 // If this is an insertion of 32-bits into the low 32-bits of
14850 // a vector, we prefer to generate a blend with immediate rather
14851 // than an insertps. Blends are simpler operations in hardware and so
14852 // will always have equal or better performance than insertps.
14853 // But if optimizing for size and there's a load folding opportunity,
14854 // generate insertps because blendps does not have a 32-bit memory
14855 // operand.
14856 N2 = DAG.getIntPtrConstant(1, dl);
14857 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14858 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14859 }
14860 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14861 // Create this as a scalar to vector.
14862 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14863 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14864 }
14866 // PINSR* works with constant index.
14867 if (EltVT == MVT::i32 || EltVT == MVT::i64)
14868 return Op;
14869 }
14871 return SDValue();
14872 }
14874 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14875 SelectionDAG &DAG) {
14876 SDLoc dl(Op);
14877 MVT OpVT = Op.getSimpleValueType();
14879 // It's always cheaper to replace a xor+movd with xorps and simplifies further
14881 if (X86::isZeroNode(Op.getOperand(0)))
14882 return getZeroVector(OpVT, Subtarget, DAG, dl);
14884 // If this is a 256-bit vector result, first insert into a 128-bit
14885 // vector and then insert into the 256-bit vector.
14886 if (!OpVT.is128BitVector()) {
14887 // Insert into a 128-bit vector.
14888 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14889 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14890 OpVT.getVectorNumElements() / SizeFactor);
14892 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14894 // Insert the 128-bit vector.
14895 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14897 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14899 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14900 if (OpVT == MVT::v4i32)
14901 return Op;
14903 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14904 return DAG.getBitcast(
14905 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14908 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14909 // simple superregister reference or explicit instructions to insert
14910 // the upper bits of a vector.
14911 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14912 SelectionDAG &DAG) {
14913 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14915 return insert1BitVector(Op, DAG, Subtarget);
14918 // Returns the appropriate wrapper opcode for a global reference.
14919 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14920 // References to absolute symbols are never PC-relative.
14921 if (GV && GV->isAbsoluteSymbolRef())
14922 return X86ISD::Wrapper;
14924 CodeModel::Model M = getTargetMachine().getCodeModel();
14925 if (Subtarget.isPICStyleRIPRel() &&
14926 (M == CodeModel::Small || M == CodeModel::Kernel))
14927 return X86ISD::WrapperRIP;
14929 return X86ISD::Wrapper;
14932 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14933 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14934 // one of the above mentioned nodes. It has to be wrapped because otherwise
14935 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14936 // be used to form addressing mode. These wrapped nodes will be selected
14937 // into MOV32ri.
14938 SDValue
14939 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14940 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14942 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14943 // global base reg.
14944 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14946 auto PtrVT = getPointerTy(DAG.getDataLayout());
14947 SDValue Result = DAG.getTargetConstantPool(
14948 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14949 SDLoc DL(CP);
14950 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14951 // With PIC, the address is actually $g + Offset.
14952 if (OpFlag) {
14953 Result =
14954 DAG.getNode(ISD::ADD, DL, PtrVT,
14955 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14956 }
14958 return Result;
14959 }
14961 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14962 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14964 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14965 // global base reg.
14966 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14968 auto PtrVT = getPointerTy(DAG.getDataLayout());
14969 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14970 SDLoc DL(JT);
14971 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14973 // With PIC, the address is actually $g + Offset.
14974 if (OpFlag)
14975 Result =
14976 DAG.getNode(ISD::ADD, DL, PtrVT,
14977 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14979 return Result;
14980 }
14982 SDValue
14983 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14984 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14986 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14987 // global base reg.
14988 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14989 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14991 auto PtrVT = getPointerTy(DAG.getDataLayout());
14992 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14994 SDLoc DL(Op);
14995 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14997 // With PIC, the address is actually $g + Offset.
14998 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14999 Result =
15000 DAG.getNode(ISD::ADD, DL, PtrVT,
15001 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
15002 }
15004 // For symbols that require a load from a stub to get the address, emit the
15005 // load.
15006 if (isGlobalStubReference(OpFlag))
15007 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
15008 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15010 return Result;
15011 }
15013 SDValue
15014 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
15015 // Create the TargetBlockAddressAddress node.
15016 unsigned char OpFlags =
15017 Subtarget.classifyBlockAddressReference();
15018 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
15019 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
15020 SDLoc dl(Op);
15021 auto PtrVT = getPointerTy(DAG.getDataLayout());
15022 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
15023 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
15025 // With PIC, the address is actually $g + Offset.
15026 if (isGlobalRelativeToPICBase(OpFlags)) {
15027 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15028 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15029 }
15031 return Result;
15032 }
15034 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
15035 const SDLoc &dl, int64_t Offset,
15036 SelectionDAG &DAG) const {
15037 // Create the TargetGlobalAddress node, folding in the constant
15038 // offset if it is legal.
15039 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
15040 CodeModel::Model M = DAG.getTarget().getCodeModel();
15041 auto PtrVT = getPointerTy(DAG.getDataLayout());
15042 SDValue Result;
15043 if (OpFlags == X86II::MO_NO_FLAG &&
15044 X86::isOffsetSuitableForCodeModel(Offset, M)) {
15045 // A direct static reference to a global.
15046 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
15047 Offset = 0;
15048 } else {
15049 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
15050 }
15052 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
15054 // With PIC, the address is actually $g + Offset.
15055 if (isGlobalRelativeToPICBase(OpFlags)) {
15056 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
15057 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
15060 // For globals that require a load from a stub to get the address, emit the
15061 // load.
15062 if (isGlobalStubReference(OpFlags))
15063 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
15064 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15066 // If there was a non-zero offset that we didn't fold, create an explicit
15067 // addition for it.
15068 if (Offset != 0)
15069 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
15070 DAG.getConstant(Offset, dl, PtrVT));
15072 return Result;
15073 }
15075 SDValue
15076 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
15077 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
15078 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
15079 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
15080 }
15082 static SDValue
15083 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
15084 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
15085 unsigned char OperandFlags, bool LocalDynamic = false) {
15086 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15087 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15088 SDLoc dl(GA);
15089 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15090 GA->getValueType(0),
15091 GA->getOffset(),
15092 OperandFlags);
15094 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
15095 : X86ISD::TLSADDR;
15097 if (InFlag) {
15098 SDValue Ops[] = { Chain, TGA, *InFlag };
15099 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15100 } else {
15101 SDValue Ops[] = { Chain, TGA };
15102 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
15105 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
15106 MFI.setAdjustsStack(true);
15107 MFI.setHasCalls(true);
15109 SDValue Flag = Chain.getValue(1);
15110 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
15113 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
15115 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15118 SDLoc dl(GA); // ? function entry point might be better
15119 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15120 DAG.getNode(X86ISD::GlobalBaseReg,
15121 SDLoc(), PtrVT), InFlag);
15122 InFlag = Chain.getValue(1);
15124 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
15127 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
15128 static SDValue
15129 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15130 const EVT PtrVT) {
15131 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
15132 X86::RAX, X86II::MO_TLSGD);
15133 }
15135 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
15136 SelectionDAG &DAG,
15137 const EVT PtrVT,
15138 bool is64Bit) {
15139 SDLoc dl(GA);
15141 // Get the start address of the TLS block for this module.
15142 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
15143 .getInfo<X86MachineFunctionInfo>();
15144 MFI->incNumLocalDynamicTLSAccesses();
15146 SDValue Base;
15147 if (is64Bit) {
15148 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
15149 X86II::MO_TLSLD, /*LocalDynamic=*/true);
15150 } else {
15151 SDValue InFlag;
15152 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
15153 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
15154 InFlag = Chain.getValue(1);
15155 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
15156 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
15159 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
15163 unsigned char OperandFlags = X86II::MO_DTPOFF;
15164 unsigned WrapperKind = X86ISD::Wrapper;
15165 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15166 GA->getValueType(0),
15167 GA->getOffset(), OperandFlags);
15168 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15170 // Add x@dtpoff with the base.
15171 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
15174 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
15175 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
15176 const EVT PtrVT, TLSModel::Model model,
15177 bool is64Bit, bool isPIC) {
15178 SDLoc dl(GA);
15180 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
15181 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
15182 is64Bit ? 257 : 256));
15184 SDValue ThreadPointer =
15185 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
15186 MachinePointerInfo(Ptr));
15188 unsigned char OperandFlags = 0;
15189 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
15191 unsigned WrapperKind = X86ISD::Wrapper;
15192 if (model == TLSModel::LocalExec) {
15193 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
15194 } else if (model == TLSModel::InitialExec) {
15195 if (is64Bit) {
15196 OperandFlags = X86II::MO_GOTTPOFF;
15197 WrapperKind = X86ISD::WrapperRIP;
15198 } else {
15199 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
15200 }
15201 } else {
15202 llvm_unreachable("Unexpected model");
15203 }
15205 // emit "addl x@ntpoff,%eax" (local exec)
15206 // or "addl x@indntpoff,%eax" (initial exec)
15207 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
15209 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
15210 GA->getOffset(), OperandFlags);
15211 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
15213 if (model == TLSModel::InitialExec) {
15214 if (isPIC && !is64Bit) {
15215 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
15216 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15217 Offset);
15218 }
15220 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
15221 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
15224 // The address of the thread local variable is the add of the thread
15225 // pointer with the offset of the variable.
15226 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
15230 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
15232 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
15234 if (DAG.getTarget().Options.EmulatedTLS)
15235 return LowerToTLSEmulatedModel(GA, DAG);
15237 const GlobalValue *GV = GA->getGlobal();
15238 auto PtrVT = getPointerTy(DAG.getDataLayout());
15239 bool PositionIndependent = isPositionIndependent();
15241 if (Subtarget.isTargetELF()) {
15242 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
15243 switch (model) {
15244 case TLSModel::GeneralDynamic:
15245 if (Subtarget.is64Bit())
15246 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
15247 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
15248 case TLSModel::LocalDynamic:
15249 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
15250 Subtarget.is64Bit());
15251 case TLSModel::InitialExec:
15252 case TLSModel::LocalExec:
15253 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
15254 PositionIndependent);
15255 }
15256 llvm_unreachable("Unknown TLS model.");
15257 }
15259 if (Subtarget.isTargetDarwin()) {
15260 // Darwin only has one model of TLS. Lower to that.
15261 unsigned char OpFlag = 0;
15262 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
15263 X86ISD::WrapperRIP : X86ISD::Wrapper;
15265 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
15266 // global base reg.
15267 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
15268 if (PIC32)
15269 OpFlag = X86II::MO_TLVP_PIC_BASE;
15270 else
15271 OpFlag = X86II::MO_TLVP;
15273 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
15274 GA->getValueType(0),
15275 GA->getOffset(), OpFlag);
15276 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
15278 // With PIC32, the address is actually $g + Offset.
15279 if (PIC32)
15280 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
15281 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
15282 Offset);
15284 // Lowering the machine isd will make sure everything is in the right
15286 SDValue Chain = DAG.getEntryNode();
15287 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
15288 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15289 SDValue Args[] = { Chain, Offset };
15290 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15291 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15292 DAG.getIntPtrConstant(0, DL, true),
15293 Chain.getValue(1), DL);
15295 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
15296 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15297 MFI.setAdjustsStack(true);
15299 // And our return value (tls address) is in the standard call return value
15301 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15302 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15305 if (Subtarget.isTargetKnownWindowsMSVC() ||
15306 Subtarget.isTargetWindowsItanium() ||
15307 Subtarget.isTargetWindowsGNU()) {
15308 // Just use the implicit TLS architecture
15309 // Need to generate something similar to:
15310 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15312 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15313 // mov rcx, qword [rdx+rcx*8]
15314 // mov eax, .tls$:tlsvar
15315 // [rax+rcx] contains the address
15316 // Windows 64bit: gs:0x58
15317 // Windows 32bit: fs:__tls_array
15319 SDLoc dl(GA);
15320 SDValue Chain = DAG.getEntryNode();
15322 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15323 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15324 // use its literal value of 0x2C.
15325 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15326 ? Type::getInt8PtrTy(*DAG.getContext(),
15327 256)
15328 : Type::getInt32PtrTy(*DAG.getContext(),
15329 257));
15331 SDValue TlsArray = Subtarget.is64Bit()
15332 ? DAG.getIntPtrConstant(0x58, dl)
15333 : (Subtarget.isTargetWindowsGNU()
15334 ? DAG.getIntPtrConstant(0x2C, dl)
15335 : DAG.getExternalSymbol("_tls_array", PtrVT));
15337 SDValue ThreadPointer =
15338 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15340 SDValue res;
15341 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15342 res = ThreadPointer;
15343 } else {
15344 // Load the _tls_index variable
15345 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15346 if (Subtarget.is64Bit())
15347 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15348 MachinePointerInfo(), MVT::i32);
15349 else
15350 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15352 auto &DL = DAG.getDataLayout();
15353 SDValue Scale =
15354 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15355 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15357 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15360 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15362 // Get the offset of start of .tls section
15363 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15364 GA->getValueType(0),
15365 GA->getOffset(), X86II::MO_SECREL);
15366 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15368 // The address of the thread local variable is the add of the thread
15369 // pointer with the offset of the variable.
15370 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15373 llvm_unreachable("TLS not implemented for this target.");
15376 /// Lower SRA_PARTS and friends, which return two i32 values
15377 /// and take a 2 x i32 value to shift plus a shift amount.
15378 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15379 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15380 MVT VT = Op.getSimpleValueType();
15381 unsigned VTBits = VT.getSizeInBits();
15382 SDLoc dl(Op);
15383 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15384 SDValue ShOpLo = Op.getOperand(0);
15385 SDValue ShOpHi = Op.getOperand(1);
15386 SDValue ShAmt = Op.getOperand(2);
15387 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15388 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15389 // during isel.
15390 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15391 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15392 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15393 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15394 : DAG.getConstant(0, dl, VT);
15396 SDValue Tmp2, Tmp3;
15397 if (Op.getOpcode() == ISD::SHL_PARTS) {
15398 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15399 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15400 } else {
15401 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15402 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15403 }
15405 // If the shift amount is larger or equal than the width of a part we can't
15406 // rely on the results of shld/shrd. Insert a test and select the appropriate
15407 // values for large shift amounts.
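// Editor's worked example (assuming a 64-bit shift split into i32 parts, so
// VTBits == 32): for SRL_PARTS with ShAmt == 40, SHRD only sees
// (ShAmt & 31) == 8, which is wrong for the full 64-bit shift. The CMOVs
// built below test (ShAmt & 32) != 0 and select Lo = Hi >> 8 and Hi = 0
// (sign-fill for SRA) instead, matching a true 64-bit shift by 40.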
15408 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15409 DAG.getConstant(VTBits, dl, MVT::i8));
15410 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15411 AndNode, DAG.getConstant(0, dl, MVT::i8));
15413 SDValue Hi, Lo;
15414 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15415 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15416 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15418 if (Op.getOpcode() == ISD::SHL_PARTS) {
15419 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15420 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15421 } else {
15422 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15423 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15424 }
15426 SDValue Ops[2] = { Lo, Hi };
15427 return DAG.getMergeValues(Ops, dl);
15428 }
15430 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15431 SelectionDAG &DAG) const {
15432 SDValue Src = Op.getOperand(0);
15433 MVT SrcVT = Src.getSimpleValueType();
15434 MVT VT = Op.getSimpleValueType();
15436 SDLoc dl(Op);
15437 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15438 if (SrcVT.isVector()) {
15439 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15440 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15441 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15442 DAG.getUNDEF(SrcVT)));
15444 if (SrcVT.getVectorElementType() == MVT::i1) {
15445 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15446 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15447 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15448 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15449 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15450 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15451 }
15452 return SDValue();
15453 }
15455 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15456 "Unknown SINT_TO_FP to lower!");
15458 // These are really Legal; return the operand so the caller accepts it as
15459 // Legal.
15460 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15461 return Op;
15462 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15463 Subtarget.is64Bit()) {
15464 return Op;
15465 }
15467 SDValue ValueToStore = Op.getOperand(0);
15468 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15469 !Subtarget.is64Bit())
15470 // Bitcasting to f64 here allows us to do a single 64-bit store from
15471 // an SSE register, avoiding the store forwarding penalty that would come
15472 // with two 32-bit stores.
15473 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15475 unsigned Size = SrcVT.getSizeInBits()/8;
15476 MachineFunction &MF = DAG.getMachineFunction();
15477 auto PtrVT = getPointerTy(MF.getDataLayout());
15478 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15479 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15480 SDValue Chain = DAG.getStore(
15481 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15482 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15483 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15484 }
15486 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15487 SDValue StackSlot,
15488 SelectionDAG &DAG) const {
15489 // Build the FILD
15490 SDLoc DL(Op);
15491 SDVTList Tys;
15492 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15493 if (useSSE)
15494 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15495 else
15496 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15498 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15500 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15501 MachineMemOperand *MMO;
15502 if (FI) {
15503 int SSFI = FI->getIndex();
15504 MMO = DAG.getMachineFunction().getMachineMemOperand(
15505 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15506 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15507 } else {
15508 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15509 StackSlot = StackSlot.getOperand(1);
15510 }
15511 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15512 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15513 X86ISD::FILD, DL,
15514 Tys, Ops, SrcVT, MMO);
15516 if (useSSE) {
15517 Chain = Result.getValue(1);
15518 SDValue InFlag = Result.getValue(2);
15520 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15521 // shouldn't be necessary except that RFP cannot be live across
15522 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15523 MachineFunction &MF = DAG.getMachineFunction();
15524 unsigned SSFISize = Op.getValueSizeInBits()/8;
15525 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15526 auto PtrVT = getPointerTy(MF.getDataLayout());
15527 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15528 Tys = DAG.getVTList(MVT::Other);
15529 SDValue Ops[] = {
15530 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15531 };
15532 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15533 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15534 MachineMemOperand::MOStore, SSFISize, SSFISize);
15536 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15537 Ops, Op.getValueType(), MMO);
15538 Result = DAG.getLoad(
15539 Op.getValueType(), DL, Chain, StackSlot,
15540 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15541 }
15543 return Result;
15544 }
15546 /// 64-bit unsigned integer to double expansion.
15547 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15548 SelectionDAG &DAG) const {
15549 // This algorithm is not obvious. Here is what we're trying to output:
15550 /*
15551 movq %rax, %xmm0
15552 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15553 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15554 #ifdef __SSE3__
15555 haddpd %xmm0, %xmm0
15556 #else
15557 pshufd $0x4e, %xmm0, %xmm1
15558 addpd %xmm1, %xmm0
15559 #endif
15560 */
15562 SDLoc dl(Op);
15563 LLVMContext *Context = DAG.getContext();
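// Editor's note on why the magic constants work (illustration, not from the
// original source): punpckldq interleaves the two halves of the u64 with c0,
// giving the IEEE bit patterns
//   d0 = { lo32, 0x43300000 } == 2^52 + lo32
//   d1 = { hi32, 0x45300000 } == 2^84 + hi32 * 2^32
// so subtracting c1 = { 2^52, 2^84 } leaves exactly (double)lo32 and
// (double)hi32 * 2^32 in the two lanes, and the horizontal add produces
// (double)(hi32 * 2^32 + lo32), the converted value.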
15565 // Build some magic constants.
15566 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15567 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15568 auto PtrVT = getPointerTy(DAG.getDataLayout());
15569 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15571 SmallVector<Constant*,2> CV1;
15572 CV1.push_back(
15573 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15574 APInt(64, 0x4330000000000000ULL))));
15575 CV1.push_back(
15576 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15577 APInt(64, 0x4530000000000000ULL))));
15578 Constant *C1 = ConstantVector::get(CV1);
15579 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15581 // Load the 64-bit value into an XMM register.
15582 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15583 Op.getOperand(0));
15584 SDValue CLod0 =
15585 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15586 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15587 /* Alignment = */ 16);
15588 SDValue Unpck1 =
15589 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15591 SDValue CLod1 =
15592 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15593 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15594 /* Alignment = */ 16);
15595 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15596 // TODO: Are there any fast-math-flags to propagate here?
15597 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15599 SDValue Result;
15600 if (Subtarget.hasSSE3()) {
15601 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15602 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15603 } else {
15604 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15605 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15606 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15607 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15608 }
15610 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15611 DAG.getIntPtrConstant(0, dl));
15612 }
15614 /// 32-bit unsigned integer to float expansion.
15615 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15616 SelectionDAG &DAG) const {
15617 SDLoc dl(Op);
15618 // FP constant to bias correct the final result.
15619 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15620 MVT::f64);
15622 // Load the 32-bit value into an XMM register.
15623 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15624 Op.getOperand(0));
15626 // Zero out the upper parts of the register.
15627 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15629 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15630 DAG.getBitcast(MVT::v2f64, Load),
15631 DAG.getIntPtrConstant(0, dl));
15633 // Or the load with the bias.
15634 SDValue Or = DAG.getNode(
15635 ISD::OR, dl, MVT::v2i64,
15636 DAG.getBitcast(MVT::v2i64,
15637 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15638 DAG.getBitcast(MVT::v2i64,
15639 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15640 Or =
15641 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15642 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15644 // Subtract the bias.
15645 // TODO: Are there any fast-math-flags to propagate here?
15646 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15648 // Handle final rounding.
15649 MVT DestVT = Op.getSimpleValueType();
15651 if (DestVT.bitsLT(MVT::f64))
15652 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15653 DAG.getIntPtrConstant(0, dl));
15654 if (DestVT.bitsGT(MVT::f64))
15655 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15657 // Otherwise the result is already an f64; no rounding is needed.
15658 return Sub;
15659 }
15661 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15662 const X86Subtarget &Subtarget, SDLoc &DL) {
15663 if (Op.getSimpleValueType() != MVT::v2f64)
15664 return SDValue();
15666 SDValue N0 = Op.getOperand(0);
15667 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15669 // Legalize to v4i32 type.
15670 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15671 DAG.getUNDEF(MVT::v2i32));
15673 if (Subtarget.hasAVX512())
15674 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15676 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15677 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
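// Editor's worked example: for the lane v = 0x89ABCDEF the code below forms
// hi = 0x89AB and lo = 0xCDEF, converts each half with the signed CVTSI2P
// (both fit in 16 bits, so they are non-negative), and recombines as
// 0x89AB * 65536.0 + 0xCDEF == 2309737967.0, the unsigned value of v.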
15678 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15679 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15681 // Two to the power of half-word-size.
15682 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15684 // Clear upper part of LO, lower HI.
15685 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15686 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15688 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15689 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15690 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15692 // Add the two halves.
15693 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15694 }
15696 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15697 const X86Subtarget &Subtarget) {
15698 // The algorithm is the following:
15699 // #ifdef __SSE4_1__
15700 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15701 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15702 // (uint4) 0x53000000, 0xaa);
15703 // #else
15704 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15705 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15706 // #endif
15707 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15708 // return (float4) lo + fhi;
15710 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15711 // reassociate the two FADDs, and if we do that, the algorithm fails
15712 // spectacularly (PR24512).
15713 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15714 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15715 // there's also the MachineCombiner reassociations happening on Machine IR.
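// Editor's worked example for the blend/shift algorithm above: with
// v = 0x80000001,
//   lo = 0x4b000001 viewed as float is 2^23 + 1     (low 16 bits in mantissa)
//   hi = 0x53008000 viewed as float is 2^39 + 2^31  (high 16 bits, scaled)
// so fhi = hi - (2^39 + 2^23) = 2^31 - 2^23, and lo + fhi = 2^31 + 1, the
// correct unsigned value (modulo the final float rounding).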
15716 if (DAG.getTarget().Options.UnsafeFPMath)
15717 return SDValue();
15719 SDLoc DL(Op);
15720 SDValue V = Op->getOperand(0);
15721 MVT VecIntVT = V.getSimpleValueType();
15722 bool Is128 = VecIntVT == MVT::v4i32;
15723 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15724 // If we convert to something else than the supported type, e.g., to v4f64,
15725 // abort early.
15726 if (VecFloatVT != Op->getSimpleValueType(0))
15727 return SDValue();
15729 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15730 "Unsupported custom type");
15732 // In the #ifdef/#else code, we have in common:
15733 // - The vector of constants:
15734 // -- 0x4b000000
15735 // -- 0x53000000
15736 // - A shift:
15737 // -- v >> 16
15739 // Create the splat vector for 0x4b000000.
15740 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15741 // Create the splat vector for 0x53000000.
15742 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15744 // Create the right shift.
15745 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15746 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15748 SDValue Low, High;
15749 if (Subtarget.hasSSE41()) {
15750 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15751 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15752 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15753 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15754 // Low will be bitcasted right away, so do not bother bitcasting back to its
15755 // original type.
15756 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15757 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15758 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15759 // (uint4) 0x53000000, 0xaa);
15760 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15761 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15762 // High will be bitcasted right away, so do not bother bitcasting back to
15763 // its original type.
15764 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15765 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15766 } else {
15767 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15768 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15769 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15770 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15772 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15773 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15774 }
15776 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15777 SDValue VecCstFAdd = DAG.getConstantFP(
15778 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
15780 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15781 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15782 // TODO: Are there any fast-math-flags to propagate here?
15783 SDValue FHigh =
15784 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15785 // return (float4) lo + fhi;
15786 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15787 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15788 }
15790 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15791 SelectionDAG &DAG) const {
15792 SDValue N0 = Op.getOperand(0);
15793 MVT SrcVT = N0.getSimpleValueType();
15794 SDLoc dl(Op);
15796 if (SrcVT.getVectorElementType() == MVT::i1) {
15797 if (SrcVT == MVT::v2i1)
15798 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15799 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15800 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15801 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15802 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15803 }
15805 switch (SrcVT.SimpleTy) {
15806 default:
15807 llvm_unreachable("Custom UINT_TO_FP is not supported!");
15808 case MVT::v2i32:
15809 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15810 case MVT::v4i32:
15811 case MVT::v8i32:
15812 assert(!Subtarget.hasAVX512());
15813 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15814 }
15815 }
15817 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15818 SelectionDAG &DAG) const {
15819 SDValue N0 = Op.getOperand(0);
15820 SDLoc dl(Op);
15821 auto PtrVT = getPointerTy(DAG.getDataLayout());
15823 if (Op.getSimpleValueType().isVector())
15824 return lowerUINT_TO_FP_vec(Op, DAG);
15826 MVT SrcVT = N0.getSimpleValueType();
15827 MVT DstVT = Op.getSimpleValueType();
15829 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15830 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15831 // Conversions from unsigned i32 to f32/f64 are legal,
15832 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15833 return Op;
15834 }
15836 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15837 return LowerUINT_TO_FP_i64(Op, DAG);
15838 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15839 return LowerUINT_TO_FP_i32(Op, DAG);
15840 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15841 return SDValue();
15843 // Make a 64-bit buffer, and use it to build an FILD.
15844 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15845 if (SrcVT == MVT::i32) {
15846 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15847 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15848 StackSlot, MachinePointerInfo());
15849 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15850 OffsetSlot, MachinePointerInfo());
15851 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15852 return Fild;
15853 }
15855 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15856 SDValue ValueToStore = Op.getOperand(0);
15857 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15858 // Bitcasting to f64 here allows us to do a single 64-bit store from
15859 // an SSE register, avoiding the store forwarding penalty that would come
15860 // with two 32-bit stores.
15861 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15862 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15863 MachinePointerInfo());
15864 // For i64 source, we need to add the appropriate power of 2 if the input
15865 // was negative. This is the same as the optimization in
15866 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15867 // we must be careful to do the computation in x87 extended precision, not
15868 // in SSE. (The generic code can't know it's OK to do this, or how to.)
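// Editor's worked example: for the unsigned input 2^63, the stored bits read
// back by FILD denote the signed value -2^63. The sign test below then picks
// the 0x5F800000 fudge word (2^64 as a float) from the constant pool, and
// the x87 add yields -2^63 + 2^64 == 2^63, the intended unsigned result.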
15869 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15870 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15871 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15872 MachineMemOperand::MOLoad, 8, 8);
15874 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15875 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15876 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15877 MVT::i64, MMO);
15879 APInt FF(32, 0x5F800000ULL);
15881 // Check whether the sign bit is set.
15882 SDValue SignSet = DAG.getSetCC(
15883 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15884 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15886 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15887 SDValue FudgePtr = DAG.getConstantPool(
15888 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15890 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15891 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15892 SDValue Four = DAG.getIntPtrConstant(4, dl);
15893 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15894 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15896 // Load the value out, extending it from f32 to f80.
15897 // FIXME: Avoid the extend by constructing the right constant pool?
15898 SDValue Fudge = DAG.getExtLoad(
15899 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15900 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15901 /* Alignment = */ 4);
15902 // Extend everything to 80 bits to force it to be done on x87.
15903 // TODO: Are there any fast-math-flags to propagate here?
15904 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15905 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15906 DAG.getIntPtrConstant(0, dl));
15907 }
15909 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15910 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15911 // just return an <SDValue(), SDValue()> pair.
15912 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15913 // to i16, i32 or i64, and we lower it to a legal sequence.
15914 // If lowered to the final integer result we return a <result, SDValue()> pair.
15915 // Otherwise we lower it to a sequence ending with a FIST, return a
15916 // <FIST, StackSlot> pair, and the caller is responsible for loading
15917 // the final integer result from StackSlot.
15918 std::pair<SDValue,SDValue>
15919 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15920 bool IsSigned, bool IsReplace) const {
15921 SDLoc DL(Op);
15923 EVT DstTy = Op.getValueType();
15924 EVT TheVT = Op.getOperand(0).getValueType();
15925 auto PtrVT = getPointerTy(DAG.getDataLayout());
15927 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15928 // f16 must be promoted before using the lowering in this routine.
15929 // fp128 does not use this lowering.
15930 return std::make_pair(SDValue(), SDValue());
15931 }
15933 // If using FIST to compute an unsigned i64, we'll need some fixup
15934 // to handle values above the maximum signed i64. A FIST is always
15935 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15936 bool UnsignedFixup = !IsSigned &&
15937 DstTy == MVT::i64 &&
15938 (!Subtarget.is64Bit() ||
15939 !isScalarFPTypeInSSEReg(TheVT));
15941 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15942 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15943 // The low 32 bits of the fist result will have the correct uint32 result.
15944 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15945 DstTy = MVT::i64;
15946 }
15948 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15949 DstTy.getSimpleVT() >= MVT::i16 &&
15950 "Unknown FP_TO_INT to lower!");
15952 // These are really Legal.
15953 if (DstTy == MVT::i32 &&
15954 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15955 return std::make_pair(SDValue(), SDValue());
15956 if (Subtarget.is64Bit() &&
15957 DstTy == MVT::i64 &&
15958 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15959 return std::make_pair(SDValue(), SDValue());
15961 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15962 // stack slot.
15963 MachineFunction &MF = DAG.getMachineFunction();
15964 unsigned MemSize = DstTy.getSizeInBits()/8;
15965 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15966 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15968 unsigned Opc;
15969 switch (DstTy.getSimpleVT().SimpleTy) {
15970 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15971 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15972 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15973 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15974 }
15976 SDValue Chain = DAG.getEntryNode();
15977 SDValue Value = Op.getOperand(0);
15978 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15980 if (UnsignedFixup) {
15982 // Conversion to unsigned i64 is implemented with a select,
15983 // depending on whether the source value fits in the range
15984 // of a signed i64. Let Thresh be the FP equivalent of
15985 // 0x8000000000000000ULL.
15987 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15988 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15989 // Fist-to-mem64 FistSrc
15990 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15991 // to XOR'ing the high 32 bits with Adjust.
15993 // Being a power of 2, Thresh is exactly representable in all FP formats.
15994 // For X87 we'd like to use the smallest FP type for this constant, but
15995 // for DAG type consistency we have to match the FP operand type.
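// Editor's worked example: converting Value = 2^63 + 2048.0 (above the
// signed i64 range), both selects take their "else" arm:
// FistSrc = Value - 2^63 converts to the i64 2048, and XOR'ing the high half
// with Adjust = 0x80000000 restores the unsigned result 0x8000000000000800.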
15997 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15998 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15999 bool LosesInfo = false;
16000 if (TheVT == MVT::f64)
16001 // The rounding mode is irrelevant as the conversion should be exact.
16002 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
16003 &LosesInfo);
16004 else if (TheVT == MVT::f80)
16005 Status = Thresh.convert(APFloat::x87DoubleExtended(),
16006 APFloat::rmNearestTiesToEven, &LosesInfo);
16008 assert(Status == APFloat::opOK && !LosesInfo &&
16009 "FP conversion should have been exact");
16011 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
16013 SDValue Cmp = DAG.getSetCC(DL,
16014 getSetCCResultType(DAG.getDataLayout(),
16015 *DAG.getContext(), TheVT),
16016 Value, ThreshVal, ISD::SETLT);
16017 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
16018 DAG.getConstant(0, DL, MVT::i32),
16019 DAG.getConstant(0x80000000, DL, MVT::i32));
16020 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
16021 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
16022 *DAG.getContext(), TheVT),
16023 Value, ThreshVal, ISD::SETLT);
16024 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
16025 }
16027 // FIXME This causes a redundant load/store if the SSE-class value is already
16028 // in memory, such as if it is on the callstack.
16029 if (isScalarFPTypeInSSEReg(TheVT)) {
16030 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
16031 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
16032 MachinePointerInfo::getFixedStack(MF, SSFI));
16033 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
16034 SDValue Ops[] = {
16035 Chain, StackSlot, DAG.getValueType(TheVT)
16036 };
16038 MachineMemOperand *MMO =
16039 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16040 MachineMemOperand::MOLoad, MemSize, MemSize);
16041 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
16042 Chain = Value.getValue(1);
16043 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
16044 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
16045 }
16047 MachineMemOperand *MMO =
16048 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
16049 MachineMemOperand::MOStore, MemSize, MemSize);
16051 if (UnsignedFixup) {
16053 // Insert the FIST, load its result as two i32's,
16054 // and XOR the high i32 with Adjust.
16056 SDValue FistOps[] = { Chain, Value, StackSlot };
16057 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16058 FistOps, DstTy, MMO);
16060 SDValue Low32 =
16061 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
16062 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
16064 SDValue High32 =
16065 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
16066 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
16068 if (Subtarget.is64Bit()) {
16069 // Join High32 and Low32 into a 64-bit result.
16070 // (High32 << 32) | Low32
16071 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
16072 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
16073 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
16074 DAG.getConstant(32, DL, MVT::i8));
16075 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
16076 return std::make_pair(Result, SDValue());
16077 }
16079 SDValue ResultOps[] = { Low32, High32 };
16081 SDValue pair = IsReplace
16082 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
16083 : DAG.getMergeValues(ResultOps, DL);
16084 return std::make_pair(pair, SDValue());
16085 }
16086 // Build the FP_TO_INT*_IN_MEM
16087 SDValue Ops[] = { Chain, Value, StackSlot };
16088 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
16089 Ops, DstTy, MMO);
16090 return std::make_pair(FIST, StackSlot);
16091 }
16094 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
16095 const X86Subtarget &Subtarget) {
16096 MVT VT = Op->getSimpleValueType(0);
16097 SDValue In = Op->getOperand(0);
16098 MVT InVT = In.getSimpleValueType();
16099 SDLoc dl(Op);
16101 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
16102 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
16104 // Optimize vectors in AVX mode:
16105 //
16106 // v8i16 -> v8i32
16107 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
16108 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
16109 // Concat upper and lower parts.
16110 //
16111 // v4i32 -> v4i64
16112 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
16113 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
16114 // Concat upper and lower parts.
16117 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
16118 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
16119 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
16120 return SDValue();
16122 if (Subtarget.hasInt256())
16123 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
16125 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
16126 SDValue Undef = DAG.getUNDEF(InVT);
16127 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
16128 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16129 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
16131 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
16132 VT.getVectorNumElements()/2);
16134 OpLo = DAG.getBitcast(HVT, OpLo);
16135 OpHi = DAG.getBitcast(HVT, OpHi);
16137 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
16138 }
16140 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
16141 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
16142 MVT VT = Op->getSimpleValueType(0);
16143 SDValue In = Op->getOperand(0);
16144 MVT InVT = In.getSimpleValueType();
16145 SDLoc DL(Op);
16146 unsigned NumElts = VT.getVectorNumElements();
16148 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
16149 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
16150 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
16152 if (InVT.getVectorElementType() != MVT::i1)
16153 return SDValue();
16155 // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
16156 MVT ExtVT = VT;
16157 if (!VT.is512BitVector() && !Subtarget.hasVLX())
16158 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16160 SDValue One =
16161 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
16162 SDValue Zero =
16163 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
16165 SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
16166 if (VT == ExtVT)
16167 return SelectedVal;
16168 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
16169 }
16171 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16172 SelectionDAG &DAG) {
16173 if (Subtarget.hasFp256())
16174 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
16175 return Res;
16177 return SDValue();
16178 }
16180 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
16181 SelectionDAG &DAG) {
16182 SDLoc DL(Op);
16183 MVT VT = Op.getSimpleValueType();
16184 SDValue In = Op.getOperand(0);
16185 MVT SVT = In.getSimpleValueType();
16187 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
16188 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
16190 if (Subtarget.hasFp256())
16191 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
16192 return Res;
16194 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
16195 VT.getVectorNumElements() != SVT.getVectorNumElements());
16196 return SDValue();
16197 }
16199 /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
16200 /// It makes use of the fact that vectors with enough leading sign/zero bits
16201 /// prevent the PACKSS/PACKUS from saturating the results.
16202 /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
16203 /// within each 128-bit lane.
16204 static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
16205 const SDLoc &DL, SelectionDAG &DAG,
16206 const X86Subtarget &Subtarget) {
16207 assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
16208 "Unexpected PACK opcode");
16210 // Requires SSE2 but AVX512 has fast truncate.
16211 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
16212 return SDValue();
16214 EVT SrcVT = In.getValueType();
16216 // No truncation required, we might get here due to recursive calls.
16217 if (SrcVT == DstVT)
16218 return In;
16220 // We only support vector truncation to 128bits or greater from a
16221 // 256bits or greater source.
16222 unsigned DstSizeInBits = DstVT.getSizeInBits();
16223 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
16224 if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
16225 return SDValue();
16227 LLVMContext &Ctx = *DAG.getContext();
16228 unsigned NumElems = SrcVT.getVectorNumElements();
16229 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
16230 assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
16232 EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
16234 // Extract lower/upper subvectors.
16235 unsigned NumSubElts = NumElems / 2;
16236 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16237 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
16239 // Pack to the largest type possible:
16240 // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
16241 EVT InVT = MVT::i16, OutVT = MVT::i8;
16242 if (DstVT.getScalarSizeInBits() > 8 &&
16243 (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
16244 InVT = MVT::i32;
16245 OutVT = MVT::i16;
16246 }
16248 unsigned SubSizeInBits = SrcSizeInBits / 2;
16249 InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
16250 OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
16252 // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
16253 if (SrcVT.is256BitVector()) {
16254 Lo = DAG.getBitcast(InVT, Lo);
16255 Hi = DAG.getBitcast(InVT, Hi);
16256 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16257 return DAG.getBitcast(DstVT, Res);
16258 }
16260 // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
16261 // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
16262 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
16263 Lo = DAG.getBitcast(InVT, Lo);
16264 Hi = DAG.getBitcast(InVT, Hi);
16265 SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
16267 // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
16268 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
16269 Res = DAG.getBitcast(MVT::v4i64, Res);
16270 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
16272 if (DstVT.is256BitVector())
16273 return DAG.getBitcast(DstVT, Res);
16275 // If 512bit -> 128bit truncate another stage.
16276 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16277 Res = DAG.getBitcast(PackedVT, Res);
16278 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16279 }
16281 // Recursively pack lower/upper subvectors, concat result and pack again.
16282 assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
16283 EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
16284 Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
16285 Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
16287 PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
16288 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16289 return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
16290 }
16292 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16293 const X86Subtarget &Subtarget) {
16295 SDLoc DL(Op);
16296 MVT VT = Op.getSimpleValueType();
16297 SDValue In = Op.getOperand(0);
16298 MVT InVT = In.getSimpleValueType();
16300 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16302 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16303 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16304 if (InVT.getScalarSizeInBits() <= 16) {
16305 if (Subtarget.hasBWI()) {
16306 // legal, will go to VPMOVB2M, VPMOVW2M
16307 // Shift packed bytes not supported natively, bitcast to word
16308 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16309 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
16310 DAG.getBitcast(ExtVT, In),
16311 DAG.getConstant(ShiftInx, DL, ExtVT));
16312 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
16313 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
16314 }
16315 // Use TESTD/Q, extended vector to packed dword/qword.
16316 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16317 "Unexpected vector type.");
16318 unsigned NumElts = InVT.getVectorNumElements();
16319 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16320 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16321 InVT = ExtVT;
16322 ShiftInx = InVT.getScalarSizeInBits() - 1;
16323 }
16325 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
16326 DAG.getConstant(ShiftInx, DL, InVT));
16327 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
16328 }
16330 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16331 SDLoc DL(Op);
16332 MVT VT = Op.getSimpleValueType();
16333 SDValue In = Op.getOperand(0);
16334 MVT InVT = In.getSimpleValueType();
16335 unsigned InNumEltBits = InVT.getScalarSizeInBits();
16337 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16338 "Invalid TRUNCATE operation");
16340 if (VT.getVectorElementType() == MVT::i1)
16341 return LowerTruncateVecI1(Op, DAG, Subtarget);
16343 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16344 if (Subtarget.hasAVX512()) {
16345 // word to byte only under BWI
16346 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16347 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16348 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16349 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16350 }
16352 // Truncate with PACKSS if we are truncating a vector with sign-bits that
16353 // extend all the way to the packed/truncated value.
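// Editor's example: for v8i32 -> v8i16, NumPackedBits is 16, so the check
// below demands more than 32 - 16 = 16 sign bits per element, i.e. every
// i32 element is already a sign-extended i16; PACKSSDW then cannot saturate
// and the truncation is exact.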
16354 unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
16355 if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
16356 if (SDValue V =
16357 truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
16358 return V;
16360 // Truncate with PACKUS if we are truncating a vector with leading zero bits
16361 // that extend all the way to the packed/truncated value.
16362 // Pre-SSE41 we can only use PACKUSWB.
16363 KnownBits Known;
16364 DAG.computeKnownBits(In, Known);
16365 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
16366 if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
16367 if (SDValue V =
16368 truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
16369 return V;
16371 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16372 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16373 if (Subtarget.hasInt256()) {
16374 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16375 In = DAG.getBitcast(MVT::v8i32, In);
16376 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16377 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16378 DAG.getIntPtrConstant(0, DL));
16379 }
16381 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16382 DAG.getIntPtrConstant(0, DL));
16383 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16384 DAG.getIntPtrConstant(2, DL));
16385 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16386 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16387 static const int ShufMask[] = {0, 2, 4, 6};
16388 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16389 }
16391 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16392 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16393 if (Subtarget.hasInt256()) {
16394 In = DAG.getBitcast(MVT::v32i8, In);
16396 // The PSHUFB mask:
16397 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
16398 -1, -1, -1, -1, -1, -1, -1, -1,
16399 16, 17, 20, 21, 24, 25, 28, 29,
16400 -1, -1, -1, -1, -1, -1, -1, -1 };
16401 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16402 In = DAG.getBitcast(MVT::v4i64, In);
16404 static const int ShufMask2[] = {0, 2, -1, -1};
16405 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
16406 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16407 DAG.getIntPtrConstant(0, DL));
16408 return DAG.getBitcast(VT, In);
16409 }
16411 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16412 DAG.getIntPtrConstant(0, DL));
16414 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16415 DAG.getIntPtrConstant(4, DL));
16417 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16418 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16420 // The PSHUFB mask:
16421 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
16422 -1, -1, -1, -1, -1, -1, -1, -1};
16424 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16425 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16427 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16428 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16430 // The MOVLHPS Mask:
16431 static const int ShufMask2[] = {0, 1, 4, 5};
16432 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16433 return DAG.getBitcast(MVT::v8i16, res);
16434 }
16436 // Handle truncation of V256 to V128 using shuffles.
16437 if (!VT.is128BitVector() || !InVT.is256BitVector())
16438 return SDValue();
16440 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16442 unsigned NumElems = VT.getVectorNumElements();
16443 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16445 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16446 // Prepare truncation shuffle mask
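// Editor's example: for a v4i64 -> v4i32 truncate, In is bitcast to v8i32
// and MaskVec becomes {0, 2, 4, 6, -1, -1, -1, -1}, picking the low dword
// of each qword (little endian); the low 128 bits of the shuffle are then
// extracted as the result.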
16447 for (unsigned i = 0; i != NumElems; ++i)
16448 MaskVec[i] = i * 2;
16449 In = DAG.getBitcast(NVT, In);
16450 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16451 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16452 DAG.getIntPtrConstant(0, DL));
16453 }
16455 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16456 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16457 MVT VT = Op.getSimpleValueType();
16458 SDLoc dl(Op);
16459 if (VT.isVector()) {
16460 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16461 SDValue Src = Op.getOperand(0);
16463 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16464 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16465 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16466 DAG.getUNDEF(MVT::v2f32)));
16467 }
16469 return SDValue();
16470 }
16472 assert(!VT.isVector());
16474 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16475 IsSigned, /*IsReplace=*/ false);
16476 SDValue FIST = Vals.first, StackSlot = Vals.second;
16477 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16478 if (!FIST.getNode())
16479 return Op;
16481 if (StackSlot.getNode())
16482 // Load the result.
16483 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16485 // The node is the result.
16486 return FIST;
16487 }
16489 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16490 SDLoc DL(Op);
16491 MVT VT = Op.getSimpleValueType();
16492 SDValue In = Op.getOperand(0);
16493 MVT SVT = In.getSimpleValueType();
16495 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16497 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16498 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16499 In, DAG.getUNDEF(SVT)));
16500 }
16502 /// The only differences between FABS and FNEG are the mask and the logic op.
16503 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16504 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16505 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16506 "Wrong opcode for lowering FABS or FNEG.");
16508 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16510 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16511 // into an FNABS. We'll lower the FABS after that if it is still in use.
16512 if (IsFABS)
16513 for (SDNode *User : Op->uses())
16514 if (User->getOpcode() == ISD::FNEG)
16515 return Op;
16518 MVT VT = Op.getSimpleValueType();
16519 SDLoc dl(Op);
16520 bool IsF128 = (VT == MVT::f128);
16522 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16523 // decide if we should generate a 16-byte constant mask when we only need 4 or
16524 // 8 bytes for the scalar case.
16526 MVT LogicVT;
16527 MVT EltVT;
16529 if (VT.isVector()) {
16530 LogicVT = VT;
16531 EltVT = VT.getVectorElementType();
16532 } else if (IsF128) {
16533 // SSE instructions are used for optimized f128 logical operations.
16534 LogicVT = MVT::f128;
16535 EltVT = MVT::f32;
16536 } else {
16537 // There are no scalar bitwise logical SSE/AVX instructions, so we
16538 // generate a 16-byte vector constant and logic op even for the scalar case.
16539 // Using a 16-byte mask allows folding the load of the mask with
16540 // the logic op, so it can save (~4 bytes) on code size.
16541 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16542 EltVT = VT;
16543 }
16545 unsigned EltBits = EltVT.getSizeInBits();
16546 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
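// Editor's example: for f32, FABS lowers to FAND with the splat 0x7FFFFFFF
// (clear the sign bit) and FNEG to FXOR with 0x80000000 (flip it); the
// FNABS fold below uses FOR with the same 0x80000000 mask to force the sign
// bit on.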
16547 APInt MaskElt =
16548 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16549 const fltSemantics &Sem =
16550 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16551 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16552 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16554 SDValue Op0 = Op.getOperand(0);
16555 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16556 unsigned LogicOp =
16557 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16558 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16560 if (VT.isVector() || IsF128)
16561 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16563 // For the scalar case extend to a 128-bit vector, perform the logic op,
16564 // and extract the scalar result back out.
16565 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16566 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16567 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16568 DAG.getIntPtrConstant(0, dl));
16569 }
16571 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16572 SDValue Mag = Op.getOperand(0);
16573 SDValue Sign = Op.getOperand(1);
16574 SDLoc dl(Op);
16576 // If the sign operand is smaller, extend it first.
16577 MVT VT = Op.getSimpleValueType();
16578 if (Sign.getSimpleValueType().bitsLT(VT))
16579 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16581 // And if it is bigger, shrink it first.
16582 if (Sign.getSimpleValueType().bitsGT(VT))
16583 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16585 // At this point the operands and the result should have the same
16586 // type, and that won't be f80 since that is not custom lowered.
16587 bool IsF128 = (VT == MVT::f128);
16588 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16589 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16590 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16591 "Unexpected type in LowerFCOPYSIGN");
16593 MVT EltVT = VT.getScalarType();
16594 const fltSemantics &Sem =
16595 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16596 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16598 // Perform all scalar logic operations as 16-byte vectors because there are no
16599 // scalar FP logic instructions in SSE.
16600 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16601 // unnecessary splats, but we might miss load folding opportunities. Should
16602 // this decision be based on OptimizeForSize?
16603 bool IsFakeVector = !VT.isVector() && !IsF128;
16604 MVT LogicVT = VT;
16605 if (IsFakeVector)
16606 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16608 // The mask constants are automatically splatted for vector types.
16609 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16610 SDValue SignMask = DAG.getConstantFP(
16611 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16612 SDValue MagMask = DAG.getConstantFP(
16613 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16615 // First, clear all bits but the sign bit from the second operand (sign).
16616 if (IsFakeVector)
16617 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16618 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16620 // Next, clear the sign bit from the first operand (magnitude).
16621 // TODO: If we had general constant folding for FP logic ops, this check
16622 // wouldn't be necessary.
16623 SDValue MagBits;
16624 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16625 APFloat APF = Op0CN->getValueAPF();
16626 APF.clearSign();
16627 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16628 } else {
16629 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16630 if (IsFakeVector)
16631 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16632 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16633 }
16635 // OR the magnitude value with the sign bit.
16636 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16637 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16638 DAG.getIntPtrConstant(0, dl));
16639 }
16641 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16642 SDValue N0 = Op.getOperand(0);
16643 SDLoc dl(Op);
16644 MVT VT = Op.getSimpleValueType();
16646 MVT OpVT = N0.getSimpleValueType();
16647 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16648 "Unexpected type for FGETSIGN");
16650 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16651 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16652 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16653 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16654 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16655 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16656 return Res;
16657 }
16659 // Check whether an OR'd tree is PTEST-able.
16660 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16661 SelectionDAG &DAG) {
16662 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
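// Editor's illustration of the pattern handled here: IR such as
//   %b = bitcast <4 x i64> %v to i256
//   %c = icmp eq i256 %b, 0
// legalizes into an OR tree over extract_vector_elt nodes; if every element
// of every source vector feeds the tree, the whole compare can be emitted
// as one PTEST instead of scalar ORs and a CMP.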
16664 if (!Subtarget.hasSSE41())
16665 return SDValue();
16667 if (!Op->hasOneUse())
16668 return SDValue();
16670 SDNode *N = Op.getNode();
16671 SDLoc DL(N);
16673 SmallVector<SDValue, 8> Opnds;
16674 DenseMap<SDValue, unsigned> VecInMap;
16675 SmallVector<SDValue, 8> VecIns;
16676 EVT VT = MVT::Other;
16678 // Recognize a special case where a vector is casted into wide integer to
16679 // test all 0s.
16680 Opnds.push_back(N->getOperand(0));
16681 Opnds.push_back(N->getOperand(1));
16683 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16684 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16685 // BFS traverse all OR'd operands.
16686 if (I->getOpcode() == ISD::OR) {
16687 Opnds.push_back(I->getOperand(0));
16688 Opnds.push_back(I->getOperand(1));
16689 // Re-evaluate the number of nodes to be traversed.
16690 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16691 continue;
16692 }
16694 // Quit if a non-EXTRACT_VECTOR_ELT
16695 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16696 return SDValue();
16698 // Quit if without a constant index.
16699 SDValue Idx = I->getOperand(1);
16700 if (!isa<ConstantSDNode>(Idx))
16701 return SDValue();
16703 SDValue ExtractedFromVec = I->getOperand(0);
16704 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16705 if (M == VecInMap.end()) {
16706 VT = ExtractedFromVec.getValueType();
16707 // Quit if not 128/256-bit vector.
16708 if (!VT.is128BitVector() && !VT.is256BitVector())
16709 return SDValue();
16710 // Quit if not the same type.
16711 if (VecInMap.begin() != VecInMap.end() &&
16712 VT != VecInMap.begin()->first.getValueType())
16713 return SDValue();
16714 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16715 VecIns.push_back(ExtractedFromVec);
16716 }
16717 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16718 }
16720 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16721 "Not extracted from 128-/256-bit vector.");
16723 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16725 for (DenseMap<SDValue, unsigned>::const_iterator
16726 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16727 // Quit if not all elements are used.
16728 if (I->second != FullMask)
16729 return SDValue();
16730 }
16732 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16734 // Cast all vectors into TestVT for PTEST.
16735 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16736 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16738 // If more than one full vector is evaluated, OR them first before PTEST.
16739 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16740 // Each iteration will OR 2 nodes and append the result until there is only
16741 // 1 node left, i.e. the final OR'd value of all vectors.
16742 SDValue LHS = VecIns[Slot];
16743 SDValue RHS = VecIns[Slot + 1];
16744 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16745 }
16747 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16748 }
16750 /// \brief return true if \c Op has a use that doesn't just read flags.
16751 static bool hasNonFlagsUse(SDValue Op) {
16752 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16753 ++UI) {
16754 SDNode *User = *UI;
16755 unsigned UOpNo = UI.getOperandNo();
16756 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16757 // Look past truncate.
16758 UOpNo = User->use_begin().getOperandNo();
16759 User = *User->use_begin();
16760 }
16762 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16763 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16764 return true;
16765 }
16766 return false;
16767 }
16769 // Emit KTEST instruction for bit vectors on AVX-512
16770 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16771 const X86Subtarget &Subtarget) {
16772 if (Op.getOpcode() == ISD::BITCAST) {
16773 auto hasKTEST = [&](MVT VT) {
16774 unsigned SizeInBits = VT.getSizeInBits();
16775 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16776 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16777 };
16778 SDValue Op0 = Op.getOperand(0);
16779 MVT Op0VT = Op0.getValueType().getSimpleVT();
16780 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16781 hasKTEST(Op0VT))
16782 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16783 }
16784 return SDValue();
16785 }
16787 /// Emit nodes that will be selected as "test Op0,Op0", or something
16788 /// equivalent.
16789 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16790 SelectionDAG &DAG) const {
16791 if (Op.getValueType() == MVT::i1) {
16792 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16793 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16794 DAG.getConstant(0, dl, MVT::i8));
16795 }
16796 // CF and OF aren't always set the way we want. Determine which
16797 // of these we need.
16798 bool NeedCF = false;
16799 bool NeedOF = false;
16800 switch (X86CC) {
16801 default: break;
16802 case X86::COND_A: case X86::COND_AE:
16803 case X86::COND_B: case X86::COND_BE:
16804 NeedCF = true;
16805 break;
16806 case X86::COND_G: case X86::COND_GE:
16807 case X86::COND_L: case X86::COND_LE:
16808 case X86::COND_O: case X86::COND_NO: {
16809 // Check if we really need to set the
16810 // Overflow flag. If NoSignedWrap is present
16811 // that is not actually needed.
16812 switch (Op->getOpcode()) {
16813 case ISD::ADD:
16814 case ISD::SUB:
16815 case ISD::MUL:
16816 case ISD::SHL:
16817 if (Op.getNode()->getFlags().hasNoSignedWrap())
16818 break;
16819 LLVM_FALLTHROUGH;
16820 default:
16821 NeedOF = true;
16822 break;
16823 }
16824 break;
16825 }
16826 }
16827 // See if we can use the EFLAGS value from the operand instead of
16828 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16829 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16830 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16831 // Emit KTEST for bit vectors
16832 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16833 return Node;
16834 // Emit a CMP with 0, which is the TEST pattern.
16835 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16836 DAG.getConstant(0, dl, Op.getValueType()));
16837 }
16838 unsigned Opcode = 0;
16839 unsigned NumOperands = 0;
16841 // Truncate operations may prevent the merge of the SETCC instruction
16842 // and the arithmetic instruction before it. Attempt to truncate the operands
16843 // of the arithmetic instruction and use a reduced bit-width instruction.
16844 bool NeedTruncation = false;
16845 SDValue ArithOp = Op;
16846 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16847 SDValue Arith = Op->getOperand(0);
16848 // Both the trunc and the arithmetic op need to have one user each.
16849 if (Arith->hasOneUse())
16850 switch (Arith.getOpcode()) {
16851 default: break;
16852 case ISD::ADD:
16853 case ISD::SUB:
16854 case ISD::AND:
16855 case ISD::OR:
16856 case ISD::XOR: {
16857 NeedTruncation = true;
16858 ArithOp = Arith;
16859 }
16860 }
16861 }
16863 // Sometimes flags can be set either with an AND or with an SRL/SHL
16864 // instruction. SRL/SHL variant should be preferred for masks longer than this
16865 // number of bits.
16866 const int ShiftToAndMaxMaskWidth = 32;
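// Editor's example: a zero test of (x & 0xFF00000000000000) would need a
// 64-bit TEST immediate, but (x >> 56) sets the same ZF with an 8-bit shift
// amount, so wide masks are rewritten as shifts below while masks that fit
// a signed 32-bit immediate keep the AND/TEST form.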
16867 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16869 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16870 // which may be the result of a CAST. We use the variable 'Op', which is the
16871 // non-casted variable when we check for possible users.
16872 switch (ArithOp.getOpcode()) {
16873 case ISD::ADD:
16874 // We only want to rewrite this as a target-specific node with attached
16875 // flags if there is a reasonable chance of either using that to do custom
16876 // instructions selection that can fold some of the memory operands, or if
16877 // only the flags are used. If there are other uses, leave the node alone
16878 // and emit a test instruction.
16879 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16880 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16881 if (UI->getOpcode() != ISD::CopyToReg &&
16882 UI->getOpcode() != ISD::SETCC &&
16883 UI->getOpcode() != ISD::STORE)
16884 goto default_case;
16886 if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16887 // An add of one will be selected as an INC.
16888 if (C->isOne() &&
16889 (!Subtarget.slowIncDec() ||
16890 DAG.getMachineFunction().getFunction()->optForSize())) {
16891 Opcode = X86ISD::INC;
16892 NumOperands = 1;
16893 break;
16894 }
16896 // An add of negative one (subtract of one) will be selected as a DEC.
16897 if (C->isAllOnesValue() &&
16898 (!Subtarget.slowIncDec() ||
16899 DAG.getMachineFunction().getFunction()->optForSize())) {
16900 Opcode = X86ISD::DEC;
16901 NumOperands = 1;
16902 break;
16903 }
16904 }
16906 // Otherwise use a regular EFLAGS-setting add.
16907 Opcode = X86ISD::ADD;
16908 NumOperands = 2;
16909 break;
16910 case ISD::SHL:
16911 case ISD::SRL:
16912 // If we have a constant logical shift that's only used in a comparison
16913 // against zero turn it into an equivalent AND. This allows turning it into
16914 // a TEST instruction later.
16915 if (ZeroCheck && Op->hasOneUse() &&
16916 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16917 EVT VT = Op.getValueType();
16918 unsigned BitWidth = VT.getSizeInBits();
16919 unsigned ShAmt = Op->getConstantOperandVal(1);
16920 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16921 break;
16922 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16923 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16924 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16925 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16926 break;
16927 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16928 DAG.getConstant(Mask, dl, VT));
16929 }
16930 break;
16932 case ISD::AND:
16933 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16934 // because a TEST instruction will be better. However, AND should be
16935 // preferred if the instruction can be combined into ANDN.
16936 if (!hasNonFlagsUse(Op)) {
16937 SDValue Op0 = ArithOp->getOperand(0);
16938 SDValue Op1 = ArithOp->getOperand(1);
16939 EVT VT = ArithOp.getValueType();
16940 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16941 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16942 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16944 // If we cannot select an ANDN instruction, check if we can replace
16945 // AND+IMM64 with a shift before giving up. This is possible for masks
16946 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
16947 if (!isProperAndn) {
16948 if (!ZeroCheck)
16949 break;
16951 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16952 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16953 if (!CN)
16954 break;
16956 const APInt &Mask = CN->getAPIntValue();
16957 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16958 break; // Prefer TEST instruction.
16960 unsigned BitWidth = Mask.getBitWidth();
16961 unsigned LeadingOnes = Mask.countLeadingOnes();
16962 unsigned TrailingZeros = Mask.countTrailingZeros();
16964 if (LeadingOnes + TrailingZeros == BitWidth) {
16965 assert(TrailingZeros < VT.getSizeInBits() &&
16966 "Shift amount should be less than the type width");
16967 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16968 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16969 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16970 break;
16971 }
16973 unsigned LeadingZeros = Mask.countLeadingZeros();
16974 unsigned TrailingOnes = Mask.countTrailingOnes();
16976 if (LeadingZeros + TrailingOnes == BitWidth) {
16977 assert(LeadingZeros < VT.getSizeInBits() &&
16978 "Shift amount should be less than the type width");
16979 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16980 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16981 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16982 break;
16983 }
16985 break;
16986 }
16987 }
16988 LLVM_FALLTHROUGH;
16989 case ISD::SUB:
16990 case ISD::OR:
16991 case ISD::XOR:
16992 // Similar to ISD::ADD above, check if the uses will preclude useful
16993 // lowering of the target-specific node.
16994 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16995 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16996 if (UI->getOpcode() != ISD::CopyToReg &&
16997 UI->getOpcode() != ISD::SETCC &&
16998 UI->getOpcode() != ISD::STORE)
16999 goto default_case;
17001 // Otherwise use a regular EFLAGS-setting instruction.
17002 switch (ArithOp.getOpcode()) {
17003 default: llvm_unreachable("unexpected operator!");
17004 case ISD::SUB: Opcode = X86ISD::SUB; break;
17005 case ISD::XOR: Opcode = X86ISD::XOR; break;
17006 case ISD::AND: Opcode = X86ISD::AND; break;
17007 case ISD::OR: {
17008 if (!NeedTruncation && ZeroCheck) {
17009 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
17010 return EFLAGS;
17011 }
17012 Opcode = X86ISD::OR;
17013 break;
17014 }
17015 }
17017 NumOperands = 2;
17018 break;
17019 case X86ISD::ADD:
17020 case X86ISD::SUB:
17021 case X86ISD::INC:
17022 case X86ISD::DEC:
17023 case X86ISD::OR:
17024 case X86ISD::XOR:
17025 case X86ISD::AND:
17026 return SDValue(Op.getNode(), 1);
17027 default:
17028 default_case:
17029 break;
17030 }
17032 // If we found that truncation is beneficial, perform the truncation and
17033 // update 'Op'.
17034 if (NeedTruncation) {
17035 EVT VT = Op.getValueType();
17036 SDValue WideVal = Op->getOperand(0);
17037 EVT WideVT = WideVal.getValueType();
17038 unsigned ConvertedOp = 0;
17039 // Use a target machine opcode to prevent further DAGCombine
17040 // optimizations that may separate the arithmetic operations
17041 // from the setcc node.
17042 switch (WideVal.getOpcode()) {
17043 default: break;
17044 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
17045 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
17046 case ISD::AND: ConvertedOp = X86ISD::AND; break;
17047 case ISD::OR: ConvertedOp = X86ISD::OR; break;
17048 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
17049 }
17051 if (ConvertedOp) {
17052 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17053 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
17054 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
17055 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
17056 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
17057 }
17058 }
17059 }
17061 if (Opcode == 0) {
17062 // Emit KTEST for bit vectors
17063 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
17064 return Node;
17066 // Emit a CMP with 0, which is the TEST pattern.
17067 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
17068 DAG.getConstant(0, dl, Op.getValueType()));
17069 }
17070 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
17071 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
17073 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
17074 DAG.ReplaceAllUsesWith(Op, New);
17075 return SDValue(New.getNode(), 1);
17076 }
17078 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
17079 /// equivalent.
17080 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
17081 const SDLoc &dl, SelectionDAG &DAG) const {
17082 if (isNullConstant(Op1))
17083 return EmitTest(Op0, X86CC, dl, DAG);
17085 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
17086 "Unexpected comparison operation for MVT::i1 operands");
17088 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
17089 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
17090 // Only promote the compare up to I32 if it is a 16 bit operation
17091 // with an immediate. 16 bit immediates are to be avoided.
17092 if ((Op0.getValueType() == MVT::i16 &&
17093 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
17094 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
17095 !Subtarget.isAtom()) {
17096 unsigned ExtendOp =
17097 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17098 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
17099 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
17100 }
17101 // Use SUB instead of CMP to enable CSE between SUB and CMP.
17102 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
17103 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
17104 return SDValue(Sub.getNode(), 1);
17105 }
17106 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
17107 }
17109 /// Convert a comparison if required by the subtarget.
17110 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
17111 SelectionDAG &DAG) const {
17112 // If the subtarget does not support the FUCOMI instruction, floating-point
17113 // comparisons have to be converted.
17114 if (Subtarget.hasCMov() ||
17115 Cmp.getOpcode() != X86ISD::CMP ||
17116 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
17117 !Cmp.getOperand(1).getValueType().isFloatingPoint())
17118 return Cmp;
17120 // The instruction selector will select an FUCOM instruction instead of
17121 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
17122 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
17123 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
17124 SDLoc dl(Cmp);
17125 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
17126 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
17127 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
17128 DAG.getConstant(8, dl, MVT::i8));
17129 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
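// FNSTSW leaves the FPU status word in AX; the shift-right-by-8 plus the
// i8 truncate isolate the condition-code byte that SAHF copies into EFLAGS.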
17131 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
17132 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
17133 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
17134 }
17136 /// Check if replacement of SQRT with RSQRT should be disabled.
17137 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
17138 EVT VT = Op.getValueType();
17140 // We never want to use both SQRT and RSQRT instructions for the same input.
17141 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
17142 return false;
17144 if (VT.isVector())
17145 return Subtarget.hasFastVectorFSQRT();
17146 return Subtarget.hasFastScalarFSQRT();
17147 }
17149 /// The minimum architected relative accuracy is 2^-12. We need one
17150 /// Newton-Raphson step to have a good float result (24 bits of precision).
17151 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
17152 SelectionDAG &DAG, int Enabled,
17153 int &RefinementSteps,
17154 bool &UseOneConstNR,
17155 bool Reciprocal) const {
17156 EVT VT = Op.getValueType();
17158 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
17159 // TODO: Add support for AVX512 (v16f32).
17160 // It is likely not profitable to do this for f64 because a double-precision
17161 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
17162 // instructions: convert to single, rsqrtss, convert back to double, refine
17163 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
17164 // along with FMA, this could be a throughput win.
17165 // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
17166 // after legalize types.
17167 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17168 (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
17169 (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
17170 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17171 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17172 RefinementSteps = 1;
17174 UseOneConstNR = false;
17175 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
17176 }
17177 return SDValue();
17178 }
17180 /// The minimum architected relative accuracy is 2^-12. We need one
17181 /// Newton-Raphson step to have a good float result (24 bits of precision).
17182 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
17183 int Enabled,
17184 int &RefinementSteps) const {
17185 EVT VT = Op.getValueType();
17187 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
17188 // TODO: Add support for AVX512 (v16f32).
17189 // It is likely not profitable to do this for f64 because a double-precision
17190 // reciprocal estimate with refinement on x86 prior to FMA requires
17191 // 15 instructions: convert to single, rcpss, convert back to double, refine
17192 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
17193 // along with FMA, this could be a throughput win.
17195 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
17196 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
17197 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
17198 // Enable estimate codegen with 1 refinement step for vector division.
17199 // Scalar division estimates are disabled because they break too much
17200 // real-world code. These defaults are intended to match GCC behavior.
17201 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
17202 return SDValue();
17204 if (RefinementSteps == ReciprocalEstimate::Unspecified)
17205 RefinementSteps = 1;
17207 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
17208 }
17210 return SDValue();
17211 }
17212 /// If we have at least two divisions that use the same divisor, convert to
17213 /// multiplication by a reciprocal. This may need to be adjusted for a given
17214 /// CPU if a division's cost is not at least twice the cost of a multiplication.
17215 /// This is because we still need one division to calculate the reciprocal and
17216 /// then we need two multiplies by that reciprocal as replacements for the
17217 /// original divisions.
17218 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
17219 return 2;
17220 }
17222 /// Helper for creating a X86ISD::SETCC node.
17223 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
17224 SelectionDAG &DAG) {
17225 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
17226 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
17227 }
17229 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
17230 /// according to equal/not-equal condition code \p CC.
17231 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
17232 const SDLoc &dl, SelectionDAG &DAG) {
17233 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
17234 // instruction. Since the shift amount is in-range-or-undefined, we know
17235 // that doing a bittest on the i32 value is ok. We extend to i32 because
17236 // the encoding for the i16 version is larger than the i32 version.
17237 // Also promote i16 to i32 for performance / code size reasons.
17238 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
17239 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
17241 // See if we can use the 32-bit instruction instead of the 64-bit one for a
17242 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
17243 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
17244 // known to be zero.
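// (If bit 5 of BitNo is known zero then BitNo mod 64 == BitNo mod 32, so the
// 32-bit and 64-bit BT forms test the same bit.)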
17245 if (Src.getValueType() == MVT::i64 &&
17246 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
17247 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
17249 // If the operand types disagree, extend the shift amount to match. Since
17250 // BT ignores high bits (like shifts) we can use anyextend.
17251 if (Src.getValueType() != BitNo.getValueType())
17252 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
17254 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
17255 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
17256 return getSETCC(Cond, BT, dl, DAG);
17257 }
17259 /// Result of 'and' is compared against zero. Change to a BT node if possible.
17260 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
17261 const SDLoc &dl, SelectionDAG &DAG) {
17262 assert(And.getOpcode() == ISD::AND && "Expected AND node!");
17263 SDValue Op0 = And.getOperand(0);
17264 SDValue Op1 = And.getOperand(1);
17265 if (Op0.getOpcode() == ISD::TRUNCATE)
17266 Op0 = Op0.getOperand(0);
17267 if (Op1.getOpcode() == ISD::TRUNCATE)
17268 Op1 = Op1.getOperand(0);
17270 SDValue LHS, RHS;
17271 if (Op1.getOpcode() == ISD::SHL)
17272 std::swap(Op0, Op1);
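// Canonicalize so that a (shl 1, n) operand, if present, ends up in Op0.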
17273 if (Op0.getOpcode() == ISD::SHL) {
17274 if (isOneConstant(Op0.getOperand(0))) {
17275 // If we looked past a truncate, check that it's only truncating away
17276 // known zeros.
17277 unsigned BitWidth = Op0.getValueSizeInBits();
17278 unsigned AndBitWidth = And.getValueSizeInBits();
17279 if (BitWidth > AndBitWidth) {
17280 KnownBits Known;
17281 DAG.computeKnownBits(Op0, Known);
17282 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
17283 return SDValue();
17284 }
17285 LHS = Op1;
17286 RHS = Op0.getOperand(1);
17287 }
17288 } else if (Op1.getOpcode() == ISD::Constant) {
17289 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
17290 uint64_t AndRHSVal = AndRHS->getZExtValue();
17291 SDValue AndLHS = Op0;
17293 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
17294 LHS = AndLHS.getOperand(0);
17295 RHS = AndLHS.getOperand(1);
17296 }
17298 // Use BT if the immediate can't be encoded in a TEST instruction.
17299 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17300 LHS = AndLHS;
17301 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17302 }
17303 }
17305 if (LHS.getNode())
17306 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17308 return SDValue();
17309 }
17311 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17312 /// CMPs.
17313 static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17314 SDValue &Op1) {
17315 unsigned SSECC;
17316 bool Swap = false;
17318 // SSE Condition code mapping:
17319 // 0 - EQ
17320 // 1 - LT
17321 // 2 - LE
17322 // 3 - UNORD
17323 // 4 - NEQ
17324 // 5 - NLT
17325 // 6 - NLE
17326 // 7 - ORD
17327 switch (SetCCOpcode) {
17328 default: llvm_unreachable("Unexpected SETCC condition");
17329 case ISD::SETOEQ:
17330 case ISD::SETEQ: SSECC = 0; break;
17331 case ISD::SETOGT:
17332 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17333 case ISD::SETLT:
17334 case ISD::SETOLT: SSECC = 1; break;
17335 case ISD::SETOGE:
17336 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17337 case ISD::SETLE:
17338 case ISD::SETOLE: SSECC = 2; break;
17339 case ISD::SETUO: SSECC = 3; break;
17340 case ISD::SETUNE:
17341 case ISD::SETNE: SSECC = 4; break;
17342 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17343 case ISD::SETUGE: SSECC = 5; break;
17344 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17345 case ISD::SETUGT: SSECC = 6; break;
17346 case ISD::SETO: SSECC = 7; break;
17347 case ISD::SETUEQ: SSECC = 8; break;
17348 case ISD::SETONE: SSECC = 12; break;
17349 }
17350 if (Swap)
17351 std::swap(Op0, Op1);
17353 return SSECC;
17354 }
17356 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17357 /// concatenate the result back.
17358 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17359 MVT VT = Op.getSimpleValueType();
17361 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17362 "Unsupported value type for operation");
17364 unsigned NumElems = VT.getVectorNumElements();
17365 SDLoc dl(Op);
17366 SDValue CC = Op.getOperand(2);
17368 // Extract the LHS vectors
17369 SDValue LHS = Op.getOperand(0);
17370 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17371 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17373 // Extract the RHS vectors
17374 SDValue RHS = Op.getOperand(1);
17375 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17376 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17378 // Issue the operation on the smaller types and concatenate the result back
17379 MVT EltVT = VT.getVectorElementType();
17380 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17381 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17382 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17383 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17384 }
17386 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17387 SDValue Op0 = Op.getOperand(0);
17388 SDValue Op1 = Op.getOperand(1);
17389 SDValue CC = Op.getOperand(2);
17390 MVT VT = Op.getSimpleValueType();
17391 SDLoc dl(Op);
17393 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17394 "Unexpected type for boolean compare operation");
17395 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17396 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17397 DAG.getConstant(-1, dl, VT));
17398 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17399 DAG.getConstant(-1, dl, VT));
17400 switch (SetCCOpcode) {
17401 default: llvm_unreachable("Unexpected SETCC condition");
17402 case ISD::SETEQ:
17403 // (x == y) -> ~(x ^ y)
17404 return DAG.getNode(ISD::XOR, dl, VT,
17405 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17406 DAG.getConstant(-1, dl, VT));
17407 case ISD::SETNE:
17408 // (x != y) -> (x ^ y)
17409 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17410 case ISD::SETUGT:
17411 case ISD::SETGT:
17412 // (x > y) -> (x & ~y)
17413 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17414 case ISD::SETULT:
17415 case ISD::SETLT:
17416 // (x < y) -> (~x & y)
17417 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17418 case ISD::SETULE:
17419 case ISD::SETLE:
17420 // (x <= y) -> (~x | y)
17421 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17422 case ISD::SETUGE:
17423 case ISD::SETGE:
17424 // (x >= y) -> (x | ~y)
17425 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17426 }
17427 }
17429 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17431 SDValue Op0 = Op.getOperand(0);
17432 SDValue Op1 = Op.getOperand(1);
17433 SDValue CC = Op.getOperand(2);
17434 MVT VT = Op.getSimpleValueType();
17435 SDLoc dl(Op);
17437 assert(VT.getVectorElementType() == MVT::i1 &&
17438 "Cannot set masked compare for this operation");
17440 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17441 unsigned Opc = 0;
17442 bool Unsigned = false;
17443 bool Swap = false;
17444 unsigned SSECC = 0;
17445 switch (SetCCOpcode) {
17446 default: llvm_unreachable("Unexpected SETCC condition");
17447 case ISD::SETNE: SSECC = 4; break;
17448 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
17449 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17450 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17451 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17452 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17453 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17454 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17455 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17456 case ISD::SETLE: SSECC = 2; break;
17457 }
17459 if (Swap)
17460 std::swap(Op0, Op1);
17462 // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
17463 if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
17464 SDValue A = peekThroughBitcasts(Op0);
17465 if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
17466 ISD::isBuildVectorAllZeros(Op1.getNode())) {
17467 MVT VT0 = Op0.getSimpleValueType();
17468 SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
17469 SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
17470 return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
17471 dl, VT, RHS, LHS);
17472 }
17473 }
17475 if (Opc)
17476 return DAG.getNode(Opc, dl, VT, Op0, Op1);
17477 Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
17478 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17479 DAG.getConstant(SSECC, dl, MVT::i8));
17480 }
17482 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17483 /// operand \p Op1. If non-trivial (for example because it's not constant)
17484 /// return an empty value.
17485 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17486 SelectionDAG &DAG) {
17487 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17488 if (!BV)
17489 return SDValue();
17491 MVT VT = Op1.getSimpleValueType();
17492 MVT EVT = VT.getVectorElementType();
17493 unsigned n = VT.getVectorNumElements();
17494 SmallVector<SDValue, 8> ULTOp1;
17496 for (unsigned i = 0; i < n; ++i) {
17497 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17498 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17499 return SDValue();
17501 // Avoid underflow.
17502 APInt Val = Elt->getAPIntValue();
17503 if (Val == 0)
17504 return SDValue();
17506 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17507 }
17509 return DAG.getBuildVector(VT, dl, ULTOp1);
17510 }
17512 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17513 SelectionDAG &DAG) {
17514 SDValue Op0 = Op.getOperand(0);
17515 SDValue Op1 = Op.getOperand(1);
17516 SDValue CC = Op.getOperand(2);
17517 MVT VT = Op.getSimpleValueType();
17518 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17519 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17520 SDLoc dl(Op);
17522 if (isFP) {
17523 #ifndef NDEBUG
17524 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17525 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17526 #endif
17528 unsigned Opc;
17529 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17530 assert(VT.getVectorNumElements() <= 16);
17531 Opc = X86ISD::CMPM;
17532 } else {
17533 Opc = X86ISD::CMPP;
17534 // The SSE/AVX packed FP comparison nodes are defined with a
17535 // floating-point vector result that matches the operand type. This allows
17536 // them to work with an SSE1 target (integer vector types are not legal).
17537 VT = Op0.getSimpleValueType();
17538 }
17540 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17541 // emit two comparisons and a logic op to tie them together.
17542 SDValue Cmp;
17543 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17544 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17545 // LLVM predicate is SETUEQ or SETONE.
17546 unsigned CC0, CC1;
17547 unsigned CombineOpc;
17548 if (Cond == ISD::SETUEQ) {
17549 CC0 = 3; // UNORD
17550 CC1 = 0; // EQ
17551 CombineOpc = X86ISD::FOR;
17552 } else {
17553 assert(Cond == ISD::SETONE);
17554 CC0 = 7; // ORD
17555 CC1 = 4; // NEQ
17556 CombineOpc = X86ISD::FAND;
17557 }
17559 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17560 DAG.getConstant(CC0, dl, MVT::i8));
17561 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17562 DAG.getConstant(CC1, dl, MVT::i8));
17563 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17564 } else {
17565 // Handle all other FP comparisons here.
17566 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17567 DAG.getConstant(SSECC, dl, MVT::i8));
17568 }
17570 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17571 // result type of SETCC. The bitcast is expected to be optimized away
17572 // during combining/isel.
17573 if (Opc == X86ISD::CMPP)
17574 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17576 return Cmp;
17577 }
17579 MVT VTOp0 = Op0.getSimpleValueType();
17580 assert(VTOp0 == Op1.getSimpleValueType() &&
17581 "Expected operands with same type!");
17582 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17583 "Invalid number of packed elements for source and destination!");
17585 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17586 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17587 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17588 // legalizer first checks if the first operand of the setcc has
17589 // a legal type. If so, then it promotes the return type to that same type.
17590 // Otherwise, the return type is promoted to the 'next legal type' which,
17591 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17593 // We reach this code only if the following two conditions are met:
17594 // 1. Both return type and operand type have been promoted to wider types
17595 // by the type legalizer.
17596 // 2. The original operand type has been promoted to a 256-bit vector.
17598 // Note that condition 2. only applies for AVX targets.
17599 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17600 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17601 }
17603 // The non-AVX512 code below works under the assumption that source and
17604 // destination types are the same.
17605 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17606 "Value types for source and destination must be the same!");
17608 // Break 256-bit integer vector compare into smaller ones.
17609 if (VT.is256BitVector() && !Subtarget.hasInt256())
17610 return Lower256IntVSETCC(Op, DAG);
17612 // Operands are boolean (vectors of i1)
17613 MVT OpVT = Op1.getSimpleValueType();
17614 if (OpVT.getVectorElementType() == MVT::i1)
17615 return LowerBoolVSETCC_AVX512(Op, DAG);
17617 // The result is boolean, but operands are int/float
17618 if (VT.getVectorElementType() == MVT::i1) {
17619 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
17620 // but there is no compare instruction for i8 and i16 elements in KNL.
17621 // In this case, use an SSE compare and truncate the result.
17622 bool UseAVX512Inst =
17623 (OpVT.is512BitVector() ||
17624 OpVT.getScalarSizeInBits() >= 32 ||
17625 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17627 if (UseAVX512Inst)
17628 return LowerIntVSETCC_AVX512(Op, DAG);
17630 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17631 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17632 }
17634 // Lower using XOP integer comparisons.
17635 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17636 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17637 // Translate compare code to XOP PCOM compare mode.
17638 unsigned CmpMode = 0;
17639 switch (Cond) {
17640 default: llvm_unreachable("Unexpected SETCC condition");
17641 case ISD::SETULT:
17642 case ISD::SETLT: CmpMode = 0x00; break;
17643 case ISD::SETULE:
17644 case ISD::SETLE: CmpMode = 0x01; break;
17645 case ISD::SETUGT:
17646 case ISD::SETGT: CmpMode = 0x02; break;
17647 case ISD::SETUGE:
17648 case ISD::SETGE: CmpMode = 0x03; break;
17649 case ISD::SETEQ: CmpMode = 0x04; break;
17650 case ISD::SETNE: CmpMode = 0x05; break;
17651 }
17653 // Are we comparing unsigned or signed integers?
17654 unsigned Opc =
17655 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17657 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17658 DAG.getConstant(CmpMode, dl, MVT::i8));
17659 }
17661 // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
17662 // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
17663 if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
17664 SDValue BC0 = peekThroughBitcasts(Op0);
17665 if (BC0.getOpcode() == ISD::AND) {
17666 APInt UndefElts;
17667 SmallVector<APInt, 64> EltBits;
17668 if (getTargetConstantBitsFromNode(BC0.getOperand(1),
17669 VT.getScalarSizeInBits(), UndefElts,
17670 EltBits, false, false)) {
17671 if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
17672 Cond = ISD::SETEQ;
17673 Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
17674 }
17675 }
17676 }
17677 }
17679 // We are handling one of the integer comparisons here. Since SSE only has
17680 // GT and EQ comparisons for integer, swapping operands and multiple
17681 // operations may be required for some comparisons.
17682 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17683 : X86ISD::PCMPGT;
17684 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17685 Cond == ISD::SETGE || Cond == ISD::SETUGE;
17686 bool Invert = Cond == ISD::SETNE ||
17687 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17689 // If both operands are known non-negative, then an unsigned compare is the
17690 // same as a signed compare and there's no need to flip signbits.
17691 // TODO: We could check for more general simplifications here since we're
17692 // computing known bits.
17693 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17694 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17696 // Special case: Use min/max operations for SETULE/SETUGE
17697 MVT VET = VT.getVectorElementType();
17698 bool HasMinMax =
17699 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
17700 (Subtarget.hasSSE2() && (VET == MVT::i8));
17701 bool MinMax = false;
17702 if (HasMinMax) {
17703 switch (Cond) {
17704 default: break;
17705 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17706 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17707 }
17709 if (MinMax)
17710 Swap = Invert = FlipSigns = false;
17711 }
17713 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17714 bool Subus = false;
17715 if (!MinMax && HasSubus) {
17716 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17717 // Op0 u<= Op1:
17718 // t = psubus Op0, Op1
17719 // pcmpeq t, <0..0>
17720 switch (Cond) {
17721 default: break;
17722 case ISD::SETULT: {
17723 // If the comparison is against a constant we can turn this into a
17724 // setule. With psubus, setule does not require a swap. This is
17725 // beneficial because the constant in the register is no longer
17726 // clobbered as the destination, so it can be hoisted out of a loop.
17727 // Only do this pre-AVX since vpcmp* is no longer destructive.
17728 if (Subtarget.hasAVX())
17729 break;
17730 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17731 Op1 = ULEOp1;
17732 Subus = true; Invert = false; Swap = false;
17733 }
17734 break;
17735 }
17736 // Psubus is better than flip-sign because it requires no inversion.
17737 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17738 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17739 }
17741 if (Subus) {
17742 Opc = X86ISD::SUBUS;
17743 FlipSigns = false;
17744 }
17745 }
17747 if (Swap)
17748 std::swap(Op0, Op1);
17750 // Check that the operation in question is available (most are plain SSE2,
17751 // but PCMPGTQ and PCMPEQQ have different requirements).
17752 if (VT == MVT::v2i64) {
17753 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17754 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17756 // First cast everything to the right type.
17757 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17758 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17760 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17761 // bits of the inputs before performing those operations. The lower
17762 // compare is always unsigned.
17763 SDValue SB;
17764 if (FlipSigns) {
17765 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17766 } else {
17767 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17768 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17769 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17770 }
17771 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17772 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17774 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
17775 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17776 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17778 // Create masks for only the low parts/high parts of the 64 bit integers.
17779 static const int MaskHi[] = { 1, 1, 3, 3 };
17780 static const int MaskLo[] = { 0, 0, 2, 2 };
17781 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17782 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17783 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17785 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17786 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17788 if (Invert)
17789 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17791 return DAG.getBitcast(VT, Result);
17792 }
17794 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17795 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17796 // pcmpeqd + pshufd + pand.
17797 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17799 // First cast everything to the right type.
17800 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17801 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17803 // Do the compare.
17804 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17806 // Make sure the lower and upper halves are both all-ones.
17807 static const int Mask[] = { 1, 0, 3, 2 };
17808 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17809 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17811 if (Invert)
17812 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17814 return DAG.getBitcast(VT, Result);
17815 }
17816 }
17818 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17819 // bits of the inputs before performing those operations.
17820 if (FlipSigns) {
17821 MVT EltVT = VT.getVectorElementType();
17822 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17823 VT);
17824 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17825 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17826 }
17828 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17830 // If the logical-not of the result is required, perform that now.
17831 if (Invert)
17832 Result = DAG.getNOT(dl, Result, VT);
17834 if (MinMax)
17835 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17837 if (Subus)
17838 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17839 getZeroVector(VT, Subtarget, DAG, dl));
17841 return Result;
17842 }
17844 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17846 MVT VT = Op.getSimpleValueType();
17848 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17850 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17851 SDValue Op0 = Op.getOperand(0);
17852 SDValue Op1 = Op.getOperand(1);
17853 SDLoc dl(Op);
17854 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17856 // Optimize to BT if possible.
17857 // Lower (X & (1 << N)) == 0 to BT(X, N).
17858 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17859 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17860 if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
17861 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17862 if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
17863 return NewSetCC;
17864 }
17866 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17867 // these.
17868 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17869 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17871 // If the input is a setcc, then reuse the input setcc or use a new one with
17872 // the inverted condition.
17873 if (Op0.getOpcode() == X86ISD::SETCC) {
17874 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17875 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17876 if (!Invert)
17877 return Op0;
17879 CCode = X86::GetOppositeBranchCondition(CCode);
17880 return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17881 }
17882 }
17884 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17885 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17886 if (X86CC == X86::COND_INVALID)
17887 return SDValue();
17889 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17890 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17891 return getSETCC(X86CC, EFLAGS, dl, DAG);
17892 }
17894 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17895 SDValue LHS = Op.getOperand(0);
17896 SDValue RHS = Op.getOperand(1);
17897 SDValue Carry = Op.getOperand(2);
17898 SDValue Cond = Op.getOperand(3);
17899 SDLoc DL(Op);
17901 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17902 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17904 // Recreate the carry if needed.
17905 EVT CarryVT = Carry.getValueType();
17906 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17907 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17908 Carry, DAG.getConstant(NegOne, DL, CarryVT));
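// Adding all-ones to the materialized carry value produces a carry-out
// exactly when that value is nonzero, recreating CF for the SBB below.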
17910 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17911 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17912 return getSETCC(CC, Cmp.getValue(1), DL, DAG);
17913 }
17915 /// Return true if opcode is a X86 logical comparison.
17916 static bool isX86LogicalCmp(SDValue Op) {
17917 unsigned Opc = Op.getOpcode();
17918 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17919 Opc == X86ISD::SAHF)
17920 return true;
17921 if (Op.getResNo() == 1 &&
17922 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17923 Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
17924 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17925 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17926 return true;
17928 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17929 return true;
17931 return false;
17932 }
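/// Return true if V is a truncation whose discarded high bits are known zero,
/// so zero-testing the narrow value is equivalent to zero-testing the wide one.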
17934 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17935 if (V.getOpcode() != ISD::TRUNCATE)
17936 return false;
17938 SDValue VOp0 = V.getOperand(0);
17939 unsigned InBits = VOp0.getValueSizeInBits();
17940 unsigned Bits = V.getValueSizeInBits();
17941 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17942 }
17944 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17945 bool AddTest = true;
17946 SDValue Cond = Op.getOperand(0);
17947 SDValue Op1 = Op.getOperand(1);
17948 SDValue Op2 = Op.getOperand(2);
17949 SDLoc DL(Op);
17950 MVT VT = Op1.getSimpleValueType();
17951 SDValue CC;
17953 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17954 // are available or VBLENDV if AVX is available.
17955 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17956 if (Cond.getOpcode() == ISD::SETCC &&
17957 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17958 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17959 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17960 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17961 unsigned SSECC = translateX86FSETCC(
17962 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17964 if (Subtarget.hasAVX512()) {
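// Compare into a single-element mask and select via a masked move (SELECTS);
// no CMP/AND/ANDN/OR sequence is needed on AVX-512.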
17965 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17966 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17967 assert(!VT.isVector() && "Not a scalar type?");
17968 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17969 }
17971 if (SSECC < 8 || Subtarget.hasAVX()) {
17972 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17973 DAG.getConstant(SSECC, DL, MVT::i8));
17975 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17976 // of 3 logic instructions for size savings and potentially speed.
17977 // Unfortunately, there is no scalar form of VBLENDV.
17979 // If either operand is a constant, don't try this. We can expect to
17980 // optimize away at least one of the logic instructions later in that
17981 // case, so that sequence would be faster than a variable blend.
17983 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17984 // uses XMM0 as the selection register. That may need just as many
17985 // instructions as the AND/ANDN/OR sequence due to register moves, so
17986 // don't bother.
17988 if (Subtarget.hasAVX() &&
17989 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17991 // Convert to vectors, do a VSELECT, and convert back to scalar.
17992 // All of the conversions should be optimized away.
17994 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17995 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17996 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17997 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17999 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
18000 VCmp = DAG.getBitcast(VCmpVT, VCmp);
18002 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
18004 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
18005 VSel, DAG.getIntPtrConstant(0, DL));
18006 }
18007 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
18008 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
18009 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
18010 }
18011 }
18013 // AVX512 fallback is to lower selects of scalar floats to masked moves.
18014 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
18015 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
18016 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
18017 }
18019 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
18020 SDValue Op1Scalar;
18021 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
18022 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
18023 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
18024 Op1Scalar = Op1.getOperand(0);
18025 SDValue Op2Scalar;
18026 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
18027 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
18028 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
18029 Op2Scalar = Op2.getOperand(0);
18030 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
18031 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
18032 Op1Scalar, Op2Scalar);
18033 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
18034 return DAG.getBitcast(VT, newSelect);
18035 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
18036 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
18037 DAG.getIntPtrConstant(0, DL));
18038 }
18039 }
18041 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
18042 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
18043 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18044 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
18045 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
18046 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
18047 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
18048 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
18049 }
18051 if (Cond.getOpcode() == ISD::SETCC) {
18052 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
18053 Cond = NewCond;
18054 // If the condition was updated, it's possible that the operands of the
18055 // select were also updated (for example, EmitTest has a RAUW). Refresh
18056 // the local references to the select operands in case they got stale.
18057 Op1 = Op.getOperand(1);
18058 Op2 = Op.getOperand(2);
18059 }
18060 }
18062 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
18063 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
18064 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
18065 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
18066 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
18067 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
18068 if (Cond.getOpcode() == X86ISD::SETCC &&
18069 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
18070 isNullConstant(Cond.getOperand(1).getOperand(1))) {
18071 SDValue Cmp = Cond.getOperand(1);
18072 unsigned CondCode =
18073 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
18075 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18076 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
18077 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
18078 SDValue CmpOp0 = Cmp.getOperand(0);
18080 // Apply further optimizations for special cases
18081 // (select (x != 0), -1, 0) -> neg & sbb
18082 // (select (x == 0), 0, -1) -> neg & sbb
18083 if (isNullConstant(Y) &&
18084 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
18085 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
18086 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
18087 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
18088 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18089 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18090 SDValue(Neg.getNode(), 1));
18091 return Res;
18092 }
18094 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
18095 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
18096 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18098 SDValue Res = // Res = 0 or -1.
18099 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18100 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
18102 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
18103 Res = DAG.getNOT(DL, Res, Res.getValueType());
18105 if (!isNullConstant(Op2))
18106 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
18107 return Res;
18108 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
18109 Cmp.getOperand(0).getOpcode() == ISD::AND &&
18110 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
18111 SDValue CmpOp0 = Cmp.getOperand(0);
18112 SDValue Src1, Src2;
18113 // True if Op2 is an XOR or OR operator and one of its operands
18114 // is equal to Op1, e.g.
18115 // ( a , a op b) || ( b , a op b)
18116 auto isOrXorPattern = [&]() {
18117 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
18118 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
18119 Src1 =
18120 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
18121 Src2 = Op1;
18122 return true;
18123 }
18124 return false;
18125 };
18127 if (isOrXorPattern()) {
18128 SDValue Neg;
18129 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
18130 // We need a mask of all zeros or all ones, of the same size as the other
18131 // operands.
18132 if (CmpSz > VT.getSizeInBits())
18133 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
18134 else if (CmpSz < VT.getSizeInBits())
18135 Neg = DAG.getNode(ISD::AND, DL, VT,
18136 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
18137 DAG.getConstant(1, DL, VT));
18138 else
18139 Neg = CmpOp0;
18140 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
18141 Neg); // -(and (x, 0x1))
18142 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
18143 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
18144 }
18145 }
18146 }
18148 // Look past (and (setcc_carry (cmp ...)), 1).
18149 if (Cond.getOpcode() == ISD::AND &&
18150 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18151 isOneConstant(Cond.getOperand(1)))
18152 Cond = Cond.getOperand(0);
18154 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18155 // setting operand in place of the X86ISD::SETCC.
18156 unsigned CondOpcode = Cond.getOpcode();
18157 if (CondOpcode == X86ISD::SETCC ||
18158 CondOpcode == X86ISD::SETCC_CARRY) {
18159 CC = Cond.getOperand(0);
18161 SDValue Cmp = Cond.getOperand(1);
18162 unsigned Opc = Cmp.getOpcode();
18163 MVT VT = Op.getSimpleValueType();
18165 bool IllegalFPCMov = false;
18166 if (VT.isFloatingPoint() && !VT.isVector() &&
18167 !isScalarFPTypeInSSEReg(VT)) // FPStack?
18168 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
18170 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
18171 Opc == X86ISD::BT) { // FIXME
18172 Cond = Cmp;
18173 AddTest = false;
18174 }
18175 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18176 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18177 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18178 Cond.getOperand(0).getValueType() != MVT::i8)) {
18179 SDValue LHS = Cond.getOperand(0);
18180 SDValue RHS = Cond.getOperand(1);
18181 unsigned X86Opcode;
18182 X86::CondCode X86Cond;
18183 SDVTList VTs;
18184 switch (CondOpcode) {
18185 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18186 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18187 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18188 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18189 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18190 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18191 default: llvm_unreachable("unexpected overflowing operator");
18192 }
18193 if (CondOpcode == ISD::UMULO)
18194 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18195 MVT::i32);
18196 else
18197 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18199 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
18201 if (CondOpcode == ISD::UMULO)
18202 Cond = X86Op.getValue(2);
18203 else
18204 Cond = X86Op.getValue(1);
18206 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
18207 AddTest = false;
18208 }
18210 if (AddTest) {
18211 // Look past the truncate if the high bits are known zero.
18212 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18213 Cond = Cond.getOperand(0);
18215 // We know the result of AND is compared against zero. Try to match
18216 // it to BT.
18217 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
18218 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
18219 CC = NewSetCC.getOperand(0);
18220 Cond = NewSetCC.getOperand(1);
18221 AddTest = false;
18222 }
18223 }
18224 }
18226 if (AddTest) {
18227 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
18228 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
18229 }
18231 // a < b ? -1 : 0 -> RES = ~setcc_carry
18232 // a < b ? 0 : -1 -> RES = setcc_carry
18233 // a >= b ? -1 : 0 -> RES = setcc_carry
18234 // a >= b ? 0 : -1 -> RES = ~setcc_carry
18235 if (Cond.getOpcode() == X86ISD::SUB) {
18236 Cond = ConvertCmpIfNecessary(Cond, DAG);
18237 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
18239 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
18240 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
18241 (isNullConstant(Op1) || isNullConstant(Op2))) {
18242 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
18243 DAG.getConstant(X86::COND_B, DL, MVT::i8),
18244 Cond);
18245 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
18246 return DAG.getNOT(DL, Res, Res.getValueType());
18247 return Res;
18248 }
18249 }
18251 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
18252 // widen the cmov and push the truncate through. This avoids introducing a new
18253 // branch during isel and doesn't add any extensions.
18254 if (Op.getValueType() == MVT::i8 &&
18255 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
18256 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
18257 if (T1.getValueType() == T2.getValueType() &&
18258 // Blacklist CopyFromReg to avoid partial register stalls.
18259 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
18260 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
18261 CC, Cond);
18262 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
18263 }
18264 }
18266 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
18267 // condition is true.
18268 SDValue Ops[] = { Op2, Op1, CC, Cond };
18269 return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
18270 }
18272 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
18273 const X86Subtarget &Subtarget,
18274 SelectionDAG &DAG) {
18275 MVT VT = Op->getSimpleValueType(0);
18276 SDValue In = Op->getOperand(0);
18277 MVT InVT = In.getSimpleValueType();
18278 MVT VTElt = VT.getVectorElementType();
18279 MVT InVTElt = InVT.getVectorElementType();
18280 SDLoc dl(Op);
18282 // SKX processor
18283 if ((InVTElt == MVT::i1) &&
18284 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
18286 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
18288 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18290 unsigned NumElts = VT.getVectorNumElements();
18292 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
18293 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
18294 if (In.getOpcode() == X86ISD::VSEXT)
18295 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
18296 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
18297 }
18299 if (InVTElt != MVT::i1)
18300 return SDValue();
18302 MVT ExtVT = VT;
18303 if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
18304 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
18305 } else if (VTElt == MVT::i16 || VTElt == MVT::i8) {
18306 // If we don't have BWI support we need to extend 8/16-bit to 32-bit.
18307 // Otherwise we end up with vselects we can't handle.
18308 ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
18309 }
18311 SDValue V;
18312 if (Subtarget.hasDQI()) {
18313 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
18314 assert(!VT.is512BitVector() && "Unexpected vector type");
18315 } else {
18316 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
18317 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
18318 V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
18319 if (ExtVT == VT)
18320 return V;
18321 }
18323 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
18324 }
18326 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18327 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18328 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18329 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18330 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18331 const X86Subtarget &Subtarget,
18332 SelectionDAG &DAG) {
18333 SDValue In = Op->getOperand(0);
18334 MVT VT = Op->getSimpleValueType(0);
18335 MVT InVT = In.getSimpleValueType();
18336 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18338 MVT SVT = VT.getVectorElementType();
18339 MVT InSVT = InVT.getVectorElementType();
18340 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18342 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18343 return SDValue();
18344 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18345 return SDValue();
18346 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18347 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18348 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18349 return SDValue();
18351 SDLoc dl(Op);
18353 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18354 // For 512-bit vectors, we need 128-bits or 256-bits.
18355 if (VT.getSizeInBits() > 128) {
18356 // Input needs to be at least the same number of elements as output, and
18357 // at least 128-bits.
18358 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18359 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18360 InVT = In.getSimpleValueType();
18361 }
18362 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18363 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18365 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18366 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18367 // need to be handled here for 256/512-bit results.
18368 if (Subtarget.hasInt256()) {
18369 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18370 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18371 X86ISD::VSEXT : X86ISD::VZEXT;
18372 return DAG.getNode(ExtOpc, dl, VT, In);
18373 }
18375 // We should only get here for sign extend.
18376 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18377 "Unexpected opcode!");
18379 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18381 SDValue Curr = In;
18382 MVT CurrVT = InVT;
18383 // As SRAI is only available on i16/i32 types, we expand only up to i32
18384 // and handle i64 separately.
18385 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18386 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18387 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18388 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18389 Curr = DAG.getBitcast(CurrVT, Curr);
18390 }
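// Each original element now sits in the high half of a widened lane, so an
// arithmetic shift right will sign-extend it in place.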
18392 SDValue SignExt = Curr;
18393 if (CurrVT != InVT) {
18394 unsigned SignExtShift =
18395 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18396 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18397 DAG.getConstant(SignExtShift, dl, MVT::i8));
18398 }
18400 if (CurrVT == VT)
18401 return SignExt;
18403 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18404 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18405 DAG.getConstant(31, dl, MVT::i8));
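// Interleave each 32-bit result with its sign word ({0, 4, 1, 5}) to build
// the sign-extended 64-bit elements.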
18406 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18407 return DAG.getBitcast(VT, Ext);
18408 }
18410 return SDValue();
18411 }
18413 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18414 SelectionDAG &DAG) {
18415 MVT VT = Op->getSimpleValueType(0);
18416 SDValue In = Op->getOperand(0);
18417 MVT InVT = In.getSimpleValueType();
18418 SDLoc dl(Op);
18420 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
18421 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
18423 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18424 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18425 (VT != MVT::v16i16 || InVT != MVT::v16i8))
18426 return SDValue();
18428 if (Subtarget.hasInt256())
18429 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18431 // Optimize vectors in AVX mode
18432 // Sign extend v8i16 to v8i32 and
18433 // v4i32 to v4i64
18434 //
18435 // Divide input vector into two parts
18436 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18437 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18438 // concat the vectors to original VT
18440 unsigned NumElems = InVT.getVectorNumElements();
18441 SDValue Undef = DAG.getUNDEF(InVT);
18443 SmallVector<int,8> ShufMask1(NumElems, -1);
18444 for (unsigned i = 0; i != NumElems/2; ++i)
18445 ShufMask1[i] = i;
18447 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18449 SmallVector<int,8> ShufMask2(NumElems, -1);
18450 for (unsigned i = 0; i != NumElems/2; ++i)
18451 ShufMask2[i] = i + NumElems/2;
18453 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18455 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18456 VT.getVectorNumElements() / 2);
18458 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18459 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18461 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18464 // Lower a truncating store. We need special lowering for vXi1 vectors.
18465 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18466 SelectionDAG &DAG) {
18467 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18469 EVT MemVT = St->getMemoryVT();
18470 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
18471 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18472 "Expected truncstore of i1 vector");
18474 SDValue Op = St->getValue();
18475 MVT OpVT = Op.getValueType().getSimpleVT();
18476 unsigned NumElts = OpVT.getVectorNumElements();
18477 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18479 // Truncate and store - everything is legal
18480 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18481 if (MemVT.getSizeInBits() < 8)
18482 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18483 DAG.getUNDEF(MVT::v8i1), Op,
18484 DAG.getIntPtrConstant(0, dl));
18485 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18486 St->getMemOperand());
18489 // Only a subset of AVX-512 is available; assume that we have only AVX-512F
18490 if (NumElts <= 8) {
18492 // Extend to an 8-element vector
18493 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18494 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18495 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18497 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18498 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18499 St->getMemOperand());
18502 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18503 // Divide the vector into 2 parts and store each part separately
18504 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18505 DAG.getIntPtrConstant(0, dl));
18506 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18507 SDValue BasePtr = St->getBasePtr();
18508 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18509 St->getMemOperand());
18510 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18511 DAG.getIntPtrConstant(16, dl));
18512 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
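// Each v16i1 half occupies 16 mask bits == 2 bytes in memory, so the high
// half is stored at BasePtr + 2 below.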
18514 SDValue BasePtrHi =
18515 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18516 DAG.getConstant(2, dl, BasePtr.getValueType()));
18518 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18519 BasePtrHi, St->getMemOperand());
18520 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18523 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18524 const X86Subtarget &Subtarget,
18525 SelectionDAG &DAG) {
18527 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18529 EVT MemVT = Ld->getMemoryVT();
18530 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18531 "Expected i1 vector load");
18532 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18533 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18534 MVT VT = Op.getValueType().getSimpleVT();
18535 unsigned NumElts = VT.getVectorNumElements();
18537 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18538 (Subtarget.hasDQI() && NumElts < 16) ||
18540 // Load and extend - everything is legal
18542 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18544 Ld->getMemOperand());
18545 // Replace chain users with the new chain.
18546 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18547 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18548 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18549 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18551 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18552 DAG.getIntPtrConstant(0, dl));
18554 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18556 Ld->getMemOperand());
18557 // Replace chain users with the new chain.
18558 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18559 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18561 // Finally, do a normal sign-extend to the desired register.
18562 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18565 if (NumElts <= 8) {
18566 // Only a subset of AVX-512 is available; assume that we have only AVX-512F
18567 unsigned NumBitsToLoad = 8;
18568 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18569 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18571 Ld->getMemOperand());
18572 // Replace chain users with the new chain.
18573 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18574 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18576 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18577 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18580 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18582 // We should take care of v4i1 and v2i1 here:
18584 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18585 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18586 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18587 DAG.getIntPtrConstant(0, dl));
18590 assert(VT == MVT::v32i8 && "Unexpected extload type");
18592 SmallVector<SDValue, 2> Chains;
18594 SDValue BasePtr = Ld->getBasePtr();
18595 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18597 Ld->getMemOperand());
18598 Chains.push_back(LoadLo.getValue(1));
18600 SDValue BasePtrHi =
18601 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18602 DAG.getConstant(2, dl, BasePtr.getValueType()));
18604 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18606 Ld->getMemOperand());
18607 Chains.push_back(LoadHi.getValue(1));
18608 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18609 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18611 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18612 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18613 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18616 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18617 // may emit an illegal shuffle but the expansion is still better than scalar
18618 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18619 // we'll emit a shuffle and an arithmetic shift.
18620 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18621 // TODO: It is possible to support ZExt by zeroing the undef values during
18622 // the shuffle phase or after the shuffle.
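// E.g. a sextload of <4 x i8> to <4 x i32> becomes a single scalar i32 load
// plus pmovsxbd on SSE4.1, or (pre-SSE4.1) unpack shuffles followed by
// psrad $24, instead of four scalar loads, extends and vector inserts.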
18623 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18624 SelectionDAG &DAG) {
18625 MVT RegVT = Op.getSimpleValueType();
18626 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18627 assert(RegVT.isInteger() &&
18628 "We only custom lower integer vector sext loads.");
18630 // Nothing useful we can do without SSE2 shuffles.
18631 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18633 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18635 EVT MemVT = Ld->getMemoryVT();
18636 if (MemVT.getScalarType() == MVT::i1)
18637 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18640 unsigned RegSz = RegVT.getSizeInBits();
18642 ISD::LoadExtType Ext = Ld->getExtensionType();
18644 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18645 && "Only anyext and sext are currently implemented.");
18646 assert(MemVT != RegVT && "Cannot extend to the same type");
18647 assert(MemVT.isVector() && "Must load a vector from memory");
18649 unsigned NumElems = RegVT.getVectorNumElements();
18650 unsigned MemSz = MemVT.getSizeInBits();
18651 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18653 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18654 // The only way in which we have a legal 256-bit vector result but not the
18655 // integer 256-bit operations needed to directly lower a sextload is if we
18656 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18657 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18658 // correctly legalized. We do this late to allow the canonical form of
18659 // sextload to persist throughout the rest of the DAG combiner -- it wants
18660 // to fold together any extensions it can, and so will fuse a sign_extend
18661 // of an sextload into a sextload targeting a wider value.
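// E.g. with RegVT = v8i32 and MemVT = v8i8 we emit a sextload to v8i16 (a
// 128-bit type AVX1 handles natively) followed by a plain sign_extend to
// v8i32 that is then lowered by splitting into two 128-bit halves.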
18663 if (MemSz == 128) {
18664 // Just switch this to a normal load.
18665 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18666 "it must be a legal 128-bit vector "
18668 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18669 Ld->getPointerInfo(), Ld->getAlignment(),
18670 Ld->getMemOperand()->getFlags());
18672 assert(MemSz < 128 &&
18673 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18674 // Do an sext load to a 128-bit vector type. We want to use the same
18675 // number of elements, but elements half as wide. This will end up being
18676 // recursively lowered by this routine, but will succeed as we definitely
18677 // have all the necessary features if we're using AVX1.
18679 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18680 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18682 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18683 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18684 Ld->getMemOperand()->getFlags());
18687 // Replace chain users with the new chain.
18688 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18689 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18691 // Finally, do a normal sign-extend to the desired register.
18692 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18695 // All sizes must be a power of two.
18696 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18697 "Non-power-of-two elements are not custom lowered!");
18699 // Attempt to load the original value using scalar loads.
18700 // Find the largest scalar type that divides the total loaded size.
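// E.g. for MemSz == 64 this picks i64 on 64-bit targets (a single load) and
// i32 on 32-bit targets (two loads), unless the f64 fallback below applies.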
18701 MVT SclrLoadTy = MVT::i8;
18702 for (MVT Tp : MVT::integer_valuetypes()) {
18703 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18708 // On 32-bit systems we can't use 64-bit integer loads directly. Try bitcasting to f64.
18709 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18711 SclrLoadTy = MVT::f64;
18713 // Calculate the number of scalar loads that we need to perform
18714 // in order to load our vector from memory.
18715 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18717 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18718 "Can only lower sext loads with a single scalar load!");
18720 unsigned loadRegSize = RegSz;
18721 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18724 // If we don't have BWI we won't be able to create the shuffle needed for v8i8->v8i64.
18726 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18727 MemVT == MVT::v8i8)
18730 // Represent our vector as a sequence of elements of the
18731 // largest scalar type that we can load.
18732 EVT LoadUnitVecVT = EVT::getVectorVT(
18733 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
18735 // Represent the data using the same element type that is stored in
18736 // memory. In practice, we "widen" MemVT.
18738 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18739 loadRegSize / MemVT.getScalarSizeInBits());
18741 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18742 "Invalid vector type");
18744 // We can't shuffle using an illegal type.
18745 assert(TLI.isTypeLegal(WideVecVT) &&
18746 "We only lower types that form legal widened vector types");
18748 SmallVector<SDValue, 8> Chains;
18749 SDValue Ptr = Ld->getBasePtr();
18750 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18751 TLI.getPointerTy(DAG.getDataLayout()));
18752 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18754 for (unsigned i = 0; i < NumLoads; ++i) {
18755 // Perform a single load.
18756 SDValue ScalarLoad =
18757 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18758 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18759 Chains.push_back(ScalarLoad.getValue(1));
18760 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18761 // another round of DAGCombining.
18763 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18765 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18766 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18768 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18771 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18773 // Bitcast the loaded value to a vector of the original element type, in
18774 // the size of the target vector type.
18775 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18776 unsigned SizeRatio = RegSz / MemSz;
18778 if (Ext == ISD::SEXTLOAD) {
18779 // If we have SSE4.1, we can directly emit a VSEXT node.
18780 if (Subtarget.hasSSE41()) {
18781 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18782 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18786 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
18788 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18789 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18791 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18792 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18796 if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
18797 MemVT == MVT::v8i8) {
18798 SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
18799 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18803 // Redistribute the loaded elements into the different locations.
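// E.g. with NumElems == 4 and SizeRatio == 4 the mask becomes
// {0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3, -1, -1, -1}, placing each
// loaded element at the bottom of its widened lane.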
18804 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18805 for (unsigned i = 0; i != NumElems; ++i)
18806 ShuffleVec[i * SizeRatio] = i;
18808 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18809 DAG.getUNDEF(WideVecVT), ShuffleVec);
18811 // Bitcast to the requested type.
18812 Shuff = DAG.getBitcast(RegVT, Shuff);
18813 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18817 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
18818 /// each of which has no other use apart from the AND / OR.
18819 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18820 Opc = Op.getOpcode();
18821 if (Opc != ISD::OR && Opc != ISD::AND)
18823 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18824 Op.getOperand(0).hasOneUse() &&
18825 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18826 Op.getOperand(1).hasOneUse());
18829 /// Return true if node is an ISD::XOR of an X86ISD::SETCC and 1 and that the
18830 /// SETCC node has a single use.
18831 static bool isXor1OfSetCC(SDValue Op) {
18832 if (Op.getOpcode() != ISD::XOR)
18834 if (isOneConstant(Op.getOperand(1)))
18835 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18836 Op.getOperand(0).hasOneUse();
18840 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18841 bool addTest = true;
18842 SDValue Chain = Op.getOperand(0);
18843 SDValue Cond = Op.getOperand(1);
18844 SDValue Dest = Op.getOperand(2);
18847 bool Inverted = false;
18849 if (Cond.getOpcode() == ISD::SETCC) {
18850 // Check for setcc([su]{add,sub,mul}o == 0).
18851 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18852 isNullConstant(Cond.getOperand(1)) &&
18853 Cond.getOperand(0).getResNo() == 1 &&
18854 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18855 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18856 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18857 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18858 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18859 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18861 Cond = Cond.getOperand(0);
18863 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18868 // FIXME: LowerXALUO doesn't handle these!!
18869 else if (Cond.getOpcode() == X86ISD::ADD ||
18870 Cond.getOpcode() == X86ISD::SUB ||
18871 Cond.getOpcode() == X86ISD::SMUL ||
18872 Cond.getOpcode() == X86ISD::UMUL)
18873 Cond = LowerXALUO(Cond, DAG);
18876 // Look past (and (setcc_carry (cmp ...)), 1).
18877 if (Cond.getOpcode() == ISD::AND &&
18878 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18879 isOneConstant(Cond.getOperand(1)))
18880 Cond = Cond.getOperand(0);
18882 // If the condition flag is set by an X86ISD::CMP, then use it as the condition
18883 // setting operand in place of the X86ISD::SETCC.
18884 unsigned CondOpcode = Cond.getOpcode();
18885 if (CondOpcode == X86ISD::SETCC ||
18886 CondOpcode == X86ISD::SETCC_CARRY) {
18887 CC = Cond.getOperand(0);
18889 SDValue Cmp = Cond.getOperand(1);
18890 unsigned Opc = Cmp.getOpcode();
18891 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18892 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18896 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18900 // These can only come from an arithmetic instruction with overflow,
18901 // e.g. SADDO, UADDO.
18902 Cond = Cond.getOperand(1);
18908 CondOpcode = Cond.getOpcode();
18909 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18910 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18911 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18912 Cond.getOperand(0).getValueType() != MVT::i8)) {
18913 SDValue LHS = Cond.getOperand(0);
18914 SDValue RHS = Cond.getOperand(1);
18915 unsigned X86Opcode;
18918 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18919 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and SUB).
18921 switch (CondOpcode) {
18922 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18924 if (isOneConstant(RHS)) {
18925 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18928 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18929 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18931 if (isOneConstant(RHS)) {
18932 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18935 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18936 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18937 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18938 default: llvm_unreachable("unexpected overflowing operator");
18941 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18942 if (CondOpcode == ISD::UMULO)
18943 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18946 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18948 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18950 if (CondOpcode == ISD::UMULO)
18951 Cond = X86Op.getValue(2);
18953 Cond = X86Op.getValue(1);
18955 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18959 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18960 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18961 if (CondOpc == ISD::OR) {
18962 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18963 // two branches instead of an explicit OR instruction with a separate test.
18965 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18966 isX86LogicalCmp(Cmp)) {
18967 CC = Cond.getOperand(0).getOperand(0);
18968 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18969 Chain, Dest, CC, Cmp);
18970 CC = Cond.getOperand(1).getOperand(0);
18974 } else { // ISD::AND
18975 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18976 // two branches instead of an explicit AND instruction with a
18977 // separate test. However, we only do this if this block doesn't
18978 // have a fall-through edge, because this requires an explicit
18979 // jmp when the condition is false.
18980 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18981 isX86LogicalCmp(Cmp) &&
18982 Op.getNode()->hasOneUse()) {
18983 X86::CondCode CCode =
18984 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18985 CCode = X86::GetOppositeBranchCondition(CCode);
18986 CC = DAG.getConstant(CCode, dl, MVT::i8);
18987 SDNode *User = *Op.getNode()->use_begin();
18988 // Look for an unconditional branch following this conditional branch.
18989 // We need this because we need to reverse the successors in order
18990 // to implement FCMP_OEQ.
18991 if (User->getOpcode() == ISD::BR) {
18992 SDValue FalseBB = User->getOperand(1);
18994 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18995 assert(NewBR == User);
18999 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19000 Chain, Dest, CC, Cmp);
19001 X86::CondCode CCode =
19002 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
19003 CCode = X86::GetOppositeBranchCondition(CCode);
19004 CC = DAG.getConstant(CCode, dl, MVT::i8);
19010 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
19011 // Recognize 'xorb (setcc), 1' patterns. The xor inverts the condition.
19012 // It should be transformed during DAG combining except when the condition
19013 // is set by an arithmetic-with-overflow node.
19014 X86::CondCode CCode =
19015 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
19016 CCode = X86::GetOppositeBranchCondition(CCode);
19017 CC = DAG.getConstant(CCode, dl, MVT::i8);
19018 Cond = Cond.getOperand(0).getOperand(1);
19020 } else if (Cond.getOpcode() == ISD::SETCC &&
19021 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
19022 // For FCMP_OEQ, we can emit
19023 // two branches instead of an explicit AND instruction with a
19024 // separate test. However, we only do this if this block doesn't
19025 // have a fall-through edge, because this requires an explicit
19026 // jmp when the condition is false.
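// E.g. 'br (setoeq x, y), %bbTrue, %bbFalse' can roughly become:
//   ucomiss %xmm1, %xmm0
//   jne %bbFalse    ; ordered and not equal
//   jp  %bbFalse    ; unordered
//   jmp %bbTrue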
19027 if (Op.getNode()->hasOneUse()) {
19028 SDNode *User = *Op.getNode()->use_begin();
19029 // Look for an unconditional branch following this conditional branch.
19030 // We need this because we need to reverse the successors in order
19031 // to implement FCMP_OEQ.
19032 if (User->getOpcode() == ISD::BR) {
19033 SDValue FalseBB = User->getOperand(1);
19035 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19036 assert(NewBR == User);
19040 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19041 Cond.getOperand(0), Cond.getOperand(1));
19042 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19043 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19044 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19045 Chain, Dest, CC, Cmp);
19046 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
19051 } else if (Cond.getOpcode() == ISD::SETCC &&
19052 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
19053 // For FCMP_UNE, we can emit
19054 // two branches instead of an explicit AND instruction with a
19055 // separate test. However, we only do this if this block doesn't
19056 // have a fall-through edge, because this requires an explicit
19057 // jmp when the condition is false.
19058 if (Op.getNode()->hasOneUse()) {
19059 SDNode *User = *Op.getNode()->use_begin();
19060 // Look for an unconditional branch following this conditional branch.
19061 // We need this because we need to reverse the successors in order
19062 // to implement FCMP_UNE.
19063 if (User->getOpcode() == ISD::BR) {
19064 SDValue FalseBB = User->getOperand(1);
19066 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
19067 assert(NewBR == User);
19070 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
19071 Cond.getOperand(0), Cond.getOperand(1));
19072 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
19073 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
19074 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19075 Chain, Dest, CC, Cmp);
19076 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
19087 // Look past the truncate if the high bits are known zero.
19087 if (isTruncWithZeroHighBitsInput(Cond, DAG))
19088 Cond = Cond.getOperand(0);
19090 // We know the result of AND is compared against zero. Try to match it to BT.
19092 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
19093 if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
19094 CC = NewSetCC.getOperand(0);
19095 Cond = NewSetCC.getOperand(1);
19102 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
19103 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
19104 Cond = EmitTest(Cond, X86Cond, dl, DAG);
19106 Cond = ConvertCmpIfNecessary(Cond, DAG);
19107 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
19108 Chain, Dest, CC, Cond);
19111 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
19112 // Calls to _alloca are needed to probe the stack when allocating more than 4k
19113 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
19114 // that the guard pages used by the OS virtual memory manager are allocated in
19115 // correct sequence.
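// E.g. a single 12KB allocation must touch the stack at 4KB strides (roughly
// offsets 0x1000, 0x2000 and 0x3000 below the current stack pointer) so that
// each guard page is committed before the final stack pointer adjustment.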
19117 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
19118 SelectionDAG &DAG) const {
19119 MachineFunction &MF = DAG.getMachineFunction();
19120 bool SplitStack = MF.shouldSplitStack();
19121 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
19122 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
19123 SplitStack || EmitStackProbe;
19127 SDNode *Node = Op.getNode();
19128 SDValue Chain = Op.getOperand(0);
19129 SDValue Size = Op.getOperand(1);
19130 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
19131 EVT VT = Node->getValueType(0);
19133 // Chain the dynamic stack allocation so that it doesn't modify the stack
19134 // pointer when other instructions are using the stack.
19135 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
19137 bool Is64Bit = Subtarget.is64Bit();
19138 MVT SPTy = getPointerTy(DAG.getDataLayout());
19142 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19143 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
19144 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
19145 " not tell us which reg is the stack pointer!");
19147 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
19148 Chain = SP.getValue(1);
19149 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19150 unsigned StackAlign = TFI.getStackAlignment();
19151 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
19152 if (Align > StackAlign)
19153 Result = DAG.getNode(ISD::AND, dl, VT, Result,
19154 DAG.getConstant(-(uint64_t)Align, dl, VT));
19155 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
19156 } else if (SplitStack) {
19157 MachineRegisterInfo &MRI = MF.getRegInfo();
19160 // The 64-bit implementation of segmented stacks needs to clobber both r10
19161 // and r11. This makes it impossible to use along with nested parameters.
19162 const Function *F = MF.getFunction();
19163 for (const auto &A : F->args()) {
19164 if (A.hasNestAttr())
19165 report_fatal_error("Cannot use segmented stacks with functions that "
19166 "have nested arguments.");
19170 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
19171 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
19172 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
19173 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
19174 DAG.getRegister(Vreg, SPTy));
19176 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
19177 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
19178 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
19180 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19181 unsigned SPReg = RegInfo->getStackRegister();
19182 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
19183 Chain = SP.getValue(1);
19186 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
19187 DAG.getConstant(-(uint64_t)Align, dl, VT));
19188 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
19194 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
19195 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
19197 SDValue Ops[2] = {Result, Chain};
19198 return DAG.getMergeValues(Ops, dl);
19201 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
19202 MachineFunction &MF = DAG.getMachineFunction();
19203 auto PtrVT = getPointerTy(MF.getDataLayout());
19204 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19206 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19209 if (!Subtarget.is64Bit() ||
19210 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
19211 // vastart just stores the address of the VarArgsFrameIndex slot into the
19212 // memory location argument.
19213 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19214 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
19215 MachinePointerInfo(SV));
19219 // gp_offset (0 - 6 * 8)
19220 // fp_offset (48 - 48 + 8 * 16)
19221 // overflow_arg_area (points to parameters passed in memory).
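// For reference, the SysV x86-64 va_list being initialized here is laid out as:
//   struct __va_list_tag {
//     unsigned gp_offset;      // byte offset 0
//     unsigned fp_offset;      // byte offset 4
//     void *overflow_arg_area; // byte offset 8
//     void *reg_save_area;     // byte offset 16
//   };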
19223 SmallVector<SDValue, 8> MemOps;
19224 SDValue FIN = Op.getOperand(1);
19226 SDValue Store = DAG.getStore(
19227 Op.getOperand(0), DL,
19228 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
19229 MachinePointerInfo(SV));
19230 MemOps.push_back(Store);
19233 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
19234 Store = DAG.getStore(
19235 Op.getOperand(0), DL,
19236 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
19237 MachinePointerInfo(SV, 4));
19238 MemOps.push_back(Store);
19240 // Store ptr to overflow_arg_area
19241 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
19242 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
19244 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
19245 MemOps.push_back(Store);
19247 // Store ptr to reg_save_area.
19248 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
19249 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
19250 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
19251 Store = DAG.getStore(
19252 Op.getOperand(0), DL, RSFIN, FIN,
19253 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
19254 MemOps.push_back(Store);
19255 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
19258 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
19259 assert(Subtarget.is64Bit() &&
19260 "LowerVAARG only handles 64-bit va_arg!");
19261 assert(Op.getNumOperands() == 4);
19263 MachineFunction &MF = DAG.getMachineFunction();
19264 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
19265 // The Win64 ABI uses char* instead of a structure.
19266 return DAG.expandVAArg(Op.getNode());
19268 SDValue Chain = Op.getOperand(0);
19269 SDValue SrcPtr = Op.getOperand(1);
19270 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
19271 unsigned Align = Op.getConstantOperandVal(3);
19274 EVT ArgVT = Op.getNode()->getValueType(0);
19275 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
19276 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
19279 // Decide which area this value should be read from.
19280 // TODO: Implement the AMD64 ABI in its entirety. This simple
19281 // selection mechanism works only for the basic types.
19282 if (ArgVT == MVT::f80) {
19283 llvm_unreachable("va_arg for f80 not yet implemented");
19284 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
19285 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
19286 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
19287 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
19289 llvm_unreachable("Unhandled argument type in LowerVAARG");
19292 if (ArgMode == 2) {
19293 // Sanity Check: Make sure using fp_offset makes sense.
19294 assert(!Subtarget.useSoftFloat() &&
19295 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
19296 Subtarget.hasSSE1());
19299 // Insert VAARG_64 node into the DAG
19300 // VAARG_64 returns two values: Variable Argument Address, Chain
19301 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19302 DAG.getConstant(ArgMode, dl, MVT::i8),
19303 DAG.getConstant(Align, dl, MVT::i32)};
19304 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19305 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19306 VTs, InstOps, MVT::i64,
19307 MachinePointerInfo(SV),
19309 /*Volatile=*/false,
19311 /*WriteMem=*/true);
19312 Chain = VAARG.getValue(1);
19314 // Load the next argument and return it
19315 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19318 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19319 SelectionDAG &DAG) {
19320 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19321 // where a va_list is still an i8*.
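// That struct is 4 + 4 + 8 + 8 == 24 bytes, which is why the memcpy below
// copies a constant 24 bytes.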
19322 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19323 if (Subtarget.isCallingConvWin64(
19324 DAG.getMachineFunction().getFunction()->getCallingConv()))
19325 // Probably a Win64 va_copy.
19326 return DAG.expandVACopy(Op.getNode());
19328 SDValue Chain = Op.getOperand(0);
19329 SDValue DstPtr = Op.getOperand(1);
19330 SDValue SrcPtr = Op.getOperand(2);
19331 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19332 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19335 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19336 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19338 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19341 /// Handle vector element shifts where the shift amount is a constant.
19342 /// Takes immediate version of shift as input.
19343 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19344 SDValue SrcOp, uint64_t ShiftAmt,
19345 SelectionDAG &DAG) {
19346 MVT ElementType = VT.getVectorElementType();
19348 // Bitcast the source vector to the output type; this is mainly necessary for
19349 // vXi8/vXi64 shifts.
19350 if (VT != SrcOp.getSimpleValueType())
19351 SrcOp = DAG.getBitcast(VT, SrcOp);
19353 // Fold this packed shift into its first operand if ShiftAmt is 0.
19357 // Check for ShiftAmt >= element width
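// E.g. a logical psrlw $16 on v8i16 always produces zero, while an arithmetic
// psraw is clamped to $15 so that every bit becomes a copy of the sign bit.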
19358 if (ShiftAmt >= ElementType.getSizeInBits()) {
19359 if (Opc == X86ISD::VSRAI)
19360 ShiftAmt = ElementType.getSizeInBits() - 1;
19362 return DAG.getConstant(0, dl, VT);
19365 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19366 && "Unknown target vector shift-by-constant node");
19368 // Fold this packed vector shift into a build vector if SrcOp is a
19369 // vector of Constants or UNDEFs.
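// E.g. VSRAI <i16 -8, i16 8>, 2 folds here to the constant build_vector
// <i16 -2, i16 2> and no shift instruction is emitted.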
19370 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19371 SmallVector<SDValue, 8> Elts;
19372 unsigned NumElts = SrcOp->getNumOperands();
19373 ConstantSDNode *ND;
19376 default: llvm_unreachable("Unknown opcode!");
19377 case X86ISD::VSHLI:
19378 for (unsigned i=0; i!=NumElts; ++i) {
19379 SDValue CurrentOp = SrcOp->getOperand(i);
19380 if (CurrentOp->isUndef()) {
19381 Elts.push_back(CurrentOp);
19384 ND = cast<ConstantSDNode>(CurrentOp);
19385 const APInt &C = ND->getAPIntValue();
19386 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19389 case X86ISD::VSRLI:
19390 for (unsigned i=0; i!=NumElts; ++i) {
19391 SDValue CurrentOp = SrcOp->getOperand(i);
19392 if (CurrentOp->isUndef()) {
19393 Elts.push_back(CurrentOp);
19396 ND = cast<ConstantSDNode>(CurrentOp);
19397 const APInt &C = ND->getAPIntValue();
19398 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19401 case X86ISD::VSRAI:
19402 for (unsigned i=0; i!=NumElts; ++i) {
19403 SDValue CurrentOp = SrcOp->getOperand(i);
19404 if (CurrentOp->isUndef()) {
19405 Elts.push_back(CurrentOp);
19408 ND = cast<ConstantSDNode>(CurrentOp);
19409 const APInt &C = ND->getAPIntValue();
19410 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19415 return DAG.getBuildVector(VT, dl, Elts);
19418 return DAG.getNode(Opc, dl, VT, SrcOp,
19419 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19422 /// Handle vector element shifts where the shift amount may or may not be a
19423 /// constant. Takes immediate version of shift as input.
19424 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19425 SDValue SrcOp, SDValue ShAmt,
19426 const X86Subtarget &Subtarget,
19427 SelectionDAG &DAG) {
19428 MVT SVT = ShAmt.getSimpleValueType();
19429 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19431 // Catch shift-by-constant.
19432 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19433 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19434 CShAmt->getZExtValue(), DAG);
19436 // Change opcode to non-immediate version
19438 default: llvm_unreachable("Unknown target vector shift node");
19439 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19440 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19441 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19444 // Need to build a vector containing shift amount.
19445 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
19446 // +=================+============+=======================================+
19447 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19448 // +=================+============+=======================================+
19449 // | i64 | Yes, No | Use ShAmt as lowest elt |
19450 // | i32 | Yes | zero-extend in-reg |
19451 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19452 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19453 // +=================+============+=======================================+
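// E.g. with only SSE2, an i32 count becomes build_vector(ShAmt, 0, ud, ud):
// elements 0 and 1 form the low 64 bits the shift instructions read, so the
// upper two lanes may stay undefined.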
19455 if (SVT == MVT::i64)
19456 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19457 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19458 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19459 ShAmt = ShAmt.getOperand(0);
19460 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19461 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19462 } else if (Subtarget.hasSSE41() &&
19463 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19464 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19465 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19467 SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
19468 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19469 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19472 // The return type has to be a 128-bit type with the same element
19473 // type as the input type.
19474 MVT EltVT = VT.getVectorElementType();
19475 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19477 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19478 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19481 /// \brief Return Mask with the necessary casting or extending
19482 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19483 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19484 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19487 if (isAllOnesConstant(Mask))
19488 return DAG.getTargetConstant(1, dl, MaskVT);
19489 if (X86::isZeroNode(Mask))
19490 return DAG.getTargetConstant(0, dl, MaskVT);
19492 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19493 // Mask should be extended
19494 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19495 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19498 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19499 if (MaskVT == MVT::v64i1) {
19500 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19501 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
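// E.g. (hypothetical value) Mask == 0x00000000FFFFFFFF splits into
// Lo == 0xFFFFFFFF (a v32i1 of all ones) and Hi == 0 (all zeros), which are
// concatenated back into the v64i1 mask.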
19503 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19504 DAG.getConstant(0, dl, MVT::i32));
19505 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19506 DAG.getConstant(1, dl, MVT::i32));
19508 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19509 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19511 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19513 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case),
19515 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19516 return DAG.getBitcast(MaskVT,
19517 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19521 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19522 Mask.getSimpleValueType().getSizeInBits());
19523 // In the case when MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
19524 // are extracted by EXTRACT_SUBVECTOR.
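// E.g. for an i8 mask with MaskVT == v2i1: bitcast i8 -> v8i1, then the
// EXTRACT_SUBVECTOR at index 0 keeps just lanes 0 and 1.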
19525 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19526 DAG.getBitcast(BitcastVT, Mask),
19527 DAG.getIntPtrConstant(0, dl));
19531 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19532 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19533 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19534 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19535 SDValue PreservedSrc,
19536 const X86Subtarget &Subtarget,
19537 SelectionDAG &DAG) {
19538 MVT VT = Op.getSimpleValueType();
19539 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19540 unsigned OpcodeSelect = ISD::VSELECT;
19543 if (isAllOnesConstant(Mask))
19546 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19548 switch (Op.getOpcode()) {
19551 case X86ISD::CMPM_RND:
19552 case X86ISD::CMPMU:
19553 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19554 case X86ISD::VFPCLASS:
19555 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19556 case X86ISD::VTRUNC:
19557 case X86ISD::VTRUNCS:
19558 case X86ISD::VTRUNCUS:
19559 case X86ISD::CVTPS2PH:
19560 // We can't use ISD::VSELECT here because it is not always "Legal"
19561 // for the destination type. For example, vpmovqb requires only AVX512,
19562 // while a vselect that can operate on byte elements requires BWI.
19563 OpcodeSelect = X86ISD::SELECT;
19566 if (PreservedSrc.isUndef())
19567 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19568 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19571 /// \brief Creates an SDNode for a predicated scalar operation.
19572 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19573 /// The mask is coming as MVT::i8 and it should be transformed
19574 /// to MVT::v1i1 while lowering masking intrinsics.
19575 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
19576 /// the former uses "X86select" instead of "vselect". We just can't create the
19577 /// "vselect" node for a scalar instruction.
19578 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19579 SDValue PreservedSrc,
19580 const X86Subtarget &Subtarget,
19581 SelectionDAG &DAG) {
19583 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19584 if (MaskConst->getZExtValue() & 0x1)
19587 MVT VT = Op.getSimpleValueType();
19590 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19591 if (Op.getOpcode() == X86ISD::FSETCCM ||
19592 Op.getOpcode() == X86ISD::FSETCCM_RND)
19593 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19594 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19595 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19597 if (PreservedSrc.isUndef())
19598 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19599 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19602 static int getSEHRegistrationNodeSize(const Function *Fn) {
19603 if (!Fn->hasPersonalityFn())
19604 report_fatal_error(
19605 "querying registration node size for function without personality");
19606 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19607 // WinEHStatePass for the full struct definition.
19608 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19609 case EHPersonality::MSVC_X86SEH: return 24;
19610 case EHPersonality::MSVC_CXX: return 16;
19613 report_fatal_error(
19614 "can only recover FP for 32-bit MSVC EH personality functions");
19617 /// When the MSVC runtime transfers control to us, either to an outlined
19618 /// function or when returning to a parent frame after catching an exception, we
19619 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19620 /// Here's the math:
19621 /// RegNodeBase = EntryEBP - RegNodeSize
19622 /// ParentFP = RegNodeBase - ParentFrameOffset
19623 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19624 /// subtracting the offset (negative on x86) takes us back to the parent FP.
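/// A worked example with hypothetical values: EntryEBP = 0x1000 in an MSVC
/// C++ EH function (RegNodeSize = 16) with ParentFrameOffset = -0x20 gives
/// RegNodeBase = 0xff0 and ParentFP = 0xff0 - (-0x20) = 0x1010.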
19625 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19626 SDValue EntryEBP) {
19627 MachineFunction &MF = DAG.getMachineFunction();
19630 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19631 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19633 // It's possible that the parent function no longer has a personality function
19634 // if the exceptional code was optimized away, in which case we just return
19635 // the incoming EBP.
19636 if (!Fn->hasPersonalityFn())
19639 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19640 // registration, or the .set_setframe offset.
19641 MCSymbol *OffsetSym =
19642 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19643 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19644 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19645 SDValue ParentFrameOffset =
19646 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19648 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19649 // prologue to RBP in the parent function.
19650 const X86Subtarget &Subtarget =
19651 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19652 if (Subtarget.is64Bit())
19653 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19655 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19656 // RegNodeBase = EntryEBP - RegNodeSize
19657 // ParentFP = RegNodeBase - ParentFrameOffset
19658 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19659 DAG.getConstant(RegNodeSize, dl, PtrVT));
19660 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19663 SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
19664 SelectionDAG &DAG) const {
19665 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19666 auto isRoundModeCurDirection = [](SDValue Rnd) {
19667 if (!isa<ConstantSDNode>(Rnd))
19670 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19671 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19675 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19676 MVT VT = Op.getSimpleValueType();
19677 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19679 switch(IntrData->Type) {
19680 case INTR_TYPE_1OP:
19681 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19682 case INTR_TYPE_2OP:
19683 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19685 case INTR_TYPE_3OP:
19686 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19687 Op.getOperand(2), Op.getOperand(3));
19688 case INTR_TYPE_4OP:
19689 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19690 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19691 case INTR_TYPE_1OP_MASK_RM: {
19692 SDValue Src = Op.getOperand(1);
19693 SDValue PassThru = Op.getOperand(2);
19694 SDValue Mask = Op.getOperand(3);
19695 SDValue RoundingMode;
19696 // We always add rounding mode to the Node.
19697 // If the rounding mode is not specified, we add the
19698 // "current direction" mode.
19699 if (Op.getNumOperands() == 4)
19701 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19703 RoundingMode = Op.getOperand(4);
19704 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19705 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19707 Mask, PassThru, Subtarget, DAG);
19709 case INTR_TYPE_1OP_MASK: {
19710 SDValue Src = Op.getOperand(1);
19711 SDValue PassThru = Op.getOperand(2);
19712 SDValue Mask = Op.getOperand(3);
19713 // We add rounding mode to the Node when
19714 // - RM Opcode is specified and
19715 // - RM is not "current direction".
19716 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19717 if (IntrWithRoundingModeOpcode != 0) {
19718 SDValue Rnd = Op.getOperand(4);
19719 if (!isRoundModeCurDirection(Rnd)) {
19720 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19721 dl, Op.getValueType(),
19723 Mask, PassThru, Subtarget, DAG);
19726 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19727 Mask, PassThru, Subtarget, DAG);
19729 case INTR_TYPE_SCALAR_MASK: {
19730 SDValue Src1 = Op.getOperand(1);
19731 SDValue Src2 = Op.getOperand(2);
19732 SDValue passThru = Op.getOperand(3);
19733 SDValue Mask = Op.getOperand(4);
19734 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19735 // There are 2 kinds of intrinsics in this group:
19736 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
19737 // (2) With rounding mode and sae - 7 operands.
19738 bool HasRounding = IntrWithRoundingModeOpcode != 0;
19739 if (Op.getNumOperands() == (5U + HasRounding)) {
19741 SDValue Rnd = Op.getOperand(5);
19742 if (!isRoundModeCurDirection(Rnd))
19743 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19744 dl, VT, Src1, Src2, Rnd),
19745 Mask, passThru, Subtarget, DAG);
19747 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19749 Mask, passThru, Subtarget, DAG);
19752 assert(Op.getNumOperands() == (6U + HasRounding) &&
19753 "Unexpected intrinsic form");
19754 SDValue RoundingMode = Op.getOperand(5);
19756 SDValue Sae = Op.getOperand(6);
19757 if (!isRoundModeCurDirection(Sae))
19758 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19759 dl, VT, Src1, Src2,
19760 RoundingMode, Sae),
19761 Mask, passThru, Subtarget, DAG);
19763 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19764 Src2, RoundingMode),
19765 Mask, passThru, Subtarget, DAG);
19767 case INTR_TYPE_SCALAR_MASK_RM: {
19768 SDValue Src1 = Op.getOperand(1);
19769 SDValue Src2 = Op.getOperand(2);
19770 SDValue Src0 = Op.getOperand(3);
19771 SDValue Mask = Op.getOperand(4);
19772 // There are 2 kinds of intrinsics in this group:
19773 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
19774 // (2) With rounding mode and sae - 7 operands.
19775 if (Op.getNumOperands() == 6) {
19776 SDValue Sae = Op.getOperand(5);
19777 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19779 Mask, Src0, Subtarget, DAG);
19781 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19782 SDValue RoundingMode = Op.getOperand(5);
19783 SDValue Sae = Op.getOperand(6);
19784 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19785 RoundingMode, Sae),
19786 Mask, Src0, Subtarget, DAG);
19788 case INTR_TYPE_2OP_MASK:
19789 case INTR_TYPE_2OP_IMM8_MASK: {
19790 SDValue Src1 = Op.getOperand(1);
19791 SDValue Src2 = Op.getOperand(2);
19792 SDValue PassThru = Op.getOperand(3);
19793 SDValue Mask = Op.getOperand(4);
19795 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19796 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19798 // We specify 2 possible opcodes for intrinsics with rounding modes.
19799 // First, we check if the intrinsic may have a non-default rounding mode,
19800 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19801 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19802 if (IntrWithRoundingModeOpcode != 0) {
19803 SDValue Rnd = Op.getOperand(5);
19804 if (!isRoundModeCurDirection(Rnd)) {
19805 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19806 dl, Op.getValueType(),
19808 Mask, PassThru, Subtarget, DAG);
19811 // TODO: Intrinsics should have fast-math-flags to propagate.
19812 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19813 Mask, PassThru, Subtarget, DAG);
19815 case INTR_TYPE_2OP_MASK_RM: {
19816 SDValue Src1 = Op.getOperand(1);
19817 SDValue Src2 = Op.getOperand(2);
19818 SDValue PassThru = Op.getOperand(3);
19819 SDValue Mask = Op.getOperand(4);
19820 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19822 // First, we check if the intrinsic has a rounding mode (6 operands);
19823 // if not, we set the rounding mode to "current".
19825 if (Op.getNumOperands() == 6)
19826 Rnd = Op.getOperand(5);
19828 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19829 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19831 Mask, PassThru, Subtarget, DAG);
19833 case INTR_TYPE_3OP_SCALAR_MASK: {
19834 SDValue Src1 = Op.getOperand(1);
19835 SDValue Src2 = Op.getOperand(2);
19836 SDValue Src3 = Op.getOperand(3);
19837 SDValue PassThru = Op.getOperand(4);
19838 SDValue Mask = Op.getOperand(5);
19840 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19841 if (IntrWithRoundingModeOpcode != 0) {
19842 SDValue Rnd = Op.getOperand(6);
19843 if (!isRoundModeCurDirection(Rnd))
19844 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19845 dl, VT, Src1, Src2, Src3, Rnd),
19846 Mask, PassThru, Subtarget, DAG);
19848 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19850 Mask, PassThru, Subtarget, DAG);
19852 case INTR_TYPE_3OP_MASK_RM: {
19853 SDValue Src1 = Op.getOperand(1);
19854 SDValue Src2 = Op.getOperand(2);
19855 SDValue Imm = Op.getOperand(3);
19856 SDValue PassThru = Op.getOperand(4);
19857 SDValue Mask = Op.getOperand(5);
19858 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19860 // First, we check if the intrinsic has a rounding mode (7 operands);
19861 // if not, we set the rounding mode to "current".
19863 if (Op.getNumOperands() == 7)
19864 Rnd = Op.getOperand(6);
19866 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19867 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19868 Src1, Src2, Imm, Rnd),
19869 Mask, PassThru, Subtarget, DAG);
19871 case INTR_TYPE_3OP_IMM8_MASK:
19872 case INTR_TYPE_3OP_MASK: {
19873 SDValue Src1 = Op.getOperand(1);
19874 SDValue Src2 = Op.getOperand(2);
19875 SDValue Src3 = Op.getOperand(3);
19876 SDValue PassThru = Op.getOperand(4);
19877 SDValue Mask = Op.getOperand(5);
19879 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19880 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19882 // We specify 2 possible opcodes for intrinsics with rounding modes.
19883 // First, we check if the intrinsic may have a non-default rounding mode,
19884 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19885 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19886 if (IntrWithRoundingModeOpcode != 0) {
19887 SDValue Rnd = Op.getOperand(6);
19888 if (!isRoundModeCurDirection(Rnd)) {
19889 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19890 dl, Op.getValueType(),
19891 Src1, Src2, Src3, Rnd),
19892 Mask, PassThru, Subtarget, DAG);
19895 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19897 Mask, PassThru, Subtarget, DAG);
19899 case VPERM_2OP_MASK : {
19900 SDValue Src1 = Op.getOperand(1);
19901 SDValue Src2 = Op.getOperand(2);
19902 SDValue PassThru = Op.getOperand(3);
19903 SDValue Mask = Op.getOperand(4);
19905 // Swap Src1 and Src2 in the node creation
19906 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19907 Mask, PassThru, Subtarget, DAG);
19909 case VPERM_3OP_MASKZ:
19910 case VPERM_3OP_MASK:{
19911 MVT VT = Op.getSimpleValueType();
19912 // Src2 is the PassThru
19913 SDValue Src1 = Op.getOperand(1);
19914 // PassThru needs to be the same type as the destination in order
19915 // to pattern match correctly.
19916 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19917 SDValue Src3 = Op.getOperand(3);
19918 SDValue Mask = Op.getOperand(4);
19919 SDValue PassThru = SDValue();
19921 // set PassThru element
19922 if (IntrData->Type == VPERM_3OP_MASKZ)
19923 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19927 // Swap Src1 and Src2 in the node creation
19928 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19929 dl, Op.getValueType(),
19931 Mask, PassThru, Subtarget, DAG);
    case FMA_OP_MASKZ:
    case FMA_OP_MASK3:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
                                                  Op.getValueType(), Src1, Src2,
                                                  Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case IFMA_OP_MASKZ:
    case IFMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;

      // Set the PassThru element.
      if (IntrData->Type == IFMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      // Note: we need to swizzle the operands to pass the multiply operands
      // first.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src3, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set the PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add a rounding mode to the node when
      //   - the RM opcode is specified and
      //   - the RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
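    // For the mask-producing lowerings below (FPCLASS, CMP_MASK, etc.), the
    // narrow vXi1 result is widened via INSERT_SUBVECTOR into an undef vector
    // whose bit width matches the intrinsic's integer mask argument, so that
    // the final bitcast to the scalar return type is legal.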
    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MVT::i1),
                                                 Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      // (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //             (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      // (i8 (bitcast
      //   (v8i1 (insert_subvector undef,
      //           (v2i1 (and (PCMPEQM %a, %b),
      //                      (extract_subvector
      //                         (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding mode
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl, MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl, MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
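    // COMIS/UCOMIS set ZF, PF and CF like an unsigned compare, and an
    // unordered result (a NaN operand) sets all three flags. That is why the
    // SETEQ and SETNE lowerings below must also test PF.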
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 1 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 0 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
    case COMI_RM: { // Comparison intrinsics with Sae.
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
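    // For the VSHIFT intrinsics (e.g. psll.d), the hardware shifts every lane
    // by the amount held in the low 64 bits of the second operand;
    // getTargetVShiftNode builds the matching target shift node.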
    case VSHIFT:
      return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
                                 Op.getOperand(1), Op.getOperand(2), Subtarget,
                                 DAG);
    case COMPRESS_EXPAND_IN_REG: {
      SDValue Mask = Op.getOperand(3);
      SDValue DataToCompress = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      if (isAllOnesConstant(Mask)) // Return the data as is.
        return Op.getOperand(1);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              DataToCompress),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case BROADCASTM: {
      SDValue Mask = Op.getOperand(1);
      MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                    Mask.getSimpleValueType().getSizeInBits());
      Mask = DAG.getBitcast(MaskVT, Mask);
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
    }
    case KUNPCK: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits() / 2);

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      // Arguments should be swapped.
      SDValue Res = DAG.getNode(IntrData->Opc0, dl,
                                MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
                                Src2, Src1);
      return DAG.getBitcast(VT, Res);
    }
    case MASK_BINOP: {
      MVT VT = Op.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
      return DAG.getBitcast(VT, Res);
    }
    case FIXUPIMMS:
    case FIXUPIMMS_MASKZ:
    case FIXUPIMM:
    case FIXUPIMM_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Imm = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS)
                             ? Src1
                             : getZeroVector(VT, Subtarget, DAG, dl);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // modes.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
        return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
      else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                                Src1, Src2, Src3, Imm, Rnd),
                                    Mask, Passthru, Subtarget, DAG);
    }
    case CONVERT_TO_MASK: {
      MVT SrcVT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
      MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());

      SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
                                    Op.getOperand(1));
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CvtMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case ROUNDP: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
      // Clear the upper bits of the rounding immediate so that the legacy
      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(2),
                                         DAG.getConstant(0xf, dl, MVT::i32));
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), RoundingMode);
    }
    case ROUNDS: {
      assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
      // Clear the upper bits of the rounding immediate so that the legacy
      // intrinsic can't trigger the scaling behavior of VRNDSCALE.
      SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
                                         Op.getOperand(3),
                                         DAG.getConstant(0xf, dl, MVT::i32));
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
                         Op.getOperand(1), Op.getOperand(2), RoundingMode);
    }
    default:
      break;
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed
  // to return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
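  // KORTEST ors the two mask registers and sets ZF when the result is all
  // zeros and CF when it is all ones, so kortestz maps to COND_E and
  // kortestc to COND_B below.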
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert the result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }
  case Intrinsic::x86_sse42_pcmpistria128:
  case Intrinsic::x86_sse42_pcmpestria128:
  case Intrinsic::x86_sse42_pcmpistric128:
  case Intrinsic::x86_sse42_pcmpestric128:
  case Intrinsic::x86_sse42_pcmpistrio128:
  case Intrinsic::x86_sse42_pcmpestrio128:
  case Intrinsic::x86_sse42_pcmpistris128:
  case Intrinsic::x86_sse42_pcmpestris128:
  case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
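    // The PCMPISTRI/PCMPESTRI nodes produce two results: the index (result 0)
    // and EFLAGS (result 1). Each of these flag-reading intrinsics simply
    // tests one condition on result 1 and zero-extends the setcc.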
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
    SmallVector<SDValue, 5> NewOps(Op->op_begin() + 1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin() + 1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }
  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(getGlobalWrapperKind(), dl, VT,
                       DAG.getMCSymbol(S, PtrVT));
  }
  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }
  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }
  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}
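// A note on the gather/scatter/prefetch helpers below: the operands of the
// manually built machine nodes follow the usual X86 memory-reference layout
// (Base, Scale, Index, Disp, Segment) plus the mask, data and chain operands,
// and the gather helpers return both the loaded value (result 0) and the
// output chain (result 2).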
static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  EVT MaskVT = Mask.getValueType();
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If the source is undef or we know it won't be used, use a zero vector
  // to break the register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If the source is undef or we know it won't be used, use a zero vector
  // to break the register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}
/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}
static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}
static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only lives in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}
/// Emit a truncating store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
/// Emit a masked truncating store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here; we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }
  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, cast to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i8),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
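  // For ADX below, the incoming carry is rematerialized by adding -1 to the
  // carry-in operand: that X86ISD::ADD sets CF exactly when the operand was
  // nonzero, and the flag result then feeds the ADC/SBB node.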
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return just a store.
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // Return just a truncating store.
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}
SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}
SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}
SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind
    // codes simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op); // FIXME: probably not meaningful.
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                     .Case("esp", X86::ESP)
                     .Case("rsp", X86::RSP)
                     .Case("ebp", X86::EBP)
                     .Case("rbp", X86::RBP)
                     .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}
SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}
SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain   = Op.getOperand(0);
  SDValue Offset  = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}
SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after the CGBR pass has run.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we would end up referencing a virtual register
  // that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}
SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r  = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix

    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td.
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));
21377 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21379 const Function *Func =
21380 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21381 CallingConv::ID CC = Func->getCallingConv();
21386 llvm_unreachable("Unsupported calling convention");
21387 case CallingConv::C:
21388 case CallingConv::X86_StdCall: {
21389 // Pass 'nest' parameter in ECX.
21390 // Must be kept in sync with X86CallingConv.td
21391 NestReg = X86::ECX;
21393 // Check that ECX wasn't needed by an 'inreg' parameter.
21394 FunctionType *FTy = Func->getFunctionType();
21395 const AttributeList &Attrs = Func->getAttributes();
21397 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21398 unsigned InRegCount = 0;
21401 for (FunctionType::param_iterator I = FTy->param_begin(),
21402 E = FTy->param_end(); I != E; ++I, ++Idx)
21403 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21404 auto &DL = DAG.getDataLayout();
21405 // FIXME: should only count parameters that are lowered to integers.
21406 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21409 if (InRegCount > 2) {
21410 report_fatal_error("Nest register in use - reduce number of inreg"
21416 case CallingConv::X86_FastCall:
21417 case CallingConv::X86_ThisCall:
21418 case CallingConv::Fast:
21419 // Pass 'nest' parameter in EAX.
21420 // Must be kept in sync with X86CallingConv.td
21421 NestReg = X86::EAX;
21425 SDValue OutChains[4];
21426 SDValue Addr, Disp;
21428 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21429 DAG.getConstant(10, dl, MVT::i32));
21430 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
21432 // This is storing the opcode for MOV32ri.
21433 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21434 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
21436 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21437 Trmp, MachinePointerInfo(TrmpAddr));
21439 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21440 DAG.getConstant(1, dl, MVT::i32));
21442 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21443 /* Alignment = */ 1);
21445 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21446 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21447 DAG.getConstant(5, dl, MVT::i32));
21448 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21449 Addr, MachinePointerInfo(TrmpAddr, 5),
21450 /* Alignment = */ 1);
21452 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21453 DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21456 /* Alignment = */ 1);
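    // For reference, the stores above assemble the following 10-byte 32-bit
    // trampoline (an illustrative byte-level sketch):
    //   offset 0: B8+reg <imm32 nest>   movl <nest>, %eax or %ecx
    //   offset 5: E9 <rel32>            jmp  <fptr>, rel32 = fptr - (trmp + 10)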
    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}
21462 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21463 SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to zero

   FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to zero
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

   To perform the conversion, we do:
     (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
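  // Worked example: with FPSR bits 11:10 = 01 (round toward -inf),
  // (FPSR & 0x800) >> 11 == 0 and (FPSR & 0x400) >> 9 == 2, so the result is
  // ((0 | 2) + 1) & 3 == 3, FLT_ROUNDS' encoding for round toward -inf.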
21483 MachineFunction &MF = DAG.getMachineFunction();
21484 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21485 unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
21489 // Save FP Control Word to stack slot
21490 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21491 SDValue StackSlot =
21492 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21494 MachineMemOperand *MMO =
21495 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21496 MachineMemOperand::MOStore, 2, 2);
21498 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21499 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21500 DAG.getVTList(MVT::Other),
21501 Ops, MVT::i16, MMO);
21503 // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
21507 // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
21510 DAG.getNode(ISD::AND, DL, MVT::i16,
21511 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
21512 DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
21515 DAG.getNode(ISD::AND, DL, MVT::i16,
21516 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
21517 DAG.getConstant(9, DL, MVT::i8));
  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
21521 DAG.getNode(ISD::ADD, DL, MVT::i16,
21522 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
21523 DAG.getConstant(1, DL, MVT::i16)),
21524 DAG.getConstant(3, DL, MVT::i16));
21526 return DAG.getNode((VT.getSizeInBits() < 16 ?
                         ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
// Split a unary integer op into 2 half-sized ops.
21531 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21532 MVT VT = Op.getSimpleValueType();
21533 unsigned NumElems = VT.getVectorNumElements();
21534 unsigned SizeInBits = VT.getSizeInBits();
  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
21539 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21540 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21542 MVT EltVT = VT.getVectorElementType();
21543 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21544 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21545 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}
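// Illustrative example of the split performed by LowerVectorIntUnary above:
// a v32i8 CTPOP on a target that only handles v16i8 natively becomes
// concat(ctpop(lo v16i8), ctpop(hi v16i8)).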
21549 // Decompose 256-bit ops into smaller 128-bit ops.
21550 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21551 assert(Op.getSimpleValueType().is256BitVector() &&
21552 Op.getSimpleValueType().isInteger() &&
21553 "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
21557 // Decompose 512-bit ops into smaller 256-bit ops.
21558 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21559 assert(Op.getSimpleValueType().is512BitVector() &&
21560 Op.getSimpleValueType().isInteger() &&
21561 "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth) ). In case zext32(x) is
// illegal, split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
21571 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
21574 MVT VT = Op.getSimpleValueType();
21575 MVT EltVT = VT.getVectorElementType();
21576 unsigned NumElems = VT.getVectorNumElements();
21578 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21579 "Unsupported element type");
  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);
21585 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21586 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21587 "Unsupported value type for operation");
21589 // Use native supported vector instruction vplzcntd.
21590 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21591 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21592 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21593 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}
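// Worked example for LowerVectorCTLZ_AVX512CDI above: the i8 element 0x1A is
// zero-extended to the i32 value 0x0000001A, whose CTLZ is 27; subtracting
// the 32 - 8 = 24 leading zeros introduced by the extension gives
// ctlz8(0x1A) = 3.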
21598 // Lower CTLZ using a PSHUFB lookup table implementation.
21599 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21600 const X86Subtarget &Subtarget,
21601 SelectionDAG &DAG) {
21602 MVT VT = Op.getSimpleValueType();
21603 int NumElts = VT.getVectorNumElements();
21604 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21605 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21607 // Per-nibble leading zero PSHUFB lookup table.
21608 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21609 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21610 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21611 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21613 SmallVector<SDValue, 64> LUTVec;
21614 for (int i = 0; i < NumBytes; ++i)
21615 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21616 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // addition).
21623 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21624 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21626 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21627 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21628 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ;
21631 if (CurrVT.is512BitVector()) {
21632 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21633 HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
    HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
  } else {
    HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
  }
21639 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21640 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21641 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21642 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
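  // Worked example for one byte: for x = 0x05 the hi nibble 0x0 maps to
  // LUT 4 and HiZ is true, so the lo nibble's LUT 1 is kept and
  // Res = 4 + 1 = 5 = ctlz8(0x05). For x = 0x1A the hi nibble 0x1 maps to
  // LUT 3, HiZ is false so the lo result is masked to zero, and Res = 3.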
21644 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21645 // of the current vector width in the same way we did for the nibbles.
21646 // If the upper half of the input element is zero then add the halves'
21647 // leading zero counts together, otherwise just use the upper half's.
21648 // Double the width of the result until we are at target width.
21649 while (CurrVT != VT) {
21650 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21651 int CurrNumElts = CurrVT.getVectorNumElements();
21652 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21653 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21654 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21656 // Check if the upper half of the input element is zero.
21657 if (CurrVT.is512BitVector()) {
21658 MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
21659 HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
21660 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
      HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
    } else {
      HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                         DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    }
21666 HiZ = DAG.getBitcast(NextVT, HiZ);
21668 // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
21671 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21672 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21673 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21674 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}
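// Worked example of one merge step above (v16i8 counts -> v8i16 counts) on
// the element 0x001A: the upper byte is zero, contributing its full count of
// 8, and the lower byte contributes ctlz8(0x1A) = 3, so the merged count is
// 8 + 3 = 11 = ctlz16(0x001A).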
21682 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21683 const X86Subtarget &Subtarget,
21684 SelectionDAG &DAG) {
21685 MVT VT = Op.getSimpleValueType();
21687 if (Subtarget.hasCDI())
21688 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21690 // Decompose 256-bit ops into smaller 128-bit ops.
21691 if (VT.is256BitVector() && !Subtarget.hasInt256())
21692 return Lower256IntUnary(Op, DAG);
21694 // Decompose 512-bit ops into smaller 256-bit ops.
21695 if (VT.is512BitVector() && !Subtarget.hasBWI())
21696 return Lower512IntUnary(Op, DAG);
21698 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}
21702 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21703 SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
21713 Op = Op.getOperand(0);
21714 if (VT == MVT::i8) {
    // Zero extend to i32 since there is no i8 BSR.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }
21720 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21721 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21722 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21724 if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }
21735 // Finally xor with NumBits-1.
21736 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21737 DAG.getConstant(NumBits - 1, dl, OpVT));
  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}
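// Note on the XOR above: for a non-zero i32 input, ctlz(x) = 31 - bsr(x),
// and 31 - n == n ^ 31 for 0 <= n <= 31, so the XOR with NumBits - 1
// implements the subtraction. E.g. x = 1 gives bsr = 0 and 0 ^ 31 = 31.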
21744 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21745 MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);
21749 if (VT.isVector()) {
21750 SDValue N0 = Op.getOperand(0);
21751 SDValue Zero = DAG.getConstant(0, dl, VT);
21753 // lsb(x) = (x & -x)
21754 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21755 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21757 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21758 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21759 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21760 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }
21764 // cttz(x) = ctpop(lsb - 1)
21765 SDValue One = DAG.getConstant(1, dl, VT);
21766 return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }
21770 assert(Op.getOpcode() == ISD::CTTZ &&
21771 "Only scalar CTTZ requires custom lowering");
21773 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21774 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21775 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
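// Worked example for the vector identities above with x = 12 (0b1100):
// lsb = 12 & -12 = 4; cttz_undef = 31 - ctlz(4) = 31 - 29 = 2; and
// cttz = ctpop(4 - 1) = ctpop(0b11) = 2, both matching cttz(12) = 2.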
21787 /// Break a 256-bit integer operation into two new 128-bit ones and then
21788 /// concatenate the result back.
21789 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21790 MVT VT = Op.getSimpleValueType();
21792 assert(VT.is256BitVector() && VT.isInteger() &&
21793 "Unsupported value type for operation");
  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
21798 // Extract the LHS vectors
21799 SDValue LHS = Op.getOperand(0);
21800 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21801 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21803 // Extract the RHS vectors
21804 SDValue RHS = Op.getOperand(1);
21805 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21806 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21808 MVT EltVT = VT.getVectorElementType();
21809 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21811 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21812 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
21816 /// Break a 512-bit integer operation into two new 256-bit ones and then
21817 /// concatenate the result back.
21818 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21819 MVT VT = Op.getSimpleValueType();
21821 assert(VT.is512BitVector() && VT.isInteger() &&
21822 "Unsupported value type for operation");
  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
21827 // Extract the LHS vectors
21828 SDValue LHS = Op.getOperand(0);
21829 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21830 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21832 // Extract the RHS vectors
21833 SDValue RHS = Op.getOperand(1);
21834 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21835 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21837 MVT EltVT = VT.getVectorElementType();
21838 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21840 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21841 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}
21845 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21846 MVT VT = Op.getSimpleValueType();
21847 if (VT.getScalarType() == MVT::i1)
21848 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21849 Op.getOperand(0), Op.getOperand(1));
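  // In a 1-bit lane, addition and subtraction are both arithmetic modulo 2,
  // which is exactly XOR (e.g. (1 + 1) mod 2 == 0 == 1 ^ 1), hence the
  // lowering above.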
21850 assert(Op.getSimpleValueType().is256BitVector() &&
21851 Op.getSimpleValueType().isInteger() &&
21852 "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
21856 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21857 MVT VT = Op.getSimpleValueType();
21858 if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
    // Since X86 does not have CMOV for 8-bit integers, we don't convert
    // 8-bit integer abs to NEG and CMOV.
    SDLoc DL(Op);
    SDValue N0 = Op.getOperand(0);
21863 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
21864 DAG.getConstant(0, DL, VT), N0);
21865 SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
21866 SDValue(Neg.getNode(), 1)};
    return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
  }
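  // Note: the X86ISD::SUB above computes 0 - X and also produces EFLAGS;
  // SDValue(Neg.getNode(), 1) forwards that flag result so the CMOV can pick
  // 0 - X exactly when it is non-negative (COND_GE), yielding abs(X).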
21870 assert(Op.getSimpleValueType().is256BitVector() &&
21871 Op.getSimpleValueType().isInteger() &&
21872 "Only handle AVX 256-bit vector integer operation");
  return Lower256IntUnary(Op, DAG);
}
21876 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21877 assert(Op.getSimpleValueType().is256BitVector() &&
21878 Op.getSimpleValueType().isInteger() &&
21879 "Only handle AVX 256-bit vector integer operation");
  return Lower256IntArith(Op, DAG);
}
21883 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21884 SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
21888 if (VT.getScalarType() == MVT::i1)
21889 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21891 // Decompose 256-bit ops into smaller 128-bit ops.
21892 if (VT.is256BitVector() && !Subtarget.hasInt256())
21893 return Lower256IntArith(Op, DAG);
21895 SDValue A = Op.getOperand(0);
21896 SDValue B = Op.getOperand(1);
21898 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21899 // vector pairs, multiply and truncate.
21900 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21901 if (Subtarget.hasInt256()) {
21902 // For 512-bit vectors, split into 256-bit vectors to allow the
21903 // sign-extension to occur.
21904 if (VT == MVT::v64i8)
21905 return Lower512IntArith(Op, DAG);
21907 // For 256-bit vectors, split into 128-bit vectors to allow the
21908 // sign-extension to occur. We don't need this on AVX512BW as we can
21909 // safely sign-extend to v32i16.
21910 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21911 return Lower256IntArith(Op, DAG);
21913 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21914 return DAG.getNode(
21915 ISD::TRUNCATE, dl, VT,
21916 DAG.getNode(ISD::MUL, dl, ExVT,
21917 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
                          DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
    }
21921 assert(VT == MVT::v16i8 &&
21922 "Pre-AVX2 support only supports v16i8 multiplication");
21923 MVT ExVT = MVT::v8i16;
    // Extract the lo parts and sign extend to i16
    SDValue ALo, BLo;
21927 if (Subtarget.hasSSE41()) {
21928 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
    } else {
21931 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21932 -1, 4, -1, 5, -1, 6, -1, 7};
21933 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21934 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21935 ALo = DAG.getBitcast(ExVT, ALo);
21936 BLo = DAG.getBitcast(ExVT, BLo);
21937 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
      BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
    }
    // Extract the hi parts and sign extend to i16
    SDValue AHi, BHi;
21943 if (Subtarget.hasSSE41()) {
21944 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21945 -1, -1, -1, -1, -1, -1, -1, -1};
21946 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21947 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21948 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
    } else {
21951 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21952 -1, 12, -1, 13, -1, 14, -1, 15};
21953 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21954 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21955 AHi = DAG.getBitcast(ExVT, AHi);
21956 BHi = DAG.getBitcast(ExVT, BHi);
21957 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
      BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
    }

    // Multiply, mask the lower 8 bits of the lo/hi results and pack.
21962 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21963 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21964 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21965 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
  }
21969 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21970 if (VT == MVT::v4i32) {
21971 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21972 "Should not custom lower when pmuldq is available!");
21974 // Extract the odd parts.
21975 static const int UnpackMask[] = { 1, -1, 3, -1 };
21976 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21977 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21979 // Multiply the even parts.
21980 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21981 // Now multiply odd parts.
21982 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21984 Evens = DAG.getBitcast(VT, Evens);
21985 Odds = DAG.getBitcast(VT, Odds);
    // Merge the two vectors back together with a shuffle. This expands into 2
    // shuffles.
21989 static const int ShufMask[] = { 0, 4, 2, 6 };
    return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
  }
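  // Illustration of the v4i32 path above for <a|b|c|d> * <e|f|g|h>:
  // Evens = <a*e|c*g> and Odds = <b*f|d*h> as v2i64; after the bitcasts the
  // {0, 4, 2, 6} shuffle gathers the low 32-bit half of each product into
  // <a*e|b*f|c*g|d*h> (each taken mod 2^32).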
21993 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21994 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21996 // 32-bit vector types used for MULDQ/MULUDQ.
21997 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21999 // MULDQ returns the 64-bit result of the signed multiplication of the lower
22000 // 32-bits. We can lower with this if the sign bits stretch that far.
22001 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
22002 DAG.ComputeNumSignBits(B) > 32) {
22003 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
22004 DAG.getBitcast(MulVT, B));
22007 // Ahi = psrlqi(a, 32);
22008 // Bhi = psrlqi(b, 32);
22010 // AloBlo = pmuludq(a, b);
22011 // AloBhi = pmuludq(a, Bhi);
22012 // AhiBlo = pmuludq(Ahi, b);
22014 // Hi = psllqi(AloBhi + AhiBlo, 32);
22015 // return AloBlo + Hi;
22016 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
22017 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
22018 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
22020 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
22021 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
22022 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
22024 // Bit cast to 32-bit vectors for MULUDQ.
22025 SDValue Alo = DAG.getBitcast(MulVT, A);
22026 SDValue Blo = DAG.getBitcast(MulVT, B);
22028 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22030 // Only multiply lo/hi halves that aren't known to be zero.
22031 SDValue AloBlo = Zero;
22032 if (!ALoIsZero && !BLoIsZero)
22033 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
22035 SDValue AloBhi = Zero;
22036 if (!ALoIsZero && !BHiIsZero) {
22037 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
22038 Bhi = DAG.getBitcast(MulVT, Bhi);
22039 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
22042 SDValue AhiBlo = Zero;
22043 if (!AHiIsZero && !BLoIsZero) {
22044 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
22045 Ahi = DAG.getBitcast(MulVT, Ahi);
22046 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
22049 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
22050 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
  return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
}
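// The sequence above is the schoolbook decomposition: with a = Ahi*2^32 + Alo
// and b = Bhi*2^32 + Blo, a*b mod 2^64 = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32);
// the Ahi*Bhi term is shifted out past bit 63 and so never needs computing.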
22055 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
22056 SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
22060 // Decompose 256-bit ops into smaller 128-bit ops.
22061 if (VT.is256BitVector() && !Subtarget.hasInt256())
22062 return Lower256IntArith(Op, DAG);
22064 // Only i8 vectors should need custom lowering after this.
22065 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
22066 (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
22067 "Unsupported vector type");
22069 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
22070 // logical shift down the upper half and pack back to i8.
22071 SDValue A = Op.getOperand(0);
22072 SDValue B = Op.getOperand(1);
22074 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
22075 // and then ashr/lshr the upper bits down to the lower bits before multiply.
22076 unsigned Opcode = Op.getOpcode();
22077 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
22078 unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
22080 // For 512-bit vectors, split into 256-bit vectors to allow the
22081 // sign-extension to occur.
22082 if (VT == MVT::v64i8)
22083 return Lower512IntArith(Op, DAG);
22085 // AVX2 implementations - extend xmm subvectors to ymm.
22086 if (Subtarget.hasInt256()) {
22087 unsigned NumElems = VT.getVectorNumElements();
22088 SDValue Lo = DAG.getIntPtrConstant(0, dl);
22089 SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
22091 if (VT == MVT::v32i8) {
22092 if (Subtarget.hasBWI()) {
22093 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
22094 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
22095 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
22096 Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
22097 DAG.getConstant(8, dl, MVT::v32i16));
        return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
      }
22100 SDValue ALo = extract128BitVector(A, 0, DAG, dl);
22101 SDValue BLo = extract128BitVector(B, 0, DAG, dl);
22102 SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
22103 SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
22104 ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
22105 BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
22106 AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
22107 BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
22108 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22109 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
22110 DAG.getConstant(8, dl, MVT::v16i16));
22111 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
22112 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
22113 DAG.getConstant(8, dl, MVT::v16i16));
22114 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
22115 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
22116 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
22117 16, 17, 18, 19, 20, 21, 22, 23};
22118 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22119 24, 25, 26, 27, 28, 29, 30, 31};
22120 return DAG.getNode(X86ISD::PACKUS, dl, VT,
22121 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
                         DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
    }
22125 SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
22126 SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
22127 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
22128 Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
22129 DAG.getConstant(8, dl, MVT::v16i16));
22130 // If we have BWI we can use truncate instruction.
22131 if (Subtarget.hasBWI())
22132 return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
22133 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
22134 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }
22138 assert(VT == MVT::v16i8 &&
22139 "Pre-AVX2 support only supports v16i8 multiplication");
22140 MVT ExVT = MVT::v8i16;
22141 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
  // Extract the lo parts and zero/sign extend to i16.
  SDValue ALo, BLo;
22145 if (Subtarget.hasSSE41()) {
22146 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
  } else {
22149 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
22150 -1, 4, -1, 5, -1, 6, -1, 7};
22151 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22152 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22153 ALo = DAG.getBitcast(ExVT, ALo);
22154 BLo = DAG.getBitcast(ExVT, BLo);
22155 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
    BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
  }

  // Extract the hi parts and zero/sign extend to i16.
  SDValue AHi, BHi;
22161 if (Subtarget.hasSSE41()) {
22162 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
22163 -1, -1, -1, -1, -1, -1, -1, -1};
22164 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22165 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22166 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
  } else {
22169 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
22170 -1, 12, -1, 13, -1, 14, -1, 15};
22171 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
22172 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
22173 AHi = DAG.getBitcast(ExVT, AHi);
22174 BHi = DAG.getBitcast(ExVT, BHi);
22175 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
    BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
  }

  // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
  // and pack back to v16i8.
22181 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
22182 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
22183 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
22184 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
  return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
}
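// Quick sanity check of the scheme above: mulhu8(200, 3) extends to the i16
// product 200 * 3 = 600 = 0x258, and 0x258 >> 8 = 2, the high byte of the
// 16-bit product.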
22188 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
22189 assert(Subtarget.isTargetWin64() && "Unexpected target");
22190 EVT VT = Op.getValueType();
22191 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
22192 "Unexpected return type for lowering");
22196 switch (Op->getOpcode()) {
22197 default: llvm_unreachable("Unexpected request for libcall!");
22198 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
22199 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
22200 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
22201 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
22202 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
  case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
  }

  SDLoc dl(Op);
22207 SDValue InChain = DAG.getEntryNode();
22209 TargetLowering::ArgListTy Args;
22210 TargetLowering::ArgListEntry Entry;
22211 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
22212 EVT ArgVT = Op->getOperand(i).getValueType();
22213 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
22214 "Unexpected argument type for lowering");
22215 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
22216 Entry.Node = StackPtr;
22217 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
22218 MachinePointerInfo(), /* Alignment = */ 16);
22219 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
    Entry.Ty = PointerType::get(ArgTy, 0);
22221 Entry.IsSExt = false;
22222 Entry.IsZExt = false;
    Args.push_back(Entry);
  }
22226 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
22227 getPointerTy(DAG.getDataLayout()));
22229 TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(InChain)
      .setLibCallee(
          getLibcallCallingConv(LC),
          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
          std::move(Args))
      .setInRegister()
22237 .setSExtResult(isSigned)
22238 .setZExtResult(!isSigned);
22240 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
  return DAG.getBitcast(VT, CallInfo.first);
}
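// Note: on Win64 these 128-bit divide/remainder libcalls take their i128
// arguments indirectly through 16-byte-aligned stack slots (hence the stores
// above) and hand the result back in a vector register, which is why the
// call is typed as v2i64 and the result bitcast back to the integer type.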
22244 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
22245 SelectionDAG &DAG) {
22246 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
  MVT VT = Op0.getSimpleValueType();
  SDLoc dl(Op);
22250 // Decompose 256-bit ops into smaller 128-bit ops.
22251 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22252 unsigned Opcode = Op.getOpcode();
22253 unsigned NumElems = VT.getVectorNumElements();
22254 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
22255 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
22256 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
22257 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
22258 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
22259 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
22260 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
    SDValue Ops[] = {
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
      DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
    };
    return DAG.getMergeValues(Ops, dl);
  }
22268 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
22269 (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
22270 (VT == MVT::v16i32 && Subtarget.hasAVX512()));
22272 int NumElts = VT.getVectorNumElements();
  // PMULxD operations multiply each even value (starting at 0) of LHS with
  // the related value of RHS and produce a widened result.
22276 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22277 // => <2 x i64> <ae|cg>
  // In other words, to have all the results, we need to perform two PMULxD:
  // 1. one with the even values.
  // 2. one with the odd values.
  // To achieve #2, we need to place the odd values at an even position.
  //
22284 // Place the odd value at an even position (basically, shift all values 1
22285 // step to the left):
22286 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
22287 // <a|b|c|d> => <b|undef|d|undef>
22288 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
22289 makeArrayRef(&Mask[0], NumElts));
22290 // <e|f|g|h> => <f|undef|h|undef>
22291 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
22292 makeArrayRef(&Mask[0], NumElts));
  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
  // ints.
22296 MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
  unsigned Opcode =
      (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
22300 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
22301 // => <2 x i64> <ae|cg>
22302 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
22303 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
22304 // => <2 x i64> <bf|dh>
22305 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
22307 // Shuffle it back into the right order.
22308 SmallVector<int, 16> HighMask(NumElts);
22309 SmallVector<int, 16> LowMask(NumElts);
22310 for (int i = 0; i != NumElts; ++i) {
22311 HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
    LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
  }
22315 SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
22316 SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
  // If we have a signed multiply but no PMULDQ, fix up the high parts of an
  // unsigned multiply.
22320 if (IsSigned && !Subtarget.hasSSE41()) {
    SDValue ShAmt = DAG.getConstant(
        31, dl,
22323 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
22324 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
22325 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
22326 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
22327 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
22329 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
  }

  // The first result of MUL_LOHI is actually the low value, followed by the
  // high one.
  SDValue Ops[] = {Lows, Highs};
  return DAG.getMergeValues(Ops, dl);
}
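// The fixup above implements the standard identity between the signed and
// unsigned high halves: mulhs(a, b) = mulhu(a, b) - (a < 0 ? b : 0)
// - (b < 0 ? a : 0); the SRA by 31 materializes the (x < 0) all-ones masks.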
22339 // Return true if the required (according to Opcode) shift-imm form is natively
22340 // supported by the Subtarget
static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
                                        unsigned Opcode) {
  if (VT.getScalarSizeInBits() < 16)
    return false;
22346 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
      (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
    return true;
22350 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
22351 (VT.is256BitVector() && Subtarget.hasInt256());
22353 bool AShift = LShift && (Subtarget.hasAVX512() ||
22354 (VT != MVT::v2i64 && VT != MVT::v4i64));
  return (Opcode == ISD::SRA) ? AShift : LShift;
}
22358 // The shift amount is a variable, but it is the same for all vector lanes.
22359 // These instructions are defined together with shift-immediate.
static
bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
                                      unsigned Opcode) {
  return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
}
22366 // Return true if the required (according to Opcode) variable-shift form is
22367 // natively supported by the Subtarget
static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
                                    unsigned Opcode) {
  if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
    return false;
22374 // vXi16 supported only on AVX-512, BWI
  if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
    return false;
  if (Subtarget.hasAVX512())
    return true;
22381 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22382 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
  return (Opcode == ISD::SRA) ? AShift : LShift;
}
22386 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22387 const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
22390 SDValue R = Op.getOperand(0);
22391 SDValue Amt = Op.getOperand(1);
22393 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22394 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22396 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22397 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22398 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22399 SDValue Ex = DAG.getBitcast(ExVT, R);
22401 // ashr(R, 63) === cmp_slt(R, 0)
22402 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22403 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22404 "Unsupported PCMPGT op");
22405 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
                         getZeroVector(VT, Subtarget, DAG, dl), R);
    }
22409 if (ShiftAmt >= 32) {
      // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
      SDValue Upper =
          getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22413 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22414 ShiftAmt - 32, DAG);
22415 if (VT == MVT::v2i64)
22416 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22417 if (VT == MVT::v4i64)
22418 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                 {9, 1, 11, 3, 13, 5, 15, 7});
    } else {
      // SRA upper i32, SRL whole i64 and select lower i32.
      SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
                                                 ShiftAmt, DAG);
      SDValue Lower =
22425 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22426 Lower = DAG.getBitcast(ExVT, Lower);
22427 if (VT == MVT::v2i64)
22428 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22429 if (VT == MVT::v4i64)
22430 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
                                 {8, 1, 10, 3, 12, 5, 14, 7});
    }
    return DAG.getBitcast(VT, Ex);
  };
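  // Sketch of the two cases in ArithmeticShiftRight64 above for an ashr by a
  // constant s: for s >= 32 the low half of each lane is SRA(hi32, s - 32) and
  // the high half is the sign splat SRA(hi32, 31); for s < 32 an SRA of the
  // upper i32 supplies the shifted-in sign bits while an SRL of the whole i64
  // supplies the remaining bits, and the shuffles select between them.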
22436 // Optimize shl/srl/sra with constant shift amount.
22437 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22438 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22439 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22441 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22442 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22444 // i64 SRA needs to be performed as partial shifts.
22445 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22446 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22447 Op.getOpcode() == ISD::SRA)
22448 return ArithmeticShiftRight64(ShiftAmt);
22450 if (VT == MVT::v16i8 ||
22451 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22452 VT == MVT::v64i8) {
22453 unsigned NumElts = VT.getVectorNumElements();
22454 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22456 // Simple i8 add case
22457 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22458 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22460 // ashr(R, 7) === cmp_slt(R, 0)
22461 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22462 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22463 if (VT.is512BitVector()) {
22464 assert(VT == MVT::v64i8 && "Unexpected element type!");
22465 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22466 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22468 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22471 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
      if (VT == MVT::v16i8 && Subtarget.hasXOP())
        return SDValue();
22475 if (Op.getOpcode() == ISD::SHL) {
22476 // Make a large shift.
        SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
                                                 R, ShiftAmt, DAG);
22479 SHL = DAG.getBitcast(VT, SHL);
22480 // Zero out the rightmost bits.
22481 return DAG.getNode(ISD::AND, dl, VT, SHL,
                           DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
      }
22484 if (Op.getOpcode() == ISD::SRL) {
22485 // Make a large shift.
        SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
                                                 R, ShiftAmt, DAG);
22488 SRL = DAG.getBitcast(VT, SRL);
22489 // Zero out the leftmost bits.
22490 return DAG.getNode(ISD::AND, dl, VT, SRL,
                           DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
      }
22493 if (Op.getOpcode() == ISD::SRA) {
22494 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22495 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22497 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22498 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
        Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
        return Res;
      }
      llvm_unreachable("Unknown shift opcode.");
    }
  }
}
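  // Worked example of the i8 SRA sequence above with x = 0xF0 (-16) and
  // ShiftAmt = 3: lshr gives 0x1E, Mask = 128 >> 3 = 0x10, and
  // (0x1E ^ 0x10) - 0x10 = 0x0E - 0x10 = 0xFE, i.e. -2 = -16 >> 3.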
22507 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22508 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22509 if (!Subtarget.hasXOP() &&
22510 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22511 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22514 unsigned SubVectorScale = 1;
22515 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SubVectorScale =
          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
      Amt = Amt.getOperand(0);
    }
22521 // Peek through any splat that was introduced for i64 shift vectorization.
22522 int SplatIndex = -1;
22523 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22524 if (SVN->isSplat()) {
22525 SplatIndex = SVN->getSplatIndex();
22526 Amt = Amt.getOperand(0);
22527 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22528 "Splat shuffle referencing second operand");
22531 if (Amt.getOpcode() != ISD::BITCAST ||
        Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
      return SDValue();
22535 Amt = Amt.getOperand(0);
22536 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22537 (SubVectorScale * VT.getVectorNumElements());
22538 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22539 uint64_t ShiftAmt = 0;
22540 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22541 for (unsigned i = 0; i != Ratio; ++i) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
      if (!C)
        return SDValue();
      ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
    }
22549 // Check remaining shift amounts (if not a splat).
22550 if (SplatIndex < 0) {
22551 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22552 uint64_t ShAmt = 0;
22553 for (unsigned j = 0; j != Ratio; ++j) {
          ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
          if (!C)
            return SDValue();
          ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
        }
        if (ShAmt != ShiftAmt)
          return SDValue();
      }
    }
22565 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22566 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22568 if (Op.getOpcode() == ISD::SRA)
      return ArithmeticShiftRight64(ShiftAmt);
  }

  return SDValue();
}
22575 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22576 const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
22579 SDValue R = Op.getOperand(0);
22580 SDValue Amt = Op.getOperand(1);
22582 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22583 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22585 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22586 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
  if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
    SDValue BaseShAmt;
    MVT EltVT = VT.getVectorElementType();
22592 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22593 // Check if this build_vector node is doing a splat.
22594 // If so, then set BaseShAmt equal to the splat value.
22595 BaseShAmt = BV->getSplatValue();
22596 if (BaseShAmt && BaseShAmt.isUndef())
        BaseShAmt = SDValue();
    } else {
22599 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22600 Amt = Amt.getOperand(0);
22602 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22603 if (SVN && SVN->isSplat()) {
22604 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22605 SDValue InVec = Amt.getOperand(0);
22606 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22607 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22608 "Unexpected shuffle index found!");
22609 BaseShAmt = InVec.getOperand(SplatIdx);
22610 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22611 if (ConstantSDNode *C =
22612 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22613 if (C->getZExtValue() == SplatIdx)
              BaseShAmt = InVec.getOperand(1);
          }
        }

        if (!BaseShAmt)
22619 // Avoid introducing an extract element from a shuffle.
22620 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
                                  DAG.getIntPtrConstant(SplatIdx, dl));
      }
    }
22625 if (BaseShAmt.getNode()) {
22626 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22627 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22628 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22629 else if (EltVT.bitsLT(MVT::i32))
22630 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
      return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
    }
  }
22636 // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
22637 if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
22638 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22639 Amt = Amt.getOperand(0);
22640 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22641 VT.getVectorNumElements();
22642 std::vector<SDValue> Vals(Ratio);
22643 for (unsigned i = 0; i != Ratio; ++i)
22644 Vals[i] = Amt.getOperand(i);
22645 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22646 for (unsigned j = 0; j != Ratio; ++j)
        if (Vals[j] != Amt.getOperand(i + j))
          return SDValue();
    }
22651 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
      return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
  }

  return SDValue();
}
22657 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22658 SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);
22661 SDValue R = Op.getOperand(0);
22662 SDValue Amt = Op.getOperand(1);
22663 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22665 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22666 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
  if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
    return V;

  if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
    return V;

  if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
    return Op;
22677 // XOP has 128-bit variable logical/arithmetic shifts.
22678 // +ve/-ve Amt = shift left/right.
22679 if (Subtarget.hasXOP() &&
22680 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22681 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22682 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22683 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
      Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
    }
22686 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22687 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22688 if (Op.getOpcode() == ISD::SRA)
      return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
  }
22692 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22693 // shifts per-lane and then shuffle the partial results back together.
22694 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22695 // Splat the shift amounts so the scalar shifts above will catch it.
22696 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22697 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22698 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22699 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
    return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
  }
22703 // i64 vector arithmetic shift can be emulated with the transform:
22704 // M = lshr(SIGN_MASK, Amt)
22705 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22706 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22707 Op.getOpcode() == ISD::SRA) {
22708 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22709 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22710 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22711 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
    R = DAG.getNode(ISD::SUB, dl, VT, R, M);
    return R;
  }
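  // Worked example of the transform above, shown in 8 bits for brevity:
  // x = 0xF8 (-8), Amt = 1: M = 0x80 >> 1 = 0x40, lshr(x, 1) = 0x7C,
  // 0x7C ^ 0x40 = 0x3C, and 0x3C - 0x40 = 0xFC = -4, the arithmetic -8 >> 1.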
22716 // If possible, lower this packed shift into a vector multiply instead of
22717 // expanding it into a sequence of scalar shifts.
22718 // Do this only if the vector shift count is a constant build_vector.
22719 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22720 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22721 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22722 SmallVector<SDValue, 8> Elts;
22723 MVT SVT = VT.getVectorElementType();
22724 unsigned SVTBits = SVT.getSizeInBits();
22725 APInt One(SVTBits, 1);
22726 unsigned NumElems = VT.getVectorNumElements();
    for (unsigned i = 0; i != NumElems; ++i) {
22729 SDValue Op = Amt->getOperand(i);
22730 if (Op->isUndef()) {
        Elts.push_back(Op);
        continue;
      }
22735 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22736 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22737 uint64_t ShAmt = C.getZExtValue();
22738 if (ShAmt >= SVTBits) {
        Elts.push_back(DAG.getUNDEF(SVT));
        continue;
      }
      Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
    }
22744 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
    return DAG.getNode(ISD::MUL, dl, VT, R, BV);
  }
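  // E.g. (shl <4 x i32> %x, <1, 2, 3, 4>) becomes
  // (mul %x, <2, 4, 8, 16>), which later lowering can match to a single
  // vector multiply (or the PMULUDQ sequence on older targets).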
22748 // Lower SHL with variable shift amount.
22749 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22750 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22752 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22753 DAG.getConstant(0x3f800000U, dl, VT));
22754 Op = DAG.getBitcast(MVT::v4f32, Op);
22755 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
    return DAG.getNode(ISD::MUL, dl, VT, Op, R);
  }
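  // The sequence above builds 2^Amt via the IEEE-754 representation:
  // (Amt << 23) + 0x3f800000 (the bits of 1.0f) is exactly 1.0 * 2^Amt, so
  // FP_TO_SINT recovers the power of two. E.g. Amt = 3 gives bits
  // 0x41000000 = 8.0f -> 8.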
22759 // If possible, lower this shift as a sequence of two shifts by
22760 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
  // Example:
  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22764 // Could be rewritten as:
22765 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22767 // The advantage is that the two shifts from the example would be
22768 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
  // the vector shift into four scalar shifts plus four pairs of vector
  // insert/extract.
22771 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22772 bool UseMOVSD = false;
22773 bool CanBeSimplified;
22774 // The splat value for the first packed shift (the 'X' from the example).
22775 SDValue Amt1 = Amt->getOperand(0);
22776 // The splat value for the second packed shift (the 'Y' from the example).
22777 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22779 // See if it is possible to replace this node with a sequence of
22780 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22781 if (VT == MVT::v4i32) {
22782 // Check if it is legal to use a MOVSS.
22783 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22784 Amt2 == Amt->getOperand(3);
22785 if (!CanBeSimplified) {
22786 // Otherwise, check if we can still simplify this node using a MOVSD.
22787 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
                          Amt->getOperand(2) == Amt->getOperand(3);
        UseMOVSD = true;
        Amt2 = Amt->getOperand(2);
      }
    } else {
      // Do similar checks for the case where the machine value type
      // is MVT::v8i16.
22795 CanBeSimplified = Amt1 == Amt->getOperand(1);
22796 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22797 CanBeSimplified = Amt2 == Amt->getOperand(i);
22799 if (!CanBeSimplified) {
        UseMOVSD = true;
        CanBeSimplified = true;
22802 Amt2 = Amt->getOperand(4);
22803 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22804 CanBeSimplified = Amt1 == Amt->getOperand(i);
22805 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
          CanBeSimplified = Amt2 == Amt->getOperand(j);
      }
    }
22810 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22811 isa<ConstantSDNode>(Amt2)) {
22812 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
      SDValue Splat1 =
          DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22815 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
      SDValue Splat2 =
          DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22818 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22819 SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
      SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
      if (UseMOVSD)
22822 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
22823 BitCast2, {0, 1, 6, 7}));
22824 return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
                                                     BitCast2, {0, 5, 6, 7}));
    }
  }
22829 // v4i32 Non Uniform Shifts.
22830 // If the shift amount is constant we can shift each lane using the SSE2
22831 // immediate shifts, else we need to zero-extend each lane to the lower i64
22832 // and shift using the SSE2 variable shifts.
22833 // The separate results can then be blended together.
22834 if (VT == MVT::v4i32) {
22835 unsigned Opc = Op.getOpcode();
    SDValue Amt0, Amt1, Amt2, Amt3;
    if (ConstantAmt) {
22838 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22839 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22840 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
    } else {
      // ISD::SHL is handled above but we include it here for completeness.
      switch (Opc) {
      default:
        llvm_unreachable("Unknown target vector shift node");
      case ISD::SHL:
        Opc = X86ISD::VSHL;
        break;
      case ISD::SRL:
        Opc = X86ISD::VSRL;
        break;
      case ISD::SRA:
        Opc = X86ISD::VSRA;
        break;
      }
22857 // The SSE2 shifts use the lower i64 as the same shift amount for
22858 // all lanes and the upper i64 is ignored. These shuffle masks
      // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22860 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22861 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22862 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22863 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
      Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
    }
22867 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22868 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22869 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22870 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22871 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22872 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
    return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
  }
22876 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22877 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22878 // make the existing SSE solution better.
22879 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22880 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22881 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22882 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22883 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22884 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
    unsigned ExtOpc =
        Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22887 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22888 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22889 return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
  }
  if (VT == MVT::v16i8 ||
      (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
      (VT == MVT::v64i8 && Subtarget.hasBWI())) {
    MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
    unsigned ShiftOpcode = Op->getOpcode();

    auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
      if (VT.is512BitVector()) {
        // On AVX512BW targets we make use of the fact that VSELECT lowers
        // to a masked blend which selects bytes based just on the sign bit
        // extracted to a mask.
        MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      } else if (Subtarget.hasSSE41()) {
        // On SSE41 targets we make use of the fact that VSELECT lowers
        // to PBLENDVB which selects bytes based just on the sign bit.
        V0 = DAG.getBitcast(VT, V0);
        V1 = DAG.getBitcast(VT, V1);
        Sel = DAG.getBitcast(VT, Sel);
        return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we test for the sign bit by comparing to
      // zero - a negative value will set all bits of the lanes to true
      // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
      SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
      return DAG.getSelect(dl, SelVT, C, V0, V1);
    };
    // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
    // We can safely do this using i16 shifts as we're only interested in
    // the 3 lower bits of each byte.
    Amt = DAG.getBitcast(ExtVT, Amt);
    Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
    Amt = DAG.getBitcast(VT, Amt);

    if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
      // r = VSELECT(r, shift(r, 4), a);
      SDValue M =
          DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // r = VSELECT(r, shift(r, 2), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);

      // a += a
      Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

      // return VSELECT(r, shift(r, 1), a);
      M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
      R = SignBitSelect(VT, Amt, M, R);
      return R;
    }
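
    // Sketch of the select-based byte shift above (illustrative): after
    // a = a << 5 the sign bit of each byte of Amt is bit 2 of the original
    // shift amount. Each SignBitSelect step conditionally applies a shift by
    // 4, then 2, then 1, and each "a += a" moves the next lower amount bit
    // into the sign-bit position, so any amount 0-7 is composed from the
    // three partial shifts.
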
    if (Op->getOpcode() == ISD::SRA) {
      // For SRA we need to unpack each byte to the higher byte of an i16
      // vector so we can correctly sign extend. We don't care what happens
      // to the lower byte.
      SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
      SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
      SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
      ALo = DAG.getBitcast(ExtVT, ALo);
      AHi = DAG.getBitcast(ExtVT, AHi);
      RLo = DAG.getBitcast(ExtVT, RLo);
      RHi = DAG.getBitcast(ExtVT, RHi);

      // r = VSELECT(r, shift(r, 4), a);
      SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                                DAG.getConstant(4, dl, ExtVT));
      SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                                DAG.getConstant(4, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 2), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(2, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(2, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // a += a
      ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
      AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);

      // r = VSELECT(r, shift(r, 1), a);
      MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
                        DAG.getConstant(1, dl, ExtVT));
      MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
                        DAG.getConstant(1, dl, ExtVT));
      RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
      RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);

      // Logical shift the result back to the lower byte, leaving a zero
      // upper byte, meaning that we can safely pack with PACKUSWB.
      RLo =
          DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
      RHi =
          DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
      return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
    }
  }
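
  // Illustrative example for the SRA path above: each byte x is unpacked into
  // the high byte of an i16 lane, so x's sign bit is also the lane's sign bit
  // and the partial i16 arithmetic shifts extend it correctly. The final i16
  // logical shift right by 8 moves the sign-extended byte back down and zeros
  // the upper byte, which is what makes PACKUSWB safe to use.
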
  if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
    MVT ExtVT = MVT::v8i32;
    SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
    SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
    SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
    SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
    SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
    ALo = DAG.getBitcast(ExtVT, ALo);
    AHi = DAG.getBitcast(ExtVT, AHi);
    RLo = DAG.getBitcast(ExtVT, RLo);
    RHi = DAG.getBitcast(ExtVT, RHi);
    SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
    SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
    Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
    Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
    return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
  }

  if (VT == MVT::v8i16) {
    unsigned ShiftOpcode = Op->getOpcode();

    // If we have a constant shift amount, the non-SSE41 path is best as
    // avoiding bitcasts makes it easier to constant fold and reduce to
    // PBLENDW.
    bool UseSSE41 = Subtarget.hasSSE41() &&
                    !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());

    auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
      // On SSE41 targets we make use of the fact that VSELECT lowers
      // to PBLENDVB which selects bytes based just on the sign bit.
      if (UseSSE41) {
        MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
        V0 = DAG.getBitcast(ExtVT, V0);
        V1 = DAG.getBitcast(ExtVT, V1);
        Sel = DAG.getBitcast(ExtVT, Sel);
        return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
      }
      // On pre-SSE41 targets we splat the sign bit - a negative value will
      // set all bits of the lanes to true and VSELECT uses that in
      // its OR(AND(V0,C),AND(V1,~C)) lowering.
      SDValue C =
          DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
      return DAG.getSelect(dl, VT, C, V0, V1);
    };

    // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
    if (UseSSE41) {
      // On SSE41 targets we need to replicate the shift mask in both
      // bytes for PBLENDVB.
      Amt = DAG.getNode(
          ISD::OR, dl, VT,
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
          DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
    } else {
      Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
    }

    // r = VSELECT(r, shift(r, 8), a);
    SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 4), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // r = VSELECT(r, shift(r, 2), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
    R = SignBitSelect(Amt, M, R);

    // a += a
    Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);

    // return VSELECT(r, shift(r, 1), a);
    M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
    R = SignBitSelect(Amt, M, R);
    return R;
  }

  // Decompose 256-bit shifts into smaller 128-bit shifts.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  return SDValue();
}

static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);
  SDValue R = Op.getOperand(0);
  SDValue Amt = Op.getOperand(1);
  unsigned Opcode = Op.getOpcode();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();

  if (Subtarget.hasAVX512()) {
    // Attempt to rotate by immediate.
    APInt UndefElts;
    SmallVector<APInt, 16> EltBits;
    if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
      if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
            return EltBits[0] == V;
          })) {
        unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
        uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
        return DAG.getNode(Op, DL, VT, R,
                           DAG.getConstant(RotateAmt, DL, MVT::i8));
      }
    }

    // Else, fall back on VPROLV/VPRORV.
    return Op;
  }

  assert(VT.isVector() && "Custom lowering only for vector rotates!");
  assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
  assert((Opcode == ISD::ROTL) && "Only ROTL supported");

  // XOP has 128-bit vector variable + immediate rotates.
  // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.

  // Split 256-bit integers.
  if (VT.is256BitVector())
    return Lower256IntArith(Op, DAG);

  assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");

  // Attempt to rotate by immediate.
  if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
    if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
      uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
      assert(RotateAmt < EltSizeInBits && "Rotation out of range");
      return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
                         DAG.getConstant(RotateAmt, DL, MVT::i8));
    }
  }

  // Use general rotate by variable (per-element).
  return Op;
}
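
// Example of the rotate-by-immediate case above (illustrative): a splat
// "rotl v16i8 X, 3" matches the BuildVectorSDNode path and becomes a single
// X86ISD::VROTLI node with immediate 3 (an XOP byte-rotate instruction),
// while a genuinely variable amount stays as ISD::ROTL and selects the
// variable XOP rotate.
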
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
  // Lower the "add/sub/mul with overflow" instruction into a regular
  // instruction plus a "setcc" instruction that checks the overflow flag.
  // The "brcond" lowering looks for this combo and may remove the "setcc"
  // instruction if the "setcc" has only one use.
  SDNode *N = Op.getNode();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  unsigned BaseOp = 0;
  X86::CondCode Cond;
  SDLoc DL(Op);
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Unknown ovf instruction!");
  case ISD::SADDO:
    // An add of one will be selected as an INC. Note that INC doesn't
    // set CF, so we can't do this for UADDO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::INC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_O;
    break;
  case ISD::UADDO:
    BaseOp = X86ISD::ADD;
    Cond = X86::COND_B;
    break;
  case ISD::SSUBO:
    // A subtract of one will be selected as a DEC. Note that DEC doesn't
    // set CF, so we can't do this for USUBO.
    if (isOneConstant(RHS)) {
      BaseOp = X86ISD::DEC;
      Cond = X86::COND_O;
      break;
    }
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_O;
    break;
  case ISD::USUBO:
    BaseOp = X86ISD::SUB;
    Cond = X86::COND_B;
    break;
  case ISD::SMULO:
    BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
    Cond = X86::COND_O;
    break;
  case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
    if (N->getValueType(0) == MVT::i8) {
      BaseOp = X86ISD::UMUL8;
      Cond = X86::COND_O;
      break;
    }
    SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
                                 MVT::i32);
    SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);

    SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);

    if (N->getValueType(1) == MVT::i1)
      SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

    return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
  }
  }

  // Also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
  SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);

  SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);

  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
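
// Illustrative mapping for the lowering above: "llvm.sadd.with.overflow.i32"
// becomes an X86ISD::ADD whose second result is EFLAGS, plus a setcc on
// COND_O, roughly:
//   %sum = X86ISD::ADD %a, %b      ; also produces EFLAGS
//   %ovf = setcc COND_O, EFLAGS    ; "seto" if materialized
// so a following branch on %ovf can fold into "addl; jo" during selection.
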
/// Returns true if the operand type is exactly twice the native width, and
/// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
/// Used to know whether to use cmpxchg8/16b when expanding atomic operations
/// (otherwise we leave them alone to become __sync_fetch_and_... calls).
bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
  unsigned OpWidth = MemType->getPrimitiveSizeInBits();

  if (OpWidth == 64)
    return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
  else if (OpWidth == 128)
    return Subtarget.hasCmpxchg16b();

  return false;
}

bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
  return needsCmpXchgNb(SI->getValueOperand()->getType());
}

// Note: this turns large loads into lock cmpxchg8b/16b.
// FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
  auto PTy = cast<PointerType>(LI->getPointerOperandType());
  return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
                                               : AtomicExpansionKind::None;
}

TargetLowering::AtomicExpansionKind
X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();

  // If the operand is too big, we must see if cmpxchg8/16b is available
  // and default to library calls otherwise.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
    return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                   : AtomicExpansionKind::None;
  }

  AtomicRMWInst::BinOp Op = AI->getOperation();
  switch (Op) {
  default:
    llvm_unreachable("Unknown atomic operation");
  case AtomicRMWInst::Xchg:
  case AtomicRMWInst::Add:
  case AtomicRMWInst::Sub:
    // It's better to use xadd, xsub or xchg for these in all cases.
    return AtomicExpansionKind::None;
  case AtomicRMWInst::Or:
  case AtomicRMWInst::And:
  case AtomicRMWInst::Xor:
    // If the atomicrmw's result isn't actually used, we can just add a "lock"
    // prefix to a normal instruction for these operations.
    return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
                            : AtomicExpansionKind::None;
  case AtomicRMWInst::Nand:
  case AtomicRMWInst::Max:
  case AtomicRMWInst::Min:
  case AtomicRMWInst::UMax:
  case AtomicRMWInst::UMin:
    // These always require a non-trivial set of data operations on x86.
    // We must use a cmpxchg loop.
    return AtomicExpansionKind::CmpXChg;
  }
}

LoadInst *
X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
  unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
  Type *MemType = AI->getType();
  // Accesses larger than the native width are turned into cmpxchg/libcalls,
  // so there is no benefit in turning such RMWs into loads, and it is
  // actually harmful as it introduces an mfence.
  if (MemType->getPrimitiveSizeInBits() > NativeWidth)
    return nullptr;

  auto Builder = IRBuilder<>(AI);
  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
  auto SSID = AI->getSyncScopeID();
  // We must restrict the ordering to avoid generating loads with Release or
  // ReleaseAcquire orderings.
  auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
  auto Ptr = AI->getPointerOperand();

  // Before the load we need a fence. Here is an example lifted from
  // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
  // is required:
  // Thread 0:
  //   x.store(1, relaxed);
  //   r1 = y.fetch_add(0, release);
  // Thread 1:
  //   y.fetch_add(42, acquire);
  //   r2 = x.load(relaxed);
  // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
  // lowered to just a load without a fence. A mfence flushes the store buffer,
  // making the optimization clearly correct.
  // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
  // otherwise, we might be able to be more aggressive on relaxed idempotent
  // rmw. In practice, they do not look useful, so we don't try to be
  // especially clever.
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
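
// Illustrative IR-level transformation performed above (assuming mfence is
// available):
//   %old = atomicrmw or i32* %p, i32 0 seq_cst
// becomes roughly:
//   call void @llvm.x86.sse2.mfence()
//   %old = load atomic i32, i32* %p seq_cst
// i.e. the idempotent RMW is replaced by a fence followed by an atomic load.
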
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),    // Base
      DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
      DAG.getRegister(0, MVT::i32),           // Index
      DAG.getTargetConstant(0, dl, MVT::i32), // Disp
      DAG.getRegister(0, MVT::i32),           // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}

static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
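
// Illustrative selection for the node built above: an i32 cmpxchg places the
// expected value in EAX (the CopyToReg), emits LCMPXCHG_DAG which selects to
// a "lock cmpxchg" against memory, reads the old value back from EAX, and
// derives the success flag from ZF via the COND_E setcc.
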
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}

/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain pop count for each i16 element starting from the pop count for
  // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
  // right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
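
// Worked example for the i16 path above (illustrative): if an i16 lane holds
// the byte counts [hi=2, lo=3], then (V << 8) moves lo's count into the high
// byte, the byte-wise add forms hi+lo = 5 in that high byte, and the final
// i16 shift right by 8 leaves 5 as the lane's value.
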
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every lower byte nibble in the input vector is an
  // index into an in-register pre-computed pop count table. We then split up
  // the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
  //
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};

  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
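
  // Worked example (illustrative): for the input byte 0xB4, HighNibbles holds
  // 0xB and LowNibbles holds 0x4; the PSHUFB lookups return LUT[0xB] = 3 and
  // LUT[0x4] = 1, so PopCnt's byte is 3 + 1 = 4 = popcount(0xB4).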

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}

static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.

  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
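
  // Worked example per byte (illustrative), v = 0xFF:
  //   0xFF - ((0xFF >> 1) & 0x55) = 0xFF - 0x55 = 0xAA   (pairwise counts)
  //   (0xAA & 0x33) + ((0xAA >> 2) & 0x33) = 0x22 + 0x22 = 0x44
  //   (0x44 + (0x44 >> 4)) & 0x0F = 0x48 & 0x0F = 0x08   = popcount(0xFF)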

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // type.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}

// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    if (VT == MVT::v8i16) {
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
    if (VT == MVT::v16i8 || VT == MVT::v16i16) {
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
  }

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}

static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}

static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}

static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasXOP())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};

  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
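
// Worked example (illustrative): for the byte 0x1E the low nibble is 0xE and
// the high nibble is 0x1; LoLUT[0xE] = 0x70 and HiLUT[0x1] = 0x08, so the
// final OR yields 0x78, which is 0x1E with its bits reversed.
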
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget,
                                        bool AllowIncDec = true) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();

  if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
    // Convert to inc/dec if they aren't slow or we are optimizing for size.
    if (AllowIncDec && (!Subtarget.slowIncDec() ||
                        DAG.getMachineFunction().getFunction()->optForSize())) {
      if ((NewOpc == X86ISD::LADD && C->isOne()) ||
          (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
      if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
          (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
        return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
                                       DAG.getVTList(MVT::i32, MVT::Other),
                                       {N->getOperand(0), N->getOperand(1)},
                                       /*MemVT=*/N->getSimpleValueType(0), MMO);
    }
  }

  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}
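
// Illustrative result of the lowering above: "atomicrmw add i32* %p, i32 1"
// whose value is unused becomes an X86ISD::LINC memory node that selects to
// a "lock inc" against memory (or a "lock add $1" when inc/dec are slow),
// rather than a cmpxchg loop.
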
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}

static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}

static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(Op);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
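
// Note on the carry materialization above (illustrative): adding all-ones to
// the incoming carry value sets the hardware carry flag exactly when that
// value is non-zero (e.g. 1 + 0xFF..FF wraps and sets CF; 0 + 0xFF..FF does
// not), which lets the ADC/SBB consume an i1 carry held in a GPR.
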
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}

/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
                                       DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
                                     DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
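
// Illustrative use of ExtendToType: widening a constant v2i1 mask to v8i1
// with FillWithZeroes=true yields <m0, m1, 0, 0, 0, 0, 0, 0>, while a
// non-constant input is inserted as the low subvector of an otherwise undef
// (or zero) vector via INSERT_SUBVECTOR.
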
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with index 0, 2.
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // Mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    VT = MVT::v4i32;
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors; either the data or the index
    // should be 512 bits wide. If both the index and data are 256-bit and the
    // vector contains 8 elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimum number of elements in scatter is 8.
      NumElts = 8;
      // Index
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask
      // At this point we have a promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }
  // If the mask is "wide" at this point - truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
      VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}

static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {

  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
  // VLX. Expanding loads of these types are handled below.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
              (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bit.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // Mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}

static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  // If the index is v2i32, we're being called by type legalization.
  if (IndexVT == MVT::v2i32)
    return SDValue();

  if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors; either the data or the index
    // should be 512 bits wide. If both the index and data are 256-bit and the
    // vector contains 8 elements, we just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
      SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
          DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
          N->getMemOperand());
      return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
    }

    // The minimum number of elements in a gather is 8.
    NumElts = 8;
    // Index
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point we have a promoted mask operand.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-through value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
        N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(2)};
    return DAG.getMergeValues(RetOps, dl);
  }

  SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
  SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
      DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
      N->getMemOperand());
  return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
}

24342 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
24343 SelectionDAG &DAG) const {
24344 // TODO: Eventually, the lowering of these nodes should be informed by or
24345 // deferred to the GC strategy for the function in which they appear. For
24346 // now, however, they must be lowered to something. Since they are logically
24347 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24348 // require special handling for these nodes), lower them as literal NOOPs for
24350 SmallVector<SDValue, 2> Ops;
24352 Ops.push_back(Op.getOperand(0));
24353 if (Op->getGluedNode())
24354 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24357 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24358 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24363 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24364 SelectionDAG &DAG) const {
24365 // TODO: Eventually, the lowering of these nodes should be informed by or
24366 // deferred to the GC strategy for the function in which they appear. For
24367 // now, however, they must be lowered to something. Since they are logically
24368 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24369 // require special handling for these nodes), lower them as literal NOOPs for
24370 // the time being.
24371 SmallVector<SDValue, 2> Ops;
24373 Ops.push_back(Op.getOperand(0));
24374 if (Op->getGluedNode())
24375 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24378 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24379 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24380 return NOOP;
24381 }
24384 /// Provide custom lowering hooks for some operations.
24385 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24386 switch (Op.getOpcode()) {
24387 default: llvm_unreachable("Should not custom lower this!");
24388 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24389 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24390 return LowerCMP_SWAP(Op, Subtarget, DAG);
24391 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24392 case ISD::ATOMIC_LOAD_ADD:
24393 case ISD::ATOMIC_LOAD_SUB:
24394 case ISD::ATOMIC_LOAD_OR:
24395 case ISD::ATOMIC_LOAD_XOR:
24396 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24397 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24398 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24399 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24400 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24401 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24402 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24403 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24404 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24405 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24406 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24407 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24408 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24409 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24410 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24411 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24412 case ISD::SHL_PARTS:
24413 case ISD::SRA_PARTS:
24414 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24415 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24416 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24417 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24418 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24419 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24420 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24421 case ISD::ZERO_EXTEND_VECTOR_INREG:
24422 case ISD::SIGN_EXTEND_VECTOR_INREG:
24423 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24424 case ISD::FP_TO_SINT:
24425 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24426 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24427 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24428 case ISD::FABS:
24429 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24430 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24431 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24432 case ISD::SETCC: return LowerSETCC(Op, DAG);
24433 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24434 case ISD::SELECT: return LowerSELECT(Op, DAG);
24435 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24436 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24437 case ISD::VASTART: return LowerVASTART(Op, DAG);
24438 case ISD::VAARG: return LowerVAARG(Op, DAG);
24439 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24440 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
24441 case ISD::INTRINSIC_VOID:
24442 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24443 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24444 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24445 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24446 case ISD::FRAME_TO_ARGS_OFFSET:
24447 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24448 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24449 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24450 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24451 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24452 case ISD::EH_SJLJ_SETUP_DISPATCH:
24453 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24454 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24455 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24456 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24457 case ISD::CTLZ:
24458 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24459 case ISD::CTTZ:
24460 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24461 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24462 case ISD::MULHS:
24463 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24464 case ISD::UMUL_LOHI:
24465 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24466 case ISD::ROTL:
24467 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24468 case ISD::SRA:
24469 case ISD::SRL:
24470 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24471 case ISD::SADDO:
24472 case ISD::UADDO:
24473 case ISD::SSUBO:
24474 case ISD::USUBO:
24475 case ISD::SMULO:
24476 case ISD::UMULO: return LowerXALUO(Op, DAG);
24477 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24478 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24479 case ISD::ADDCARRY:
24480 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24481 case ISD::ADD:
24482 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24483 case ISD::SMAX:
24484 case ISD::SMIN:
24485 case ISD::UMAX:
24486 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24487 case ISD::ABS: return LowerABS(Op, DAG);
24488 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24489 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24490 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24491 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24492 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24493 case ISD::GC_TRANSITION_START:
24494 return LowerGC_TRANSITION_START(Op, DAG);
24495 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24496 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24497 }
24498 }
24500 /// Places new result values for the node in Results (their number
24501 /// and types must exactly match those of the original return values of
24502 /// the node), or leaves Results empty, which indicates that the node is not
24503 /// to be custom lowered after all.
24504 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24505 SmallVectorImpl<SDValue> &Results,
24506 SelectionDAG &DAG) const {
24507 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24509 if (!Res.getNode())
24510 return;
24512 assert((N->getNumValues() <= Res->getNumValues()) &&
24513 "Lowering returned the wrong number of results!");
24515 // Place new result values based on the result number of N.
24516 // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
24517 // original node; the extra chain (the last value) should be dropped.
24518 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24519 Results.push_back(Res.getValue(I));
24520 }
24522 /// Replace a node with an illegal result type with a new node built out of
24523 /// custom code.
24524 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24525 SmallVectorImpl<SDValue>&Results,
24526 SelectionDAG &DAG) const {
24527 SDLoc dl(N);
24528 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24529 switch (N->getOpcode()) {
24530 default:
24531 llvm_unreachable("Do not know how to custom type legalize this operation!");
24532 case X86ISD::AVG: {
24533 // Legalize types for X86ISD::AVG by expanding vectors.
24534 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
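24535 // Pick the narrowest full vector register (128/256/512 bits) that the input type widens to.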
24536 auto InVT = N->getValueType(0);
24537 auto InVTSize = InVT.getSizeInBits();
24538 const unsigned RegSize =
24539 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24540 assert((Subtarget.hasBWI() || RegSize < 512) &&
24541 "512-bit vector requires AVX512BW");
24542 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24543 "256-bit vector requires AVX2");
24545 auto ElemVT = InVT.getVectorElementType();
24546 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24547 RegSize / ElemVT.getSizeInBits());
24548 assert(RegSize % InVT.getSizeInBits() == 0);
24549 unsigned NumConcat = RegSize / InVT.getSizeInBits();
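24550 // Pad each operand out to the register width with undef lanes.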
24551 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24552 Ops[0] = N->getOperand(0);
24553 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24554 Ops[0] = N->getOperand(1);
24555 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
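24556 // The AVG node itself is legal at the wider register type; perform it there.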
24557 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24558 if (!ExperimentalVectorWideningLegalization)
24559 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24560 DAG.getIntPtrConstant(0, dl));
24561 Results.push_back(Res);
24562 return;
24563 }
24564 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24565 case X86ISD::FMINC:
24566 case X86ISD::FMIN:
24567 case X86ISD::FMAXC:
24568 case X86ISD::FMAX: {
24569 EVT VT = N->getValueType(0);
24570 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24571 SDValue UNDEF = DAG.getUNDEF(VT);
24572 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24573 N->getOperand(0), UNDEF);
24574 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24575 N->getOperand(1), UNDEF);
24576 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24577 return;
24578 }
24579 case ISD::SDIV:
24580 case ISD::UDIV:
24581 case ISD::SREM:
24582 case ISD::UREM:
24583 case ISD::SDIVREM:
24584 case ISD::UDIVREM: {
24585 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24586 Results.push_back(V);
24587 return;
24588 }
24589 case ISD::FP_TO_SINT:
24590 case ISD::FP_TO_UINT: {
24591 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24593 if (N->getValueType(0) == MVT::v2i32) {
24594 assert((IsSigned || Subtarget.hasAVX512()) &&
24595 "Can only handle signed conversion without AVX512");
24596 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24597 SDValue Src = N->getOperand(0);
24598 if (Src.getValueType() == MVT::v2f64) {
24599 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24600 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24601 : X86ISD::CVTTP2UI,
24602 dl, MVT::v4i32, Src);
24603 if (!ExperimentalVectorWideningLegalization)
24604 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24605 Results.push_back(Res);
24606 return;
24607 }
24608 if (Src.getValueType() == MVT::v2f32) {
24609 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24610 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24611 DAG.getUNDEF(MVT::v2f32));
24612 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24613 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24614 if (!ExperimentalVectorWideningLegalization)
24615 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24616 Results.push_back(Res);
24617 return;
24618 }
24620 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24621 // so early out here.
24622 return;
24623 }
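24624 // Scalar conversions go through FP_TO_INTHelper, which may hand back a stack slot to load the result from.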
24625 std::pair<SDValue,SDValue> Vals =
24626 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24627 SDValue FIST = Vals.first, StackSlot = Vals.second;
24628 if (FIST.getNode()) {
24629 EVT VT = N->getValueType(0);
24630 // Return a load from the stack slot.
24631 if (StackSlot.getNode())
24632 Results.push_back(
24633 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24634 else
24635 Results.push_back(FIST);
24636 }
24637 return;
24638 }
24639 case ISD::SINT_TO_FP: {
24640 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24641 SDValue Src = N->getOperand(0);
24642 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24643 return;
24644 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24645 return;
24646 }
24647 case ISD::UINT_TO_FP: {
24648 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24649 EVT VT = N->getValueType(0);
24650 if (VT != MVT::v2f32)
24651 return;
24652 SDValue Src = N->getOperand(0);
24653 EVT SrcVT = Src.getValueType();
24654 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24655 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24656 return;
24657 }
24658 if (SrcVT != MVT::v2i32)
24659 return;
24660 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24661 SDValue VBias =
24662 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
24663 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24664 DAG.getBitcast(MVT::v2i64, VBias));
24665 Or = DAG.getBitcast(MVT::v2f64, Or);
24666 // TODO: Are there any fast-math-flags to propagate here?
24667 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24668 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
24669 return;
24670 }
24671 case ISD::FP_ROUND: {
24672 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24673 return;
24674 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24675 Results.push_back(V);
24676 return;
24677 }
24678 case ISD::FP_EXTEND: {
24679 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24680 // No other ValueType for FP_EXTEND should reach this point.
24681 assert(N->getValueType(0) == MVT::v2f32 &&
24682 "Do not know how to legalize this Node");
24685 case ISD::INTRINSIC_W_CHAIN: {
24686 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24687 switch (IntNo) {
24688 default : llvm_unreachable("Do not know how to custom type "
24689 "legalize this intrinsic operation!");
24690 case Intrinsic::x86_rdtsc:
24691 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24692 Results);
24693 case Intrinsic::x86_rdtscp:
24694 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24695 Results);
24696 case Intrinsic::x86_rdpmc:
24697 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24699 case Intrinsic::x86_xgetbv:
24700 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24701 }
24702 }
24703 case ISD::INTRINSIC_WO_CHAIN: {
24704 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
24705 Results.push_back(V);
24706 return;
24707 }
24708 case ISD::READCYCLECOUNTER: {
24709 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24710 Results);
24711 }
24712 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24713 EVT T = N->getValueType(0);
24714 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24715 bool Regs64bit = T == MVT::i128;
24716 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24717 SDValue cpInL, cpInH;
24718 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24719 DAG.getConstant(0, dl, HalfT));
24720 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24721 DAG.getConstant(1, dl, HalfT));
24722 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24723 Regs64bit ? X86::RAX : X86::EAX,
24724 cpInL, SDValue());
24725 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24726 Regs64bit ? X86::RDX : X86::EDX,
24727 cpInH, cpInL.getValue(1));
24728 SDValue swapInL, swapInH;
24729 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24730 DAG.getConstant(0, dl, HalfT));
24731 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24732 DAG.getConstant(1, dl, HalfT));
24733 swapInH =
24734 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24735 swapInH, cpInH.getValue(1));
24736 // If the current function needs the base pointer, RBX,
24737 // we shouldn't use cmpxchg directly.
24738 // The lowering of that instruction will clobber that register, and since
24739 // RBX will be a reserved register, the register allocator will not make
24740 // sure its value is properly saved and restored around this live-range.
24742 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24743 SDValue Result;
24744 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24745 unsigned BasePtr = TRI->getBaseRegister();
24746 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24747 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24748 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24749 // ISel prefers the LCMPXCHG64 variant.
24750 // If that assert breaks, that means it is not the case anymore,
24751 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24752 // not just EBX. This is a matter of accepting i64 input for that
24753 // pseudo, and restoring into the register of the right width
24754 // in expand pseudo. Everything else should just work.
24755 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24756 "Saving only half of the RBX");
24757 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24758 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24759 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24760 Regs64bit ? X86::RBX : X86::EBX,
24761 HalfT, swapInH.getValue(1));
24762 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24763 RBXSave,
24764 /*Glue*/ RBXSave.getValue(2)};
24765 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24766 } else {
24767 unsigned Opcode =
24768 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24769 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24770 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24771 swapInH.getValue(1));
24772 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24773 swapInL.getValue(1)};
24774 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24775 }
24776 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24777 Regs64bit ? X86::RAX : X86::EAX,
24778 HalfT, Result.getValue(1));
24779 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24780 Regs64bit ? X86::RDX : X86::EDX,
24781 HalfT, cpOutL.getValue(2));
24782 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
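24783 // The ZF flag from the cmpxchg (tested via COND_E below) tells us whether the exchange succeeded.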
24784 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24785 MVT::i32, cpOutH.getValue(2));
24786 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24787 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
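24788 // Results: the reassembled wide value, the success bit, and the output chain.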
24789 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24790 Results.push_back(Success);
24791 Results.push_back(EFLAGS.getValue(1));
24792 return;
24793 }
24794 case ISD::ATOMIC_SWAP:
24795 case ISD::ATOMIC_LOAD_ADD:
24796 case ISD::ATOMIC_LOAD_SUB:
24797 case ISD::ATOMIC_LOAD_AND:
24798 case ISD::ATOMIC_LOAD_OR:
24799 case ISD::ATOMIC_LOAD_XOR:
24800 case ISD::ATOMIC_LOAD_NAND:
24801 case ISD::ATOMIC_LOAD_MIN:
24802 case ISD::ATOMIC_LOAD_MAX:
24803 case ISD::ATOMIC_LOAD_UMIN:
24804 case ISD::ATOMIC_LOAD_UMAX:
24805 case ISD::ATOMIC_LOAD: {
24806 // Delegate to generic TypeLegalization. Situations we can really handle
24807 // should have already been dealt with by AtomicExpandPass.cpp.
24808 break;
24809 }
24810 case ISD::BITCAST: {
24811 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24812 EVT DstVT = N->getValueType(0);
24813 EVT SrcVT = N->getOperand(0)->getValueType(0);
24815 if (SrcVT != MVT::f64 ||
24816 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24817 return;
24819 unsigned NumElts = DstVT.getVectorNumElements();
24820 EVT SVT = DstVT.getVectorElementType();
24821 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24822 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24823 MVT::v2f64, N->getOperand(0));
24824 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
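24825 // Under widening legalization the wide integer vector is already the legal result type.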
24826 if (ExperimentalVectorWideningLegalization) {
24827 // If we are legalizing vectors by widening, we already have the desired
24828 // legal vector type, just return it.
24829 Results.push_back(ToVecInt);
24830 return;
24831 }
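24832 // Otherwise extract each element and rebuild the narrow vector the caller expects.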
24833 SmallVector<SDValue, 8> Elts;
24834 for (unsigned i = 0, e = NumElts; i != e; ++i)
24835 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24836 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24838 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24839 return;
24840 }
24841 case ISD::MGATHER: {
24842 EVT VT = N->getValueType(0);
24843 if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24844 auto *Gather = cast<MaskedGatherSDNode>(N);
24845 SDValue Index = Gather->getIndex();
24846 if (Index.getValueType() != MVT::v2i64)
24847 return;
24848 SDValue Mask = Gather->getMask();
24849 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24850 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24851 Gather->getValue(),
24852 DAG.getUNDEF(MVT::v2f32));
24853 if (!Subtarget.hasVLX()) {
24854 // We need to widen the mask, but the instruction will only use 2
24855 // of its elements. So we can use undef.
24856 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24857 DAG.getUNDEF(MVT::v2i1));
24858 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
24859 }
24860 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24861 Index };
24862 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24863 DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
24864 Gather->getMemoryVT(), Gather->getMemOperand());
24865 Results.push_back(Res);
24866 Results.push_back(Res.getValue(2));
24867 return;
24868 }
24869 if (VT == MVT::v2i32) {
24870 auto *Gather = cast<MaskedGatherSDNode>(N);
24871 SDValue Index = Gather->getIndex();
24872 SDValue Mask = Gather->getMask();
24873 assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
24874 SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
24875 Gather->getValue(),
24876 DAG.getUNDEF(MVT::v2i32));
24877 // If the index is v2i64 we can use it directly.
24878 if (Index.getValueType() == MVT::v2i64 &&
24879 (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
24880 if (!Subtarget.hasVLX()) {
24881 // We need to widen the mask, but the instruction will only use 2
24882 // of its elements. So we can use undef.
24883 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24884 DAG.getUNDEF(MVT::v2i1));
24885 Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
24886 }
24887 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24888 Index };
24889 SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
24890 DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
24891 Gather->getMemoryVT(), Gather->getMemOperand());
24892 SDValue Chain = Res.getValue(2);
24893 if (!ExperimentalVectorWideningLegalization)
24894 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
24895 DAG.getIntPtrConstant(0, dl));
24896 Results.push_back(Res);
24897 Results.push_back(Chain);
24898 return;
24899 }
24900 EVT IndexVT = Index.getValueType();
24901 EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
24902 IndexVT.getScalarType(), 4);
24903 // Otherwise we need to custom widen everything to avoid promotion.
24904 Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
24905 DAG.getUNDEF(IndexVT));
24906 Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
24907 DAG.getConstant(0, dl, MVT::v2i1));
24908 SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
24909 Index };
24910 SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
24911 Gather->getMemoryVT(), dl, Ops,
24912 Gather->getMemOperand());
24913 SDValue Chain = Res.getValue(1);
24914 if (!ExperimentalVectorWideningLegalization)
24915 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
24916 DAG.getIntPtrConstant(0, dl));
24917 Results.push_back(Res);
24918 Results.push_back(Chain);
24919 return;
24920 }
24921 break;
24922 }
24923 }
24924 }
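24925 // Maps an X86ISD opcode to a human-readable name for debug output; unhandled opcodes yield nullptr.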
24926 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24927 switch ((X86ISD::NodeType)Opcode) {
24928 case X86ISD::FIRST_NUMBER: break;
24929 case X86ISD::BSF: return "X86ISD::BSF";
24930 case X86ISD::BSR: return "X86ISD::BSR";
24931 case X86ISD::SHLD: return "X86ISD::SHLD";
24932 case X86ISD::SHRD: return "X86ISD::SHRD";
24933 case X86ISD::FAND: return "X86ISD::FAND";
24934 case X86ISD::FANDN: return "X86ISD::FANDN";
24935 case X86ISD::FOR: return "X86ISD::FOR";
24936 case X86ISD::FXOR: return "X86ISD::FXOR";
24937 case X86ISD::FILD: return "X86ISD::FILD";
24938 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24939 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24940 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24941 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24942 case X86ISD::FLD: return "X86ISD::FLD";
24943 case X86ISD::FST: return "X86ISD::FST";
24944 case X86ISD::CALL: return "X86ISD::CALL";
24945 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24946 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24947 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24948 case X86ISD::BT: return "X86ISD::BT";
24949 case X86ISD::CMP: return "X86ISD::CMP";
24950 case X86ISD::COMI: return "X86ISD::COMI";
24951 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24952 case X86ISD::CMPM: return "X86ISD::CMPM";
24953 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24954 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24955 case X86ISD::SETCC: return "X86ISD::SETCC";
24956 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24957 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24958 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24959 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24960 case X86ISD::CMOV: return "X86ISD::CMOV";
24961 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24962 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24963 case X86ISD::IRET: return "X86ISD::IRET";
24964 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24965 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24966 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24967 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24968 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24969 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24970 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24971 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24972 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24973 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24974 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24975 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24976 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24977 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24978 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24979 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24980 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24981 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24982 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24983 case X86ISD::HADD: return "X86ISD::HADD";
24984 case X86ISD::HSUB: return "X86ISD::HSUB";
24985 case X86ISD::FHADD: return "X86ISD::FHADD";
24986 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24987 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24988 case X86ISD::FMAX: return "X86ISD::FMAX";
24989 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24990 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24991 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24992 case X86ISD::FMIN: return "X86ISD::FMIN";
24993 case X86ISD::FMINS: return "X86ISD::FMINS";
24994 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24995 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24996 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24997 case X86ISD::FMINC: return "X86ISD::FMINC";
24998 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24999 case X86ISD::FRCP: return "X86ISD::FRCP";
25000 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
25001 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
25002 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
25003 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
25004 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
25005 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
25006 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
25007 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
25008 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
25009 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
25010 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
25011 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
25012 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
25013 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
25014 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
25015 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
25016 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
25017 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
25018 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
25019 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
25020 case X86ISD::LADD: return "X86ISD::LADD";
25021 case X86ISD::LSUB: return "X86ISD::LSUB";
25022 case X86ISD::LOR: return "X86ISD::LOR";
25023 case X86ISD::LXOR: return "X86ISD::LXOR";
25024 case X86ISD::LAND: return "X86ISD::LAND";
25025 case X86ISD::LINC: return "X86ISD::LINC";
25026 case X86ISD::LDEC: return "X86ISD::LDEC";
25027 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
25028 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
25029 case X86ISD::VZEXT: return "X86ISD::VZEXT";
25030 case X86ISD::VSEXT: return "X86ISD::VSEXT";
25031 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
25032 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
25033 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
25034 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
25035 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
25036 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
25037 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
25038 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
25039 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
25040 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
25041 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
25042 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
25043 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
25044 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
25045 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
25046 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
25047 case X86ISD::VSHL: return "X86ISD::VSHL";
25048 case X86ISD::VSRL: return "X86ISD::VSRL";
25049 case X86ISD::VSRA: return "X86ISD::VSRA";
25050 case X86ISD::VSHLI: return "X86ISD::VSHLI";
25051 case X86ISD::VSRLI: return "X86ISD::VSRLI";
25052 case X86ISD::VSRAI: return "X86ISD::VSRAI";
25053 case X86ISD::VSRAV: return "X86ISD::VSRAV";
25054 case X86ISD::VROTLI: return "X86ISD::VROTLI";
25055 case X86ISD::VROTRI: return "X86ISD::VROTRI";
25056 case X86ISD::VPPERM: return "X86ISD::VPPERM";
25057 case X86ISD::CMPP: return "X86ISD::CMPP";
25058 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
25059 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
25060 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
25061 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
25062 case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
25063 case X86ISD::ADD: return "X86ISD::ADD";
25064 case X86ISD::SUB: return "X86ISD::SUB";
25065 case X86ISD::ADC: return "X86ISD::ADC";
25066 case X86ISD::SBB: return "X86ISD::SBB";
25067 case X86ISD::SMUL: return "X86ISD::SMUL";
25068 case X86ISD::UMUL: return "X86ISD::UMUL";
25069 case X86ISD::SMUL8: return "X86ISD::SMUL8";
25070 case X86ISD::UMUL8: return "X86ISD::UMUL8";
25071 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
25072 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
25073 case X86ISD::INC: return "X86ISD::INC";
25074 case X86ISD::DEC: return "X86ISD::DEC";
25075 case X86ISD::OR: return "X86ISD::OR";
25076 case X86ISD::XOR: return "X86ISD::XOR";
25077 case X86ISD::AND: return "X86ISD::AND";
25078 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
25079 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
25080 case X86ISD::PTEST: return "X86ISD::PTEST";
25081 case X86ISD::TESTP: return "X86ISD::TESTP";
25082 case X86ISD::TESTM: return "X86ISD::TESTM";
25083 case X86ISD::TESTNM: return "X86ISD::TESTNM";
25084 case X86ISD::KORTEST: return "X86ISD::KORTEST";
25085 case X86ISD::KTEST: return "X86ISD::KTEST";
25086 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
25087 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
25088 case X86ISD::PACKSS: return "X86ISD::PACKSS";
25089 case X86ISD::PACKUS: return "X86ISD::PACKUS";
25090 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
25091 case X86ISD::VALIGN: return "X86ISD::VALIGN";
25092 case X86ISD::VSHLD: return "X86ISD::VSHLD";
25093 case X86ISD::VSHRD: return "X86ISD::VSHRD";
25094 case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
25095 case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
25096 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
25097 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
25098 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
25099 case X86ISD::SHUFP: return "X86ISD::SHUFP";
25100 case X86ISD::SHUF128: return "X86ISD::SHUF128";
25101 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
25102 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
25103 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
25104 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
25105 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
25106 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
25107 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
25108 case X86ISD::MOVSD: return "X86ISD::MOVSD";
25109 case X86ISD::MOVSS: return "X86ISD::MOVSS";
25110 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
25111 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
25112 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
25113 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
25114 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
25115 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
25116 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
25117 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
25118 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
25119 case X86ISD::VPERMV: return "X86ISD::VPERMV";
25120 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
25121 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
25122 case X86ISD::VPERMI: return "X86ISD::VPERMI";
25123 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
25124 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
25125 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
25126 case X86ISD::VRANGE: return "X86ISD::VRANGE";
25127 case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
25128 case X86ISD::VRANGES: return "X86ISD::VRANGES";
25129 case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
25130 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
25131 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
25132 case X86ISD::PSADBW: return "X86ISD::PSADBW";
25133 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
25134 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
25135 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
25136 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
25137 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
25138 case X86ISD::MFENCE: return "X86ISD::MFENCE";
25139 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
25140 case X86ISD::SAHF: return "X86ISD::SAHF";
25141 case X86ISD::RDRAND: return "X86ISD::RDRAND";
25142 case X86ISD::RDSEED: return "X86ISD::RDSEED";
25143 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
25144 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
25145 case X86ISD::VPSHA: return "X86ISD::VPSHA";
25146 case X86ISD::VPSHL: return "X86ISD::VPSHL";
25147 case X86ISD::VPCOM: return "X86ISD::VPCOM";
25148 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
25149 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
25150 case X86ISD::FMSUB: return "X86ISD::FMSUB";
25151 case X86ISD::FNMADD: return "X86ISD::FNMADD";
25152 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
25153 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
25154 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
25155 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
25156 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
25157 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
25158 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
25159 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
25160 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
25161 case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
25162 case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
25163 case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
25164 case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
25165 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
25166 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
25167 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
25168 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
25169 case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
25170 case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
25171 case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
25172 case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
25173 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
25174 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
25175 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
25176 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
25177 case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
25178 case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
25179 case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
25180 case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
25181 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
25182 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
25183 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
25184 case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
25185 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
25186 case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
25187 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
25188 case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
25189 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
25190 case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
25191 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
25192 case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
25193 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
25194 case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
25195 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
25196 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
25197 case X86ISD::XTEST: return "X86ISD::XTEST";
25198 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
25199 case X86ISD::EXPAND: return "X86ISD::EXPAND";
25200 case X86ISD::SELECT: return "X86ISD::SELECT";
25201 case X86ISD::SELECTS: return "X86ISD::SELECTS";
25202 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
25203 case X86ISD::RCP14: return "X86ISD::RCP14";
25204 case X86ISD::RCP14S: return "X86ISD::RCP14S";
25205 case X86ISD::RCP28: return "X86ISD::RCP28";
25206 case X86ISD::RCP28S: return "X86ISD::RCP28S";
25207 case X86ISD::EXP2: return "X86ISD::EXP2";
25208 case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
25209 case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
25210 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
25211 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
25212 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
25213 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
25214 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
25215 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
25216 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
25217 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
25218 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
25219 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
25220 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
25221 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
25222 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
25223 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
25224 case X86ISD::SCALEF: return "X86ISD::SCALEF";
25225 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
25226 case X86ISD::ADDS: return "X86ISD::ADDS";
25227 case X86ISD::SUBS: return "X86ISD::SUBS";
25228 case X86ISD::AVG: return "X86ISD::AVG";
25229 case X86ISD::MULHRS: return "X86ISD::MULHRS";
25230 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
25231 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
25232 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
25233 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
25234 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
25235 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
25236 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
25237 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
25238 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
25239 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
25240 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
25241 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
25242 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
25243 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
25244 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
25245 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
25246 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
25247 case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
25248 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
25249 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
25250 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
25251 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
25252 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
25253 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
25254 case X86ISD::LWPINS: return "X86ISD::LWPINS";
25255 case X86ISD::MGATHER: return "X86ISD::MGATHER";
25256 case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
25257 case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
25258 case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
25259 case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
25260 case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
25261 case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
25262 case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
25263 case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
25264 case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
25265 }
25266 return nullptr;
25267 }
25269 /// Return true if the addressing mode represented by AM is legal for this
25270 /// target, for a load/store of the specified type.
25271 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
25272 const AddrMode &AM, Type *Ty,
25273 unsigned AS,
25274 Instruction *I) const {
25275 // X86 supports extremely general addressing modes.
25276 CodeModel::Model M = getTargetMachine().getCodeModel();
25278 // X86 allows a sign-extended 32-bit immediate field as a displacement.
25279 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
25280 return false;
25282 if (AM.BaseGV) {
25283 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
25285 // If a reference to this global requires an extra load, we can't fold it.
25286 if (isGlobalStubReference(GVFlags))
25287 return false;
25289 // If BaseGV requires a register for the PIC base, we cannot also have a
25290 // BaseReg specified.
25291 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
25292 return false;
25294 // If lower 4G is not available, then we must use rip-relative addressing.
25295 if ((M != CodeModel::Small || isPositionIndependent()) &&
25296 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
25297 return false;
25298 }
25300 switch (AM.Scale) {
25301 case 0:
25302 case 1:
25303 case 2:
25304 case 4:
25305 case 8:
25306 // These scales always work.
25307 break;
25308 case 3:
25309 case 5:
25310 case 9:
25311 // These scales are formed with basereg+scalereg. Only accept if there is
25312 // no basereg yet.
25313 if (AM.HasBaseReg)
25314 return false;
25315 break;
25316 default: // Other stuff never works.
25317 return false;
25318 }
25319 return true;
25320 }
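25321 // E.g. Scale == 9 is only encodable as base + 8*index with base == index
25322 // (lea (%rax,%rax,8)), which is why it requires the base-register slot to be free.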
25323 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
25324 unsigned Bits = Ty->getScalarSizeInBits();
25326 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
25327 // particularly cheaper than those without.
25328 if (Bits == 8)
25329 return false;
25331 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
25332 // variable shifts just as cheap as scalar ones.
25333 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
25334 return false;
25336 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
25337 // fully general vector.
25338 return true;
25339 }
25341 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
25342 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25343 return false;
25344 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
25345 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
25346 return NumBits1 > NumBits2;
25347 }
25349 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
25350 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
25351 return false;
25353 if (!isTypeLegal(EVT::getEVT(Ty1)))
25354 return false;
25356 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
25358 // Assuming the caller doesn't have a zeroext or signext return parameter,
25359 // truncation all the way down to i1 is valid.
25360 return true;
25361 }
25363 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
25364 return isInt<32>(Imm);
25365 }
25367 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
25368 // Can also use sub to handle negated immediates.
25369 return isInt<32>(Imm);
25370 }
25372 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
25373 if (!VT1.isInteger() || !VT2.isInteger())
25374 return false;
25375 unsigned NumBits1 = VT1.getSizeInBits();
25376 unsigned NumBits2 = VT2.getSizeInBits();
25377 return NumBits1 > NumBits2;
25378 }
25380 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
25381 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25382 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
25383 }
25385 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
25386 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
25387 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
25388 }
25390 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
25391 EVT VT1 = Val.getValueType();
25392 if (isZExtFree(VT1, VT2))
25393 return true;
25395 if (Val.getOpcode() != ISD::LOAD)
25396 return false;
25398 if (!VT1.isSimple() || !VT1.isInteger() ||
25399 !VT2.isSimple() || !VT2.isInteger())
25400 return false;
25402 switch (VT1.getSimpleVT().SimpleTy) {
25403 default: break;
25404 case MVT::i8:
25405 case MVT::i16:
25406 case MVT::i32:
25407 // X86 has 8, 16, and 32-bit zero-extending loads.
25408 return true;
25409 }
25410 return false;
25411 }
25414 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
25416 bool
25417 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
25418 if (!Subtarget.hasAnyFMA())
25419 return false;
25421 VT = VT.getScalarType();
25423 if (!VT.isSimple())
25424 return false;
25426 switch (VT.getSimpleVT().SimpleTy) {
25427 case MVT::f32:
25428 case MVT::f64:
25429 return true;
25430 default:
25431 break;
25432 }
25433 return false;
25434 }
25437 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
25438 // i16 instructions are longer (0x66 prefix) and potentially slower.
25439 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
25440 }
25442 /// Targets can use this to indicate that they only support *some*
25443 /// VECTOR_SHUFFLE operations, those with specific masks.
25444 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
25445 /// are assumed to be legal.
25446 bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
25447 if (!VT.isSimple())
25448 return false;
25450 // Not for i1 vectors
25451 if (VT.getSimpleVT().getScalarType() == MVT::i1)
25452 return false;
25454 // Very little shuffling can be done for 64-bit vectors right now.
25455 if (VT.getSimpleVT().getSizeInBits() == 64)
25456 return false;
25458 // We only care that the types being shuffled are legal. The lowering can
25459 // handle any possible shuffle mask that results.
25460 return isTypeLegal(VT.getSimpleVT());
25461 }
25463 bool
25464 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
25465 EVT VT) const {
25466 // Just delegate to the generic legality, clear masks aren't special.
25467 return isShuffleMaskLegal(Mask, VT);
25468 }
25470 //===----------------------------------------------------------------------===//
25471 // X86 Scheduler Hooks
25472 //===----------------------------------------------------------------------===//
25474 /// Utility function to emit xbegin specifying the start of an RTM region.
25475 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25476 const TargetInstrInfo *TII) {
25477 DebugLoc DL = MI.getDebugLoc();
25479 const BasicBlock *BB = MBB->getBasicBlock();
25480 MachineFunction::iterator I = ++MBB->getIterator();
25482 // For the v = xbegin(), we generate
25483 //
25484 // thisMBB:
25485 //  xbegin sinkMBB
25486 //
25487 // mainMBB:
25488 //  s0 = -1
25489 //
25490 // fallBB:
25491 //  eax = # XABORT_DEF
25492 //  s1 = eax
25493 //
25494 // sinkMBB:
25495 //  v = phi(s0/mainBB, s1/fallBB)
25497 MachineBasicBlock *thisMBB = MBB;
25498 MachineFunction *MF = MBB->getParent();
25499 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25500 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25501 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25502 MF->insert(I, mainMBB);
25503 MF->insert(I, fallMBB);
25504 MF->insert(I, sinkMBB);
25506 // Transfer the remainder of BB and its successor edges to sinkMBB.
25507 sinkMBB->splice(sinkMBB->begin(), MBB,
25508 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25509 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25511 MachineRegisterInfo &MRI = MF->getRegInfo();
25512 unsigned DstReg = MI.getOperand(0).getReg();
25513 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25514 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25515 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25517 // thisMBB:
25518 //  xbegin fallMBB
25519 //  # fallthrough to mainMBB
25520 //  # abort to fallMBB
25521 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25522 thisMBB->addSuccessor(mainMBB);
25523 thisMBB->addSuccessor(fallMBB);
25525 // mainMBB:
25526 //  mainDstReg := -1
25527 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25528 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25529 mainMBB->addSuccessor(sinkMBB);
25531 // fallMBB:
25532 //  ; pseudo instruction to model hardware's definition from XABORT
25533 // EAX := XABORT_DEF
25534 // fallDstReg := EAX
25535 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25536 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25537 .addReg(X86::EAX);
25538 fallMBB->addSuccessor(sinkMBB);
25540 // sinkMBB:
25541 //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25542 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25543 .addReg(mainDstReg).addMBB(mainMBB)
25544 .addReg(fallDstReg).addMBB(fallMBB);
25546 MI.eraseFromParent();
25547 return sinkMBB;
25548 }
25550 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25551 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25552 // in the .td file.
25553 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25554 const TargetInstrInfo *TII) {
25555 unsigned Opc;
25556 switch (MI.getOpcode()) {
25557 default: llvm_unreachable("illegal opcode!");
25558 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25559 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25560 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25561 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25562 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25563 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25564 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25565 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25566 }
25568 DebugLoc dl = MI.getDebugLoc();
25569 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25571 unsigned NumArgs = MI.getNumOperands();
25572 for (unsigned i = 1; i < NumArgs; ++i) {
25573 MachineOperand &Op = MI.getOperand(i);
25574 if (!(Op.isReg() && Op.isImplicit()))
25575 MIB.add(Op);
25576 }
25577 if (MI.hasOneMemOperand())
25578 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25580 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25581 .addReg(X86::XMM0);
25583 MI.eraseFromParent();
25584 return BB;
25585 }
25587 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25588 // defs in an instruction pattern
25589 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25590 const TargetInstrInfo *TII) {
25591 unsigned Opc;
25592 switch (MI.getOpcode()) {
25593 default: llvm_unreachable("illegal opcode!");
25594 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25595 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25596 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25597 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25598 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25599 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25600 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25601 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25602 }
25604 DebugLoc dl = MI.getDebugLoc();
25605 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25607 unsigned NumArgs = MI.getNumOperands(); // remove the results
25608 for (unsigned i = 1; i < NumArgs; ++i) {
25609 MachineOperand &Op = MI.getOperand(i);
25610 if (!(Op.isReg() && Op.isImplicit()))
25611 MIB.add(Op);
25612 }
25613 if (MI.hasOneMemOperand())
25614 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25616 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25617 .addReg(X86::ECX);
25619 MI.eraseFromParent();
25620 return BB;
25621 }
25623 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25624 const X86Subtarget &Subtarget) {
25625 DebugLoc dl = MI.getDebugLoc();
25626 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25628 // insert input VAL into EAX
25629 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25630 .addReg(MI.getOperand(0).getReg());
25631 // insert zero to ECX
25632 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25634 // insert zero to EDX
25635 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25637 // insert WRPKRU instruction
25638 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25640 MI.eraseFromParent(); // The pseudo is gone now.
25641 return BB;
25642 }
25644 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25645 const X86Subtarget &Subtarget) {
25646 DebugLoc dl = MI.getDebugLoc();
25647 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25649 // insert zero to ECX
25650 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25652 // insert RDPKRU instruction
25653 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25654 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25655 .addReg(X86::EAX);
25657 MI.eraseFromParent(); // The pseudo is gone now.
25658 return BB;
25659 }
25661 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25662 const X86Subtarget &Subtarget,
25663 unsigned Opc) {
25664 DebugLoc dl = MI.getDebugLoc();
25665 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25666 // Address into RAX/EAX, other two args into ECX, EDX.
25667 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25668 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25669 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25670 for (int i = 0; i < X86::AddrNumOperands; ++i)
25671 MIB.add(MI.getOperand(i));
25673 unsigned ValOps = X86::AddrNumOperands;
25674 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25675 .addReg(MI.getOperand(ValOps).getReg());
25676 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25677 .addReg(MI.getOperand(ValOps + 1).getReg());
25679 // The instruction doesn't actually take any operands though.
25680 BuildMI(*BB, MI, dl, TII->get(Opc));
25682 MI.eraseFromParent(); // The pseudo is gone now.
25683 return BB;
25684 }
25686 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25687 const X86Subtarget &Subtarget) {
25688 DebugLoc dl = MI->getDebugLoc();
25689 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25690 // Address into RAX/EAX
25691 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25692 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25693 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25694 for (int i = 0; i < X86::AddrNumOperands; ++i)
25695 MIB.add(MI->getOperand(i));
25697 // The instruction doesn't actually take any operands though.
25698 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25700 MI->eraseFromParent(); // The pseudo is gone now.
25701 return BB;
25702 }
25706 MachineBasicBlock *
25707 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25708 MachineBasicBlock *MBB) const {
25709 // Emit va_arg instruction on X86-64.
25711 // Operands to this pseudo-instruction:
25712 // 0 ) Output : destination address (reg)
25713 // 1-5) Input : va_list address (addr, i64mem)
25714 // 6 ) ArgSize : Size (in bytes) of vararg type
25715 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25716 // 8 ) Align : Alignment of type
25717 // 9 ) EFLAGS (implicit-def)
25719 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25720 static_assert(X86::AddrNumOperands == 5,
25721 "VAARG_64 assumes 5 address operands");
25723 unsigned DestReg = MI.getOperand(0).getReg();
25724 MachineOperand &Base = MI.getOperand(1);
25725 MachineOperand &Scale = MI.getOperand(2);
25726 MachineOperand &Index = MI.getOperand(3);
25727 MachineOperand &Disp = MI.getOperand(4);
25728 MachineOperand &Segment = MI.getOperand(5);
25729 unsigned ArgSize = MI.getOperand(6).getImm();
25730 unsigned ArgMode = MI.getOperand(7).getImm();
25731 unsigned Align = MI.getOperand(8).getImm();
25733 // Memory Reference
25734 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25735 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25736 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25738 // Machine Information
25739 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25740 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25741 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25742 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25743 DebugLoc DL = MI.getDebugLoc();
25745 // struct va_list {
25746 //   i32   gp_offset
25747 //   i32   fp_offset
25748 //   i64   overflow_area (address)
25749 //   i64   reg_save_area (address)
25750 // }
25751 // sizeof(va_list) = 24
25752 // alignment(va_list) = 8
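// Illustrative sketch (an addition, not upstream code): the control flow
// constructed below mirrors the SysV AMD64 va_arg algorithm, roughly, for a
// GP-class argument:
//
//   void *va_arg_gp(struct va_list_t *ap) {   // va_list_t is hypothetical
//     if (ap->gp_offset < 6 * 8) {            // offsetMBB path
//       void *p = (char *)ap->reg_save_area + ap->gp_offset;
//       ap->gp_offset += 8;
//       return p;
//     }
//     void *p = ap->overflow_area;            // overflowMBB path
//     ap->overflow_area = (char *)p + 8;      // ArgSizeA8 for an i64
//     return p;
//   }
//
// FP-class arguments test fp_offset against 6*8 + 8*16 and step by 16.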
25754 unsigned TotalNumIntRegs = 6;
25755 unsigned TotalNumXMMRegs = 8;
25756 bool UseGPOffset = (ArgMode == 1);
25757 bool UseFPOffset = (ArgMode == 2);
25758 unsigned MaxOffset = TotalNumIntRegs * 8 +
25759 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
25761 /* Align ArgSize to a multiple of 8 */
25762 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25763 bool NeedsAlign = (Align > 8);
25765 MachineBasicBlock *thisMBB = MBB;
25766 MachineBasicBlock *overflowMBB;
25767 MachineBasicBlock *offsetMBB;
25768 MachineBasicBlock *endMBB;
25770 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25771 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25772 unsigned OffsetReg = 0;
25774 if (!UseGPOffset && !UseFPOffset) {
25775 // If we only pull from the overflow region, we don't create a branch.
25776 // We don't need to alter control flow.
25777 OffsetDestReg = 0; // unused
25778 OverflowDestReg = DestReg;
25780 offsetMBB = nullptr;
25781 overflowMBB = thisMBB;
25782 endMBB = thisMBB;
25783 } else {
25784 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25785 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25786 // If not, pull from overflow_area. (branch to overflowMBB)
25787 //
25788 //       thisMBB
25789 //         |     .
25790 //         |        .
25791 //     offsetMBB   overflowMBB
25792 //         |        .
25793 //         |     .
25794 //        endMBB
25796 // Registers for the PHI in endMBB
25797 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25798 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25800 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25801 MachineFunction *MF = MBB->getParent();
25802 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25803 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25804 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25806 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25808 // Insert the new basic blocks
25809 MF->insert(MBBIter, offsetMBB);
25810 MF->insert(MBBIter, overflowMBB);
25811 MF->insert(MBBIter, endMBB);
25813 // Transfer the remainder of MBB and its successor edges to endMBB.
25814 endMBB->splice(endMBB->begin(), thisMBB,
25815 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25816 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25818 // Make offsetMBB and overflowMBB successors of thisMBB
25819 thisMBB->addSuccessor(offsetMBB);
25820 thisMBB->addSuccessor(overflowMBB);
25822 // endMBB is a successor of both offsetMBB and overflowMBB
25823 offsetMBB->addSuccessor(endMBB);
25824 overflowMBB->addSuccessor(endMBB);
25826 // Load the offset value into a register
25827 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25828 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25829 .add(Base)
25830 .add(Scale)
25831 .add(Index)
25832 .addDisp(Disp, UseFPOffset ? 4 : 0)
25833 .add(Segment)
25834 .setMemRefs(MMOBegin, MMOEnd);
25836 // Check if there is enough room left to pull this argument.
25837 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25838 .addReg(OffsetReg)
25839 .addImm(MaxOffset + 8 - ArgSizeA8);
25841 // Branch to "overflowMBB" if offset >= max
25842 // Fall through to "offsetMBB" otherwise
25843 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25844 .addMBB(overflowMBB);
25845 }
25847 // In offsetMBB, emit code to use the reg_save_area.
25848 if (offsetMBB) {
25849 assert(OffsetReg != 0);
25851 // Read the reg_save_area address.
25852 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25853 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25854 .add(Base)
25855 .add(Scale)
25856 .add(Index)
25857 .addDisp(Disp, 16)
25858 .add(Segment)
25859 .setMemRefs(MMOBegin, MMOEnd);
25861 // Zero-extend the offset
25862 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25863 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25864 .addImm(0)
25865 .addReg(OffsetReg)
25866 .addImm(X86::sub_32bit);
25868 // Add the offset to the reg_save_area to get the final address.
25869 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25870 .addReg(OffsetReg64)
25871 .addReg(RegSaveReg);
25873 // Compute the offset for the next argument
25874 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25875 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25876 .addReg(OffsetReg)
25877 .addImm(UseFPOffset ? 16 : 8);
25879 // Store it back into the va_list.
25880 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25881 .add(Base)
25882 .add(Scale)
25883 .add(Index)
25884 .addDisp(Disp, UseFPOffset ? 4 : 0)
25885 .add(Segment)
25886 .addReg(NextOffsetReg)
25887 .setMemRefs(MMOBegin, MMOEnd);
25889 // Jump to endMBB
25890 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25891 .addMBB(endMBB);
25892 }
25895 // Emit code to use overflow area
25898 // Load the overflow_area address into a register.
25899 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25900 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25901 .add(Base)
25902 .add(Scale)
25903 .add(Index)
25904 .addDisp(Disp, 8)
25905 .add(Segment)
25906 .setMemRefs(MMOBegin, MMOEnd);
25908 // If we need to align it, do so. Otherwise, just copy the address
25909 // to OverflowDestReg.
25910 if (NeedsAlign) {
25911 // Align the overflow address
25912 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25913 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25915 // aligned_addr = (addr + (align-1)) & ~(align-1)
25916 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25917 .addReg(OverflowAddrReg)
25918 .addImm(Align-1);
25920 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25921 .addReg(TmpReg)
25922 .addImm(~(uint64_t)(Align-1));
25923 } else {
25924 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25925 .addReg(OverflowAddrReg);
25926 }
25928 // Compute the next overflow address after this argument.
25929 // (the overflow address should be kept 8-byte aligned)
25930 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25931 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25932 .addReg(OverflowDestReg)
25933 .addImm(ArgSizeA8);
25935 // Store the new overflow address.
25936 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25937 .add(Base)
25938 .add(Scale)
25939 .add(Index)
25940 .addDisp(Disp, 8)
25941 .add(Segment)
25942 .addReg(NextAddrReg)
25943 .setMemRefs(MMOBegin, MMOEnd);
25945 // If we branched, emit the PHI to the front of endMBB.
25946 if (offsetMBB) {
25947 BuildMI(*endMBB, endMBB->begin(), DL,
25948 TII->get(X86::PHI), DestReg)
25949 .addReg(OffsetDestReg).addMBB(offsetMBB)
25950 .addReg(OverflowDestReg).addMBB(overflowMBB);
25951 }
25953 // Erase the pseudo instruction
25954 MI.eraseFromParent();
25956 return endMBB;
25957 }
25959 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25960 MachineInstr &MI, MachineBasicBlock *MBB) const {
25961 // Emit code to save XMM registers to the stack. The ABI says that the
25962 // number of registers to save is given in %al, so it's theoretically
25963 // possible to do an indirect jump trick to avoid saving all of them,
25964 // however this code takes a simpler approach and just executes all
25965 // of the stores if %al is non-zero. It's less code, and it's probably
25966 // easier on the hardware branch predictor, and stores aren't all that
25967 // expensive anyway.
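// Illustrative sketch (an addition): %al carries the number of vector
// registers actually used, per the SysV AMD64 varargs convention. For
//   printf("%f %f\n", x, y);
// the caller emits roughly
//   movb $2, %al
//   callq printf
// so the test below skips all eight XMM stores whenever %al is zero.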
25969 // Create the new basic blocks. One block contains all the XMM stores,
25970 // and one block is the final destination regardless of whether any
25971 // stores were performed.
25972 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25973 MachineFunction *F = MBB->getParent();
25974 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25975 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25976 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25977 F->insert(MBBIter, XMMSaveMBB);
25978 F->insert(MBBIter, EndMBB);
25980 // Transfer the remainder of MBB and its successor edges to EndMBB.
25981 EndMBB->splice(EndMBB->begin(), MBB,
25982 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25983 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25985 // The original block will now fall through to the XMM save block.
25986 MBB->addSuccessor(XMMSaveMBB);
25987 // The XMMSaveMBB will fall through to the end block.
25988 XMMSaveMBB->addSuccessor(EndMBB);
25990 // Now add the instructions.
25991 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25992 DebugLoc DL = MI.getDebugLoc();
25994 unsigned CountReg = MI.getOperand(0).getReg();
25995 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25996 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25998 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25999 // If %al is 0, branch around the XMM save block.
26000 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
26001 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
26002 MBB->addSuccessor(EndMBB);
26003 }
26005 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
26006 // that was just emitted, but clearly shouldn't be "saved".
26007 assert((MI.getNumOperands() <= 3 ||
26008 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
26009 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
26010 "Expected last argument to be EFLAGS");
26011 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
26012 // In the XMM save block, save all the XMM argument registers.
26013 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
26014 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
26015 MachineMemOperand *MMO = F->getMachineMemOperand(
26016 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
26017 MachineMemOperand::MOStore,
26018 /*Size=*/16, /*Align=*/16);
26019 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
26020 .addFrameIndex(RegSaveFrameIndex)
26021 .addImm(/*Scale=*/1)
26022 .addReg(/*IndexReg=*/0)
26023 .addImm(/*Disp=*/Offset)
26024 .addReg(/*Segment=*/0)
26025 .addReg(MI.getOperand(i).getReg())
26026 .addMemOperand(MMO);
26027 }
26029 MI.eraseFromParent(); // The pseudo instruction is gone now.
26031 return EndMBB;
26032 }
26034 // The EFLAGS operand of SelectItr might be missing a kill marker
26035 // because there were multiple uses of EFLAGS, and ISel didn't know
26036 // which to mark. Figure out whether SelectItr should have had a
26037 // kill marker, and set it if it should. Returns the correct kill
26038 // marker value.
26039 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
26040 MachineBasicBlock* BB,
26041 const TargetRegisterInfo* TRI) {
26042 // Scan forward through BB for a use/def of EFLAGS.
26043 MachineBasicBlock::iterator miI(std::next(SelectItr));
26044 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
26045 const MachineInstr& mi = *miI;
26046 if (mi.readsRegister(X86::EFLAGS))
26047 return false;
26048 if (mi.definesRegister(X86::EFLAGS))
26049 break; // Should have kill-flag - update below.
26050 }
26052 // If we hit the end of the block, check whether EFLAGS is live into a
26053 // successor.
26054 if (miI == BB->end()) {
26055 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
26056 sEnd = BB->succ_end();
26057 sItr != sEnd; ++sItr) {
26058 MachineBasicBlock* succ = *sItr;
26059 if (succ->isLiveIn(X86::EFLAGS))
26060 return false;
26061 }
26062 }
26064 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
26065 // out. SelectMI should have a kill flag on EFLAGS.
26066 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
26067 return true;
26068 }
26070 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
26071 // together with other CMOV pseudo-opcodes into a single basic-block with
26072 // conditional jump around it.
26073 static bool isCMOVPseudo(MachineInstr &MI) {
26074 switch (MI.getOpcode()) {
26075 case X86::CMOV_FR32:
26076 case X86::CMOV_FR64:
26077 case X86::CMOV_GR8:
26078 case X86::CMOV_GR16:
26079 case X86::CMOV_GR32:
26080 case X86::CMOV_RFP32:
26081 case X86::CMOV_RFP64:
26082 case X86::CMOV_RFP80:
26083 case X86::CMOV_V2F64:
26084 case X86::CMOV_V2I64:
26085 case X86::CMOV_V4F32:
26086 case X86::CMOV_V4F64:
26087 case X86::CMOV_V4I64:
26088 case X86::CMOV_V16F32:
26089 case X86::CMOV_V8F32:
26090 case X86::CMOV_V8F64:
26091 case X86::CMOV_V8I64:
26092 case X86::CMOV_V8I1:
26093 case X86::CMOV_V16I1:
26094 case X86::CMOV_V32I1:
26095 case X86::CMOV_V64I1:
26096 return true;
26098 default:
26099 return false;
26100 }
26101 }
26103 // Helper function, which inserts PHI functions into SinkMBB:
26104 // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
26105 // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs
26106 // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for
26107 // the last PHI function inserted.
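// Illustrative sketch (an addition), using the same example as the comment
// in EmitLoweredSelect below: for the chain
//   %t2 = CMOV %t1, %f1, cc
//   %t3 = CMOV %t2, %f2, cc
// RegRewriteTable maps %t2 -> (%t1, %f1), so the second PHI reuses the
// original true input instead of the first PHI's result:
//   %t2 = PHI [ %t1, TrueMBB ], [ %f1, FalseMBB ]
//   %t3 = PHI [ %t1, TrueMBB ], [ %f2, FalseMBB ]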
26108 static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
26109 MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
26110 MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
26111 MachineBasicBlock *SinkMBB) {
26112 MachineFunction *MF = TrueMBB->getParent();
26113 const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
26114 DebugLoc DL = MIItBegin->getDebugLoc();
26116 X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
26117 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26119 MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
26121 // As we are creating the PHIs, we have to be careful if there is more than
26122 // one. Later CMOVs may reference the results of earlier CMOVs, but later
26123 // PHIs have to reference the individual true/false inputs from earlier PHIs.
26124 // That also means that PHI construction must work forward from earlier to
26125 // later, and that the code must maintain a mapping from earlier PHI's
26126 // destination registers, and the registers that went into the PHI.
26127 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
26128 MachineInstrBuilder MIB;
26130 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
26131 unsigned DestReg = MIIt->getOperand(0).getReg();
26132 unsigned Op1Reg = MIIt->getOperand(1).getReg();
26133 unsigned Op2Reg = MIIt->getOperand(2).getReg();
26135 // If this CMOV we are generating is the opposite condition from
26136 // the jump we generated, then we have to swap the operands for the
26137 // PHI that is going to be generated.
26138 if (MIIt->getOperand(3).getImm() == OppCC)
26139 std::swap(Op1Reg, Op2Reg);
26141 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
26142 Op1Reg = RegRewriteTable[Op1Reg].first;
26144 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
26145 Op2Reg = RegRewriteTable[Op2Reg].second;
26147 MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
26148 .addReg(Op1Reg)
26149 .addMBB(FalseMBB)
26150 .addReg(Op2Reg)
26151 .addMBB(TrueMBB);
26153 // Add this PHI to the rewrite table.
26154 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
26155 }
26157 return MIB;
26158 }
26160 // Lower cascaded selects in the form of (SecondCMOV (FirstCMOV F, T, cc1), T, cc2).
26161 MachineBasicBlock *
26162 X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
26163 MachineInstr &SecondCascadedCMOV,
26164 MachineBasicBlock *ThisMBB) const {
26165 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26166 DebugLoc DL = FirstCMOV.getDebugLoc();
26168 // We lower cascaded CMOVs such as
26169 //
26170 //   (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
26171 //
26172 // to two successive branches.
26173 //
26174 // Without this, we would add a PHI between the two jumps, which ends up
26175 // creating a few copies all around. For instance, for
26176 //
26177 //    (sitofp (zext (fcmp une)))
26178 //
26179 // we would generate:
26180 //
26181 //   ucomiss %xmm1, %xmm0
26182 //   movss  <1.0f>, %xmm0
26183 //   movaps  %xmm0, %xmm1
26184 //   jne     .LBB5_2
26185 //   xorps   %xmm1, %xmm1
26186 // .LBB5_2:
26187 //   jp      .LBB5_4
26188 //   movaps  %xmm1, %xmm0
26189 // .LBB5_4:
26190 //   retq
26191 //
26192 // because this custom-inserter would have generated:
26193 //
26194 //   A
26195 //   | \
26196 //   |  B
26197 //   | /
26198 //   C
26199 //   | \
26200 //   |  D
26201 //   | /
26202 //   E
26203 //
26204 // A: X = ...; Y = ...
26205 // B: empty
26206 // C: Z = PHI [X, A], [Y, B]
26207 // D: empty
26208 // E: PHI [X, C], [Z, D]
26209 //
26210 // If we lower both CMOVs in a single step, we can instead generate:
26211 //
26212 //   A
26213 //   | \
26214 //   |  C
26215 //   | /|
26216 //   |/ |
26217 //   |  |
26218 //   |  D
26219 //   | /
26220 //   E
26221 //
26222 // A: X = ...; Y = ...
26223 // D: empty
26224 // E: PHI [X, A], [X, C], [Y, D]
26225 //
26226 // Which, in our sitofp/fcmp example, gives us something like:
26227 //
26228 //   ucomiss %xmm1, %xmm0
26229 //   movss  <1.0f>, %xmm0
26230 //   jne     .LBB5_4
26231 //   jp      .LBB5_4
26232 //   xorps   %xmm0, %xmm0
26233 // .LBB5_4:
26234 //   retq
26235 //
26237 // We lower cascaded CMOV into two successive branches to the same block.
26238 // EFLAGS is used by both, so mark it as live in the second.
26239 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26240 MachineFunction *F = ThisMBB->getParent();
26241 MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26242 MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
26243 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26245 MachineFunction::iterator It = ++ThisMBB->getIterator();
26246 F->insert(It, FirstInsertedMBB);
26247 F->insert(It, SecondInsertedMBB);
26248 F->insert(It, SinkMBB);
26250 // For a cascaded CMOV, we lower it to two successive branches to
26251 // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
26252 // the FirstInsertedMBB.
26253 FirstInsertedMBB->addLiveIn(X86::EFLAGS);
26255 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26256 // live into the sink and copy blocks.
26257 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26258 if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
26259 !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
26260 SecondInsertedMBB->addLiveIn(X86::EFLAGS);
26261 SinkMBB->addLiveIn(X86::EFLAGS);
26262 }
26264 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26265 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26266 std::next(MachineBasicBlock::iterator(FirstCMOV)),
26267 ThisMBB->end());
26268 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26270 // Fallthrough block for ThisMBB.
26271 ThisMBB->addSuccessor(FirstInsertedMBB);
26272 // The true block target of the first branch is always SinkMBB.
26273 ThisMBB->addSuccessor(SinkMBB);
26274 // Fallthrough block for FirstInsertedMBB.
26275 FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
26276 // The true block for the branch of FirstInsertedMBB.
26277 FirstInsertedMBB->addSuccessor(SinkMBB);
26278 // This is fallthrough.
26279 SecondInsertedMBB->addSuccessor(SinkMBB);
26281 // Create the conditional branch instructions.
26282 X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
26283 unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
26284 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26286 X86::CondCode SecondCC =
26287 X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
26288 unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
26289 BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
26291 //  SinkMBB:
26292 // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
26293 unsigned DestReg = FirstCMOV.getOperand(0).getReg();
26294 unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
26295 unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
26296 MachineInstrBuilder MIB =
26297 BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
26298 .addReg(Op1Reg)
26299 .addMBB(SecondInsertedMBB)
26300 .addReg(Op2Reg)
26301 .addMBB(ThisMBB);
26303 // The second SecondInsertedMBB provides the same incoming value as the
26304 // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes).
26305 MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
26306 // Copy the PHI result to the register defined by the second CMOV.
26307 BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
26308 TII->get(TargetOpcode::COPY),
26309 SecondCascadedCMOV.getOperand(0).getReg())
26310 .addReg(FirstCMOV.getOperand(0).getReg());
26312 // Now remove the CMOVs.
26313 FirstCMOV.eraseFromParent();
26314 SecondCascadedCMOV.eraseFromParent();
26316 return SinkMBB;
26317 }
26319 MachineBasicBlock *
26320 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
26321 MachineBasicBlock *ThisMBB) const {
26322 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26323 DebugLoc DL = MI.getDebugLoc();
26325 // To "insert" a SELECT_CC instruction, we actually have to insert the
26326 // diamond control-flow pattern. The incoming instruction knows the
26327 // destination vreg to set, the condition code register to branch on, the
26328 // true/false values to select between and a branch opcode to use.
26329 //
26330 //  ThisMBB:
26331 //  ...
26332 //   TrueVal = ...
26333 //   cmpTY ccX, r1, r2
26334 //   bCC sinkMBB
26335 //   fallthrough --> FalseMBB
26337 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
26338 // as described above, by inserting a BB, and then making a PHI at the join
26339 // point to select the true and false operands of the CMOV in the PHI.
26341 // The code also handles two different cases of multiple CMOV opcodes
26342 // in a row.
26344 // Case 1:
26345 // In this case, there are multiple CMOVs in a row, all of which are based on
26346 // the same condition setting (or the exact opposite condition setting).
26347 // In this case we can lower all the CMOVs using a single inserted BB, and
26348 // then make a number of PHIs at the join point to model the CMOVs. The only
26349 // trickiness here is that in a case like:
26351 // t2 = CMOV cond1 t1, f1
26352 // t3 = CMOV cond1 t2, f2
26354 // when rewriting this into PHIs, we have to perform some renaming on the
26355 // temps since you cannot have a PHI operand refer to a PHI result earlier
26356 // in the same block. The "simple" but wrong lowering would be:
26358 // t2 = PHI t1(BB1), f1(BB2)
26359 // t3 = PHI t2(BB1), f2(BB2)
26361 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
26362 // renaming is to note that on the path through BB1, t2 is really just a
26363 // copy of t1, and do that renaming, properly generating:
26365 // t2 = PHI t1(BB1), f1(BB2)
26366 // t3 = PHI t1(BB1), f2(BB2)
26368 // Case 2:
26369 // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
26370 // function - EmitLoweredCascadedSelect.
26372 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
26373 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
26374 MachineInstr *LastCMOV = &MI;
26375 MachineBasicBlock::iterator NextMIIt =
26376 std::next(MachineBasicBlock::iterator(MI));
26378 // Check for case 1, where there are multiple CMOVs with the same condition
26379 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
26380 // number of jumps the most.
26382 if (isCMOVPseudo(MI)) {
26383 // See if we have a string of CMOVS with the same condition.
26384 while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
26385 (NextMIIt->getOperand(3).getImm() == CC ||
26386 NextMIIt->getOperand(3).getImm() == OppCC)) {
26387 LastCMOV = &*NextMIIt;
26388 ++NextMIIt;
26389 }
26390 }
26392 // This checks for case 2, but we only do it if we didn't already find
26393 // case 1, as indicated by LastCMOV == MI.
26394 if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
26395 NextMIIt->getOpcode() == MI.getOpcode() &&
26396 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
26397 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
26398 NextMIIt->getOperand(1).isKill()) {
26399 return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
26400 }
26402 const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
26403 MachineFunction *F = ThisMBB->getParent();
26404 MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
26405 MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
26407 MachineFunction::iterator It = ++ThisMBB->getIterator();
26408 F->insert(It, FalseMBB);
26409 F->insert(It, SinkMBB);
26411 // If the EFLAGS register isn't dead in the terminator, then claim that it's
26412 // live into the sink and copy blocks.
26413 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26414 if (!LastCMOV->killsRegister(X86::EFLAGS) &&
26415 !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
26416 FalseMBB->addLiveIn(X86::EFLAGS);
26417 SinkMBB->addLiveIn(X86::EFLAGS);
26418 }
26420 // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
26421 SinkMBB->splice(SinkMBB->begin(), ThisMBB,
26422 std::next(MachineBasicBlock::iterator(LastCMOV)),
26423 ThisMBB->end());
26424 SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
26426 // Fallthrough block for ThisMBB.
26427 ThisMBB->addSuccessor(FalseMBB);
26428 // The true block target of the first (or only) branch is always a SinkMBB.
26429 ThisMBB->addSuccessor(SinkMBB);
26430 // Fallthrough block for FalseMBB.
26431 FalseMBB->addSuccessor(SinkMBB);
26433 // Create the conditional branch instruction.
26434 unsigned Opc = X86::GetCondBranchFromCond(CC);
26435 BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
26437 //  SinkMBB:
26438 //   %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
26439 //  ...
26440 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
26441 MachineBasicBlock::iterator MIItEnd =
26442 std::next(MachineBasicBlock::iterator(LastCMOV));
26443 createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
26445 // Now remove the CMOV(s).
26446 ThisMBB->erase(MIItBegin, MIItEnd);
26448 return SinkMBB;
26449 }
26451 MachineBasicBlock *
26452 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
26453 MachineBasicBlock *BB) const {
26454 // Combine the following atomic floating-point modification pattern:
26455 // a.store(reg OP a.load(acquire), release)
26456 // Transform them into:
26457 // OPss (%gpr), %xmm
26458 // movss %xmm, (%gpr)
26459 // Or sd equivalent for 64-bit operations.
26460 unsigned MOp, FOp;
26461 switch (MI.getOpcode()) {
26462 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
26463 case X86::RELEASE_FADD32mr:
26464 FOp = X86::ADDSSrm;
26465 MOp = X86::MOVSSmr;
26466 break;
26467 case X86::RELEASE_FADD64mr:
26468 FOp = X86::ADDSDrm;
26469 MOp = X86::MOVSDmr;
26470 break;
26471 }
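// Illustrative sketch (an addition): the IR that selects to these pseudos
// looks roughly like
//   %old = load atomic float, float* %p acquire, align 4
//   %sum = fadd float %old, %val
//   store atomic float %sum, float* %p release, align 4
// and is folded below into one SSE load-op plus a plain store instead of a
// cmpxchg loop.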
26472 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26473 DebugLoc DL = MI.getDebugLoc();
26474 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
26475 unsigned ValOpIdx = X86::AddrNumOperands;
26476 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
26477 MachineInstrBuilder MIB =
26478 BuildMI(*BB, MI, DL, TII->get(FOp),
26479 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
26480 .addReg(VSrc);
26481 for (int i = 0; i < X86::AddrNumOperands; ++i) {
26482 MachineOperand &Operand = MI.getOperand(i);
26483 // Clear any kill flags on register operands as we'll create a second
26484 // instruction using the same address operands.
26485 if (Operand.isReg())
26486 Operand.setIsKill(false);
26487 MIB.add(Operand);
26488 }
26489 MachineInstr *FOpMI = MIB;
26490 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
26491 for (int i = 0; i < X86::AddrNumOperands; ++i)
26492 MIB.add(MI.getOperand(i));
26493 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
26494 MI.eraseFromParent(); // The pseudo instruction is gone now.
26495 return BB;
26496 }
26498 MachineBasicBlock *
26499 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
26500 MachineBasicBlock *BB) const {
26501 MachineFunction *MF = BB->getParent();
26502 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26503 DebugLoc DL = MI.getDebugLoc();
26504 const BasicBlock *LLVM_BB = BB->getBasicBlock();
26506 assert(MF->shouldSplitStack());
26508 const bool Is64Bit = Subtarget.is64Bit();
26509 const bool IsLP64 = Subtarget.isTarget64BitLP64();
26511 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
26512 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
26514 // BB:
26515 //  ... [Till the alloca]
26516 // If stacklet is not large enough, jump to mallocMBB
26517 //
26518 // bumpMBB:
26519 //  Allocate by subtracting from RSP
26520 //  Jump to continueMBB
26521 //
26522 // mallocMBB:
26523 //  Allocate by call to runtime
26524 //
26525 // continueMBB:
26526 //  ...
26527 //  [rest of original BB]
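// Illustrative sketch (an addition): for LP64 the limit test emitted below
// corresponds to
//   movq %rsp, %tmp
//   subq %size, %tmp
//   cmpq %tmp, %fs:0x70   ; stacklet bound in TLS (TlsReg/TlsOffset above)
//   jg   mallocMBB        ; not enough room in the current stacklet
// with %gs and the 0x40/0x30 offsets used by the other configurations.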
26530 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26531 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26532 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
26534 MachineRegisterInfo &MRI = MF->getRegInfo();
26535 const TargetRegisterClass *AddrRegClass =
26536 getRegClassFor(getPointerTy(MF->getDataLayout()));
26538 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26539 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
26540 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
26541 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
26542 sizeVReg = MI.getOperand(1).getReg(),
26543 physSPReg =
26544 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26546 MachineFunction::iterator MBBIter = ++BB->getIterator();
26548 MF->insert(MBBIter, bumpMBB);
26549 MF->insert(MBBIter, mallocMBB);
26550 MF->insert(MBBIter, continueMBB);
26552 continueMBB->splice(continueMBB->begin(), BB,
26553 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26554 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26556 // Add code to the main basic block to check if the stack limit has been hit,
26557 // and if so, jump to mallocMBB otherwise to bumpMBB.
26558 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26559 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26560 .addReg(tmpSPVReg).addReg(sizeVReg);
26561 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26562 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26563 .addReg(SPLimitVReg);
26564 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26566 // bumpMBB simply decreases the stack pointer, since we know the current
26567 // stacklet has enough space.
26568 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26569 .addReg(SPLimitVReg);
26570 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26571 .addReg(SPLimitVReg);
26572 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26574 // Calls into a routine in libgcc to allocate more space from the heap.
26575 const uint32_t *RegMask =
26576 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26577 if (IsLP64) {
26578 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26579 .addReg(sizeVReg);
26580 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26581 .addExternalSymbol("__morestack_allocate_stack_space")
26582 .addRegMask(RegMask)
26583 .addReg(X86::RDI, RegState::Implicit)
26584 .addReg(X86::RAX, RegState::ImplicitDefine);
26585 } else if (Is64Bit) {
26586 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26587 .addReg(sizeVReg);
26588 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26589 .addExternalSymbol("__morestack_allocate_stack_space")
26590 .addRegMask(RegMask)
26591 .addReg(X86::EDI, RegState::Implicit)
26592 .addReg(X86::EAX, RegState::ImplicitDefine);
26593 } else {
26594 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26595 .addImm(16);
26596 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26597 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26598 .addExternalSymbol("__morestack_allocate_stack_space")
26599 .addRegMask(RegMask)
26600 .addReg(X86::EAX, RegState::ImplicitDefine);
26601 }
26603 if (!Is64Bit)
26604 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26605 .addImm(16);
26607 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26608 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26609 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26611 // Set up the CFG correctly.
26612 BB->addSuccessor(bumpMBB);
26613 BB->addSuccessor(mallocMBB);
26614 mallocMBB->addSuccessor(continueMBB);
26615 bumpMBB->addSuccessor(continueMBB);
26617 // Take care of the PHI nodes.
26618 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26619 MI.getOperand(0).getReg())
26620 .addReg(mallocPtrVReg)
26621 .addMBB(mallocMBB)
26622 .addReg(bumpSPPtrVReg)
26623 .addMBB(bumpMBB);
26625 // Delete the original pseudo instruction.
26626 MI.eraseFromParent();
26629 return continueMBB;
26630 }
26632 MachineBasicBlock *
26633 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26634 MachineBasicBlock *BB) const {
26635 MachineFunction *MF = BB->getParent();
26636 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26637 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26638 DebugLoc DL = MI.getDebugLoc();
26640 assert(!isAsynchronousEHPersonality(
26641 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26642 "SEH does not use catchret!");
26644 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26645 if (!Subtarget.is32Bit())
26646 return BB;
26648 // C++ EH creates a new target block to hold the restore code, and wires up
26649 // the new block to the return destination with a normal JMP_4.
26650 MachineBasicBlock *RestoreMBB =
26651 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26652 assert(BB->succ_size() == 1);
26653 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26654 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26655 BB->addSuccessor(RestoreMBB);
26656 MI.getOperand(0).setMBB(RestoreMBB);
26658 auto RestoreMBBI = RestoreMBB->begin();
26659 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26660 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26661 return BB;
26662 }
26664 MachineBasicBlock *
26665 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26666 MachineBasicBlock *BB) const {
26667 MachineFunction *MF = BB->getParent();
26668 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26669 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26670 // Only 32-bit SEH requires special handling for catchpad.
26671 if (IsSEH && Subtarget.is32Bit()) {
26672 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26673 DebugLoc DL = MI.getDebugLoc();
26674 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26675 }
26676 MI.eraseFromParent();
26677 return BB;
26678 }
26680 MachineBasicBlock *
26681 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26682 MachineBasicBlock *BB) const {
26683 // So, here we replace TLSADDR with the sequence:
26684 // adjust_stackdown -> TLSADDR -> adjust_stackup.
26685 // We need this because TLSADDR is lowered into calls
26686 // inside MC, therefore without the two markers shrink-wrapping
26687 // may push the prologue/epilogue past them.
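// Illustrative sketch (an addition): after this runs, the block contains
// roughly
//   ADJCALLSTACKDOWN 0, 0, 0
//   TLSADDR ...            ; becomes a call (e.g. to __tls_get_addr) in MC
//   ADJCALLSTACKUP 0, 0
// so frame lowering and shrink-wrapping see an ordinary call site.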
26688 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26689 DebugLoc DL = MI.getDebugLoc();
26690 MachineFunction &MF = *BB->getParent();
26692 // Emit CALLSEQ_START right before the instruction.
26693 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26694 MachineInstrBuilder CallseqStart =
26695 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26696 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26698 // Emit CALLSEQ_END right after the instruction.
26699 // We don't call erase from parent because we want to keep the
26700 // original instruction around.
26701 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26702 MachineInstrBuilder CallseqEnd =
26703 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26704 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26706 return BB;
26707 }
26709 MachineBasicBlock *
26710 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26711 MachineBasicBlock *BB) const {
26712 // This is pretty easy. We're taking the value that we received from
26713 // our load from the relocation, sticking it in either RDI (x86-64)
26714 // or EAX and doing an indirect call. The return value will then
26715 // be in the normal return register.
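// Illustrative sketch (an addition): on x86-64 Darwin the code below emits
// the equivalent of
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)          ; TLV getter returns the address in %rax
// i.e. a load of the TLV descriptor followed by an indirect call through
// its first field.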
26716 MachineFunction *F = BB->getParent();
26717 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26718 DebugLoc DL = MI.getDebugLoc();
26720 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26721 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26723 // Get a register mask for the lowered call.
26724 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26725 // proper register mask.
26726 const uint32_t *RegMask =
26727 Subtarget.is64Bit() ?
26728 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26729 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26730 if (Subtarget.is64Bit()) {
26731 MachineInstrBuilder MIB =
26732 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26733 .addReg(X86::RIP)
26734 .addImm(1)
26735 .addReg(0)
26736 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26737 MI.getOperand(3).getTargetFlags())
26738 .addReg(0);
26739 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26740 addDirectMem(MIB, X86::RDI);
26741 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26742 } else if (!isPositionIndependent()) {
26743 MachineInstrBuilder MIB =
26744 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26745 .addReg(0)
26746 .addImm(1)
26747 .addReg(0)
26748 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26749 MI.getOperand(3).getTargetFlags())
26750 .addReg(0);
26751 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26752 addDirectMem(MIB, X86::EAX);
26753 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26754 } else {
26755 MachineInstrBuilder MIB =
26756 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26757 .addReg(TII->getGlobalBaseReg(F))
26758 .addImm(1)
26759 .addReg(0)
26760 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26761 MI.getOperand(3).getTargetFlags())
26762 .addReg(0);
26763 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26764 addDirectMem(MIB, X86::EAX);
26765 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26766 }
26768 MI.eraseFromParent(); // The pseudo instruction is gone now.
26769 return BB;
26770 }
26772 MachineBasicBlock *
26773 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26774 MachineBasicBlock *MBB) const {
26775 DebugLoc DL = MI.getDebugLoc();
26776 MachineFunction *MF = MBB->getParent();
26777 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26778 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26779 MachineRegisterInfo &MRI = MF->getRegInfo();
26781 const BasicBlock *BB = MBB->getBasicBlock();
26782 MachineFunction::iterator I = ++MBB->getIterator();
26784 // Memory Reference
26785 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26786 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26788 unsigned DstReg;
26789 unsigned MemOpndSlot = 0;
26791 unsigned CurOp = 0;
26793 DstReg = MI.getOperand(CurOp++).getReg();
26794 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26795 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26797 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26798 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26800 MemOpndSlot = CurOp;
26802 MVT PVT = getPointerTy(MF->getDataLayout());
26803 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26804 "Invalid Pointer Size!");
26806 // For v = setjmp(buf), we generate
26807 //
26808 // thisMBB:
26809 //  buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26810 //  SjLjSetup restoreMBB
26811 //
26812 // mainMBB:
26813 //  v_main = 0
26814 //
26815 // sinkMBB:
26816 //  v = phi(main, restore)
26817 //
26818 // restoreMBB:
26819 //  if base pointer being used, load it from frame
26820 //  v_restore = 1
26822 MachineBasicBlock *thisMBB = MBB;
26823 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26824 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26825 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26826 MF->insert(I, mainMBB);
26827 MF->insert(I, sinkMBB);
26828 MF->push_back(restoreMBB);
26829 restoreMBB->setHasAddressTaken();
26831 MachineInstrBuilder MIB;
26833 // Transfer the remainder of BB and its successor edges to sinkMBB.
26834 sinkMBB->splice(sinkMBB->begin(), MBB,
26835 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26836 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26838 // thisMBB:
26839 unsigned PtrStoreOpc = 0;
26840 unsigned LabelReg = 0;
26841 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26842 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26843 !isPositionIndependent();
26845 // Prepare IP either in reg or imm.
26846 if (!UseImmLabel) {
26847 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26848 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26849 LabelReg = MRI.createVirtualRegister(PtrRC);
26850 if (Subtarget.is64Bit()) {
26851 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26852 .addReg(X86::RIP)
26853 .addImm(1)
26854 .addReg(0)
26855 .addMBB(restoreMBB)
26856 .addReg(0);
26857 } else {
26858 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26859 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26860 .addReg(XII->getGlobalBaseReg(MF))
26861 .addImm(1)
26862 .addReg(0)
26863 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26864 .addReg(0);
26865 }
26866 } else
26867 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26869 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26870 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26871 if (i == X86::AddrDisp)
26872 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26873 else
26874 MIB.add(MI.getOperand(MemOpndSlot + i));
26875 }
26876 if (!UseImmLabel)
26877 MIB.addReg(LabelReg);
26878 else
26879 MIB.addMBB(restoreMBB);
26880 MIB.setMemRefs(MMOBegin, MMOEnd);
26882 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26883 .addMBB(restoreMBB);
26885 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26886 MIB.addRegMask(RegInfo->getNoPreservedMask());
26887 thisMBB->addSuccessor(mainMBB);
26888 thisMBB->addSuccessor(restoreMBB);
26890 // mainMBB:
26891 //  EAX = 0
26892 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26893 mainMBB->addSuccessor(sinkMBB);
26895 // sinkMBB:
26896 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26897 TII->get(X86::PHI), DstReg)
26898 .addReg(mainDstReg).addMBB(mainMBB)
26899 .addReg(restoreDstReg).addMBB(restoreMBB);
26901 // restoreMBB:
26902 if (RegInfo->hasBasePointer(*MF)) {
26903 const bool Uses64BitFramePtr =
26904 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26905 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26906 X86FI->setRestoreBasePointer(MF);
26907 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26908 unsigned BasePtr = RegInfo->getBaseRegister();
26909 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26910 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26911 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26912 .setMIFlag(MachineInstr::FrameSetup);
26913 }
26914 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26915 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26916 restoreMBB->addSuccessor(sinkMBB);
26918 MI.eraseFromParent();
26919 return sinkMBB;
26920 }
26922 MachineBasicBlock *
26923 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26924 MachineBasicBlock *MBB) const {
26925 DebugLoc DL = MI.getDebugLoc();
26926 MachineFunction *MF = MBB->getParent();
26927 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26928 MachineRegisterInfo &MRI = MF->getRegInfo();
26930 // Memory Reference
26931 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26932 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26934 MVT PVT = getPointerTy(MF->getDataLayout());
26935 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26936 "Invalid Pointer Size!");
26938 const TargetRegisterClass *RC =
26939 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26940 unsigned Tmp = MRI.createVirtualRegister(RC);
26941 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26942 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26943 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26944 unsigned SP = RegInfo->getStackRegister();
26946 MachineInstrBuilder MIB;
26948 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26949 const int64_t SPOffset = 2 * PVT.getStoreSize();
26951 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26952 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26954 // Reload FP
26955 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26956 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26957 MIB.add(MI.getOperand(i));
26958 MIB.setMemRefs(MMOBegin, MMOEnd);
26959 // Reload IP
26960 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26961 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26962 if (i == X86::AddrDisp)
26963 MIB.addDisp(MI.getOperand(i), LabelOffset);
26964 else
26965 MIB.add(MI.getOperand(i));
26966 }
26967 MIB.setMemRefs(MMOBegin, MMOEnd);
26968 // Reload SP
26969 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26970 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26971 if (i == X86::AddrDisp)
26972 MIB.addDisp(MI.getOperand(i), SPOffset);
26973 else
26974 MIB.add(MI.getOperand(i));
26975 }
26976 MIB.setMemRefs(MMOBegin, MMOEnd);
26977 // Jump
26978 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26980 MI.eraseFromParent();
26981 return MBB;
26982 }
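// Illustrative sketch (an addition): for a 64-bit buffer the three loads
// and the jump above amount to
//   movq  (buf), %rbp     ; frame pointer
//   movq  8(buf), %tmp    ; saved IP (LabelOffset)
//   movq 16(buf), %rsp    ; saved SP (SPOffset)
//   jmpq *%tmp
// mirroring the layout written by emitEHSjLjSetJmp.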
26984 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26985 MachineBasicBlock *MBB,
26986 MachineBasicBlock *DispatchBB,
26987 int FI) const {
26988 DebugLoc DL = MI.getDebugLoc();
26989 MachineFunction *MF = MBB->getParent();
26990 MachineRegisterInfo *MRI = &MF->getRegInfo();
26991 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26993 MVT PVT = getPointerTy(MF->getDataLayout());
26994 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26996 unsigned Op = 0;
26997 unsigned VR = 0;
26999 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
27000 !isPositionIndependent();
27002 if (UseImmLabel) {
27003 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
27004 } else {
27005 const TargetRegisterClass *TRC =
27006 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
27007 VR = MRI->createVirtualRegister(TRC);
27008 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
27010 if (Subtarget.is64Bit())
27011 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
27012 .addReg(X86::RIP)
27013 .addImm(1)
27014 .addReg(0)
27015 .addMBB(DispatchBB)
27016 .addReg(0);
27017 else
27018 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
27019 .addReg(0) /* TII->getGlobalBaseReg(MF) */
27020 .addImm(1)
27021 .addReg(0)
27022 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
27023 .addReg(0);
27024 }
27026 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
27027 addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
27028 if (UseImmLabel)
27029 MIB.addMBB(DispatchBB);
27030 else
27031 MIB.addReg(VR);
27032 }
27034 MachineBasicBlock *
27035 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
27036 MachineBasicBlock *BB) const {
27037 DebugLoc DL = MI.getDebugLoc();
27038 MachineFunction *MF = BB->getParent();
27039 MachineFrameInfo &MFI = MF->getFrameInfo();
27040 MachineRegisterInfo *MRI = &MF->getRegInfo();
27041 const X86InstrInfo *TII = Subtarget.getInstrInfo();
27042 int FI = MFI.getFunctionContextIndex();
27044 // Get a mapping of the call site numbers to all of the landing pads they're
27045 // associated with.
27046 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
27047 unsigned MaxCSNum = 0;
27048 for (auto &MBB : *MF) {
27049 if (!MBB.isEHPad())
27050 continue;
27052 MCSymbol *Sym = nullptr;
27053 for (const auto &MI : MBB) {
27054 if (MI.isDebugValue())
27055 continue;
27057 assert(MI.isEHLabel() && "expected EH_LABEL");
27058 Sym = MI.getOperand(0).getMCSymbol();
27059 break;
27060 }
27062 if (!MF->hasCallSiteLandingPad(Sym))
27063 continue;
27065 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
27066 CallSiteNumToLPad[CSI].push_back(&MBB);
27067 MaxCSNum = std::max(MaxCSNum, CSI);
27068 }
27069 }
27071 // Get an ordered list of the machine basic blocks for the jump table.
27072 std::vector<MachineBasicBlock *> LPadList;
27073 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
27074 LPadList.reserve(CallSiteNumToLPad.size());
27076 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
27077 for (auto &LP : CallSiteNumToLPad[CSI]) {
27078 LPadList.push_back(LP);
27079 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
27080 }
27081 }
27083 assert(!LPadList.empty() &&
27084 "No landing pad destinations for the dispatch jump table!");
27086 // Create the MBBs for the dispatch code.
27088 // Shove the dispatch's address into the return slot in the function context.
27089 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
27090 DispatchBB->setIsEHPad(true);
27092 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
27093 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
27094 DispatchBB->addSuccessor(TrapBB);
27096 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
27097 DispatchBB->addSuccessor(DispContBB);
27099 // Insert MBBs.
27100 MF->push_back(DispatchBB);
27101 MF->push_back(DispContBB);
27102 MF->push_back(TrapBB);
27104 // Insert code into the entry block that creates and registers the function
27105 // context.
27106 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
27108 // Create the jump table and associated information
27109 unsigned JTE = getJumpTableEncoding();
27110 MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
27111 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
27113 const X86RegisterInfo &RI = TII->getRegisterInfo();
27114 // Add a register mask with no preserved registers. This results in all
27115 // registers being marked as clobbered.
27116 if (RI.hasBasePointer(*MF)) {
27117 const bool FPIs64Bit =
27118 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
27119 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
27120 MFI->setRestoreBasePointer(MF);
27122 unsigned FP = RI.getFrameRegister(*MF);
27123 unsigned BP = RI.getBaseRegister();
27124 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
27125 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
27126 MFI->getRestoreBasePointerOffset())
27127 .addRegMask(RI.getNoPreservedMask());
27128 } else {
27129 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
27130 .addRegMask(RI.getNoPreservedMask());
27131 }
27133 // IReg is used as an index in a memory operand and therefore can't be SP
27134 unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
27135 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
27136 Subtarget.is64Bit() ? 8 : 4);
27137 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
27138 .addReg(IReg)
27139 .addImm(LPadList.size());
27140 BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
27142 if (Subtarget.is64Bit()) {
27143 unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27144 unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
27146 // leaq .LJTI0_0(%rip), BReg
27147 BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
27148 .addReg(X86::RIP)
27149 .addImm(1)
27150 .addReg(0)
27151 .addJumpTableIndex(MJTI)
27152 .addReg(0);
27153 // movzx IReg64, IReg
27154 BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
27155 .addImm(0)
27156 .addReg(IReg)
27157 .addImm(X86::sub_32bit);
27159 switch (JTE) {
27160 case MachineJumpTableInfo::EK_BlockAddress:
27161 // jmpq *(BReg,IReg64,8)
27162 BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
27163 .addReg(BReg)
27164 .addImm(8)
27165 .addReg(IReg64)
27166 .addImm(0)
27167 .addReg(0);
27168 break;
27169 case MachineJumpTableInfo::EK_LabelDifference32: {
27170 unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
27171 unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
27172 unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
27174 // movl (BReg,IReg64,4), OReg
27175 BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
27176 .addReg(BReg)
27177 .addImm(4)
27178 .addReg(IReg64)
27179 .addImm(0)
27180 .addReg(0);
27181 // movsx OReg64, OReg
27182 BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
27183 // addq BReg, OReg64, TReg
27184 BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
27185 .addReg(OReg64)
27186 .addReg(BReg);
27187 // jmpq *TReg
27188 BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
27189 break;
27190 }
27191 default:
27192 llvm_unreachable("Unexpected jump table encoding");
27193 }
27194 } else {
27195 // jmpl *.LJTI0_0(,IReg,4)
27196 BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
27197 .addReg(0)
27198 .addImm(4)
27199 .addReg(IReg)
27200 .addJumpTableIndex(MJTI)
27201 .addReg(0);
27202 }
27204 // Add the jump table entries as successors to the MBB.
27205 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
27206 for (auto &LP : LPadList)
27207 if (SeenMBBs.insert(LP).second)
27208 DispContBB->addSuccessor(LP);
27210 // N.B. the order the invoke BBs are processed in doesn't matter here.
27211 SmallVector<MachineBasicBlock *, 64> MBBLPads;
27212 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
27213 for (MachineBasicBlock *MBB : InvokeBBs) {
27214 // Remove the landing pad successor from the invoke block and replace it
27215 // with the new dispatch block.
27216 // Keep a copy of Successors since it's modified inside the loop.
27217 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
27218 MBB->succ_rend());
27219 // FIXME: Avoid quadratic complexity.
27220 for (auto MBBS : Successors) {
27221 if (MBBS->isEHPad()) {
27222 MBB->removeSuccessor(MBBS);
27223 MBBLPads.push_back(MBBS);
27224 }
27225 }
27227 MBB->addSuccessor(DispatchBB);
27229 // Find the invoke call and mark all of the callee-saved registers as
27230 // 'implicit defined' so that they're spilled. This prevents code from
27231 // moving instructions to before the EH block, where they will never be
27232 // executed.
27233 for (auto &II : reverse(*MBB)) {
27234 if (!II.isCall())
27235 continue;
27237 DenseMap<unsigned, bool> DefRegs;
27238 for (auto &MOp : II.operands())
27239 if (MOp.isReg())
27240 DefRegs[MOp.getReg()] = true;
27242 MachineInstrBuilder MIB(*MF, &II);
27243 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
27244 unsigned Reg = SavedRegs[RI];
27245 if (!DefRegs[Reg])
27246 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
27247 }
27249 break;
27250 }
27251 }
27253 // Mark all former landing pads as non-landing pads. The dispatch is the only
27254 // landing pad now.
27255 for (auto &LP : MBBLPads)
27256 LP->setIsEHPad(false);
27258 // The instruction is gone now.
27259 MI.eraseFromParent();
27260 return BB;
27261 }
27263 MachineBasicBlock *
27264 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
27265 MachineBasicBlock *BB) const {
27266 MachineFunction *MF = BB->getParent();
27267 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
27268 DebugLoc DL = MI.getDebugLoc();
27270 switch (MI.getOpcode()) {
27271 default: llvm_unreachable("Unexpected instr type to insert");
27272 case X86::TAILJMPd64:
27273 case X86::TAILJMPr64:
27274 case X86::TAILJMPm64:
27275 case X86::TAILJMPr64_REX:
27276 case X86::TAILJMPm64_REX:
27277 llvm_unreachable("TAILJMP64 would not be touched here.");
27278 case X86::TCRETURNdi64:
27279 case X86::TCRETURNri64:
27280 case X86::TCRETURNmi64:
27281 return BB;
27282 case X86::TLS_addr32:
27283 case X86::TLS_addr64:
27284 case X86::TLS_base_addr32:
27285 case X86::TLS_base_addr64:
27286 return EmitLoweredTLSAddr(MI, BB);
27287 case X86::CATCHRET:
27288 return EmitLoweredCatchRet(MI, BB);
27289 case X86::CATCHPAD:
27290 return EmitLoweredCatchPad(MI, BB);
27291 case X86::SEG_ALLOCA_32:
27292 case X86::SEG_ALLOCA_64:
27293 return EmitLoweredSegAlloca(MI, BB);
27294 case X86::TLSCall_32:
27295 case X86::TLSCall_64:
27296 return EmitLoweredTLSCall(MI, BB);
27297 case X86::CMOV_FR32:
27298 case X86::CMOV_FR64:
27299 case X86::CMOV_FR128:
27300 case X86::CMOV_GR8:
27301 case X86::CMOV_GR16:
27302 case X86::CMOV_GR32:
27303 case X86::CMOV_RFP32:
27304 case X86::CMOV_RFP64:
27305 case X86::CMOV_RFP80:
27306 case X86::CMOV_V2F64:
27307 case X86::CMOV_V2I64:
27308 case X86::CMOV_V4F32:
27309 case X86::CMOV_V4F64:
27310 case X86::CMOV_V4I64:
27311 case X86::CMOV_V16F32:
27312 case X86::CMOV_V8F32:
27313 case X86::CMOV_V8F64:
27314 case X86::CMOV_V8I64:
27315 case X86::CMOV_V8I1:
27316 case X86::CMOV_V16I1:
27317 case X86::CMOV_V32I1:
27318 case X86::CMOV_V64I1:
27319 return EmitLoweredSelect(MI, BB);
27321 case X86::RDFLAGS32:
27322 case X86::RDFLAGS64: {
27323 unsigned PushF =
27324 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
27325 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
27326 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
27327 // Permit reads of the FLAGS register without it being defined.
27328 // This intrinsic exists to read external processor state in flags, such as
27329 // the trap flag, interrupt flag, and direction flag, none of which are
27330 // modeled by the backend.
27331 Push->getOperand(2).setIsUndef();
27332 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
27334 MI.eraseFromParent(); // The pseudo is gone now.
27335 return BB;
27336 }
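// Illustrative sketch (an addition): RDFLAGS64 therefore becomes
//   pushfq          ; implicit EFLAGS read, marked undef above
//   popq %dst
// and the WRFLAGS lowering below is the mirror image: push %src, popf.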
27338 case X86::WRFLAGS32:
27339 case X86::WRFLAGS64: {
27340 unsigned Push =
27341 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
27342 unsigned PopF =
27343 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
27344 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
27345 BuildMI(*BB, MI, DL, TII->get(PopF));
27347 MI.eraseFromParent(); // The pseudo is gone now.
27348 return BB;
27349 }
27351 case X86::RELEASE_FADD32mr:
27352 case X86::RELEASE_FADD64mr:
27353 return EmitLoweredAtomicFP(MI, BB);
27355 case X86::FP32_TO_INT16_IN_MEM:
27356 case X86::FP32_TO_INT32_IN_MEM:
27357 case X86::FP32_TO_INT64_IN_MEM:
27358 case X86::FP64_TO_INT16_IN_MEM:
27359 case X86::FP64_TO_INT32_IN_MEM:
27360 case X86::FP64_TO_INT64_IN_MEM:
27361 case X86::FP80_TO_INT16_IN_MEM:
27362 case X86::FP80_TO_INT32_IN_MEM:
27363 case X86::FP80_TO_INT64_IN_MEM: {
27364 // Change the floating point control register to use "round towards zero"
27365 // mode when truncating to an integer value.
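// Illustrative sketch (an addition): the sequence assembled below is the
// classic x87 truncation idiom,
//   fnstcw (slot)          ; save the current control word
//   movw   $0xC7F, (slot)  ; set RC = 11b (round toward zero)
//   fldcw  (slot)          ; activate it
//   fistp  (dst)           ; store the value as an integer
//   fldcw  (slot)          ; restore the original control word
// using the 2-byte stack slot created just below.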
27366 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
27367 addFrameReference(BuildMI(*BB, MI, DL,
27368 TII->get(X86::FNSTCW16m)), CWFrameIdx);
27370 // Load the old value of the high byte of the control word...
27371 unsigned OldCW =
27372 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
27373 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
27374 CWFrameIdx);
27376 // Set the high part to be round to zero...
27377 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
27378 .addImm(0xC7F);
27380 // Reload the modified control word now...
27381 addFrameReference(BuildMI(*BB, MI, DL,
27382 TII->get(X86::FLDCW16m)), CWFrameIdx);
27384 // Restore the memory image of control word to original value
27385 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
27386 .addReg(OldCW);
27388 // Get the X86 opcode to use.
27389 unsigned Opc;
27390 switch (MI.getOpcode()) {
27391 default: llvm_unreachable("illegal opcode!");
27392 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
27393 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
27394 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
27395 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
27396 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
27397 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
27398 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
27399 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
27400 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
27401 }
27403 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27404 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
27405 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
27407 // Reload the original control word now.
27408 addFrameReference(BuildMI(*BB, MI, DL,
27409 TII->get(X86::FLDCW16m)), CWFrameIdx);
27411 MI.eraseFromParent(); // The pseudo instruction is gone now.
27412 return BB;
27413 }
27414 // String/text processing lowering.
27415 case X86::PCMPISTRM128REG:
27416 case X86::VPCMPISTRM128REG:
27417 case X86::PCMPISTRM128MEM:
27418 case X86::VPCMPISTRM128MEM:
27419 case X86::PCMPESTRM128REG:
27420 case X86::VPCMPESTRM128REG:
27421 case X86::PCMPESTRM128MEM:
27422 case X86::VPCMPESTRM128MEM:
27423 assert(Subtarget.hasSSE42() &&
27424 "Target must have SSE4.2 or AVX features enabled");
27425 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
27427 // String/text processing lowering.
27428 case X86::PCMPISTRIREG:
27429 case X86::VPCMPISTRIREG:
27430 case X86::PCMPISTRIMEM:
27431 case X86::VPCMPISTRIMEM:
27432 case X86::PCMPESTRIREG:
27433 case X86::VPCMPESTRIREG:
27434 case X86::PCMPESTRIMEM:
27435 case X86::VPCMPESTRIMEM:
27436 assert(Subtarget.hasSSE42() &&
27437 "Target must have SSE4.2 or AVX features enabled");
27438 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
27440 // Thread synchronization.
27441 case X86::MONITOR:
27442 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
27443 case X86::MONITORX:
27444 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
27446 // Cache line zero
27447 case X86::CLZERO:
27448 return emitClzero(&MI, BB, Subtarget);
27450 // PKU feature
27451 case X86::WRPKRU:
27452 return emitWRPKRU(MI, BB, Subtarget);
27453 case X86::RDPKRU:
27454 return emitRDPKRU(MI, BB, Subtarget);
27456 case X86::XBEGIN:
27457 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
27459 case X86::VASTART_SAVE_XMM_REGS:
27460 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
27462 case X86::VAARG_64:
27463 return EmitVAARG64WithCustomInserter(MI, BB);
27465 case X86::EH_SjLj_SetJmp32:
27466 case X86::EH_SjLj_SetJmp64:
27467 return emitEHSjLjSetJmp(MI, BB);
27469 case X86::EH_SjLj_LongJmp32:
27470 case X86::EH_SjLj_LongJmp64:
27471 return emitEHSjLjLongJmp(MI, BB);
27473 case X86::Int_eh_sjlj_setup_dispatch:
27474 return EmitSjLjDispatchBlock(MI, BB);
27476 case TargetOpcode::STATEPOINT:
27477 // As an implementation detail, STATEPOINT shares the STACKMAP format at
27478 // this point in the process. We diverge later.
27479 return emitPatchPoint(MI, BB);
27481 case TargetOpcode::STACKMAP:
27482 case TargetOpcode::PATCHPOINT:
27483 return emitPatchPoint(MI, BB);
27485 case TargetOpcode::PATCHABLE_EVENT_CALL:
27486 // Do nothing here; this is handled by the XRay instrumentation pass.
27487 return BB;
27489 case X86::LCMPXCHG8B: {
27490 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
27491 // In addition to the four E[ABCD] registers implied by its encoding,
27492 // CMPXCHG8B requires a memory operand. If the current target happens to be
27493 // i686 and the current function needs a base pointer
27494 // - which is ESI on i686 - the register allocator would not be able to
27495 // allocate registers for an address of the form X(%reg, %reg, Y):
27496 // there would never be enough unreserved registers during regalloc
27497 // (without the base pointer the only option would be X(%edi, %esi, Y)).
27498 // We give the register allocator a hand by precomputing the address in
27499 // a new vreg using LEA.
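// Rough illustration (register names hypothetical): rather than asking
// regalloc to materialize
//   cmpxchg8b disp(%edi, %esi, 4)
// while EAX/EBX/ECX/EDX are all claimed by the instruction itself, we emit
//   leal disp(%edi, %esi, 4), %vreg
//   cmpxchg8b (%vreg)
// so the memory operand only needs a single free register.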
27501 // If this is not i686 or there is no base pointer, there is nothing to do.
27502 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
27503 return BB;
27505 // Even though this code does not necessarily need the base pointer to
27506 // be ESI, we check for that. The reason: if this assert fails, something
27507 // has changed in the compiler's base pointer handling, which most
27508 // probably has to be addressed here as well.
27509 assert(TRI->getBaseRegister() == X86::ESI &&
27510 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
27511 "base pointer in mind");
27513 MachineRegisterInfo &MRI = MF->getRegInfo();
27514 MVT SPTy = getPointerTy(MF->getDataLayout());
27515 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
27516 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
27518 X86AddressMode AM = getAddressFromInstr(&MI, 0);
27519 // Regalloc does not need any help when the memory operand of CMPXCHG8B
27520 // does not use index register.
27521 if (AM.IndexReg == X86::NoRegister)
27522 return BB;
27524 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
27525 // four operand definitions that are E[ABCD] registers. We skip them and
27526 // then insert the LEA.
27527 MachineBasicBlock::iterator MBBI(MI);
27528 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
27529 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
27530 --MBBI;
27531 addFullAddress(
27532 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
27534 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
27535 return BB;
27536 }
27538 case X86::LCMPXCHG16B:
27539 return BB;
27540 case X86::LCMPXCHG8B_SAVE_EBX:
27541 case X86::LCMPXCHG16B_SAVE_RBX: {
27542 unsigned BasePtr =
27543 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
27544 if (!BB->isLiveIn(BasePtr))
27545 BB->addLiveIn(BasePtr);
27546 return BB;
27547 }
27548 }
27549 }
27551 //===----------------------------------------------------------------------===//
27552 // X86 Optimization Hooks
27553 //===----------------------------------------------------------------------===//
27555 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
27556 KnownBits &Known,
27557 const APInt &DemandedElts,
27558 const SelectionDAG &DAG,
27559 unsigned Depth) const {
27560 unsigned BitWidth = Known.getBitWidth();
27561 unsigned Opc = Op.getOpcode();
27562 EVT VT = Op.getValueType();
27563 assert((Opc >= ISD::BUILTIN_OP_END ||
27564 Opc == ISD::INTRINSIC_WO_CHAIN ||
27565 Opc == ISD::INTRINSIC_W_CHAIN ||
27566 Opc == ISD::INTRINSIC_VOID) &&
27567 "Should use MaskedValueIsZero if you don't know whether Op"
27568 " is a target node!");
27573 case X86ISD::SETCC:
27574 Known.Zero.setBitsFrom(1);
27575 break;
27576 case X86ISD::MOVMSK: {
27577 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
27578 Known.Zero.setBitsFrom(NumLoBits);
27579 break;
27580 }
27581 case X86ISD::PEXTRB:
27582 case X86ISD::PEXTRW: {
27583 SDValue Src = Op.getOperand(0);
27584 EVT SrcVT = Src.getValueType();
27585 APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
27586 Op.getConstantOperandVal(1));
27587 DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
27588 Known = Known.zextOrTrunc(BitWidth);
27589 Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
27590 break;
27591 }
27592 case X86ISD::VSHLI:
27593 case X86ISD::VSRLI: {
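// Worked example (values for illustration only): a VSHLI by 8 on i32
// elements whose source has Known.Zero = 0xFFFF0000 shifts that to
// 0xFF000000, and the 8 low bits shifted in are also known zero,
// giving Known.Zero = 0xFF0000FF.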
27594 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
27595 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
27596 Known.setAllZero();
27597 break;
27598 }
27600 DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
27601 unsigned ShAmt = ShiftImm->getZExtValue();
27602 if (Opc == X86ISD::VSHLI) {
27603 Known.Zero <<= ShAmt;
27604 Known.One <<= ShAmt;
27605 // Low bits are known zero.
27606 Known.Zero.setLowBits(ShAmt);
27607 } else {
27608 Known.Zero.lshrInPlace(ShAmt);
27609 Known.One.lshrInPlace(ShAmt);
27610 // High bits are known zero.
27611 Known.Zero.setHighBits(ShAmt);
27612 }
27613 }
27614 break;
27615 }
27616 case X86ISD::VZEXT: {
27617 // TODO: Add DemandedElts support.
27618 SDValue N0 = Op.getOperand(0);
27619 unsigned NumElts = VT.getVectorNumElements();
27621 EVT SrcVT = N0.getValueType();
27622 unsigned InNumElts = SrcVT.getVectorNumElements();
27623 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27624 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27626 Known = KnownBits(InBitWidth);
27627 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27628 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27629 Known = Known.zext(BitWidth);
27630 Known.Zero.setBitsFrom(InBitWidth);
27631 break;
27632 }
27633 case X86ISD::CMOV: {
27634 DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
27635 // If we don't know any bits, early out.
27636 if (Known.isUnknown())
27637 break;
27638 KnownBits Known2;
27639 DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
27641 // Only known if known in both the LHS and RHS.
27642 Known.One &= Known2.One;
27643 Known.Zero &= Known2.Zero;
27644 break;
27645 }
27646 case X86ISD::UDIVREM8_ZEXT_HREG:
27647 // TODO: Support more than just the zero extended bits?
27648 if (Op.getResNo() != 1)
27649 break;
27650 // The remainder is zero extended.
27651 Known.Zero.setBitsFrom(8);
27652 break;
27653 }
27654 }
27656 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27657 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27658 unsigned Depth) const {
27659 unsigned VTBits = Op.getScalarValueSizeInBits();
27660 unsigned Opcode = Op.getOpcode();
27661 switch (Opcode) {
27662 case X86ISD::SETCC_CARRY:
27663 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27664 return VTBits;
27666 case X86ISD::VSEXT: {
27667 // TODO: Add DemandedElts support.
27668 SDValue Src = Op.getOperand(0);
27669 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27670 Tmp += VTBits - Src.getScalarValueSizeInBits();
27671 return Tmp;
27672 }
27674 case X86ISD::VTRUNC: {
27675 // TODO: Add DemandedElts support.
27676 SDValue Src = Op.getOperand(0);
27677 unsigned NumSrcBits = Src.getScalarValueSizeInBits();
27678 assert(VTBits < NumSrcBits && "Illegal truncation input type");
27679 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27680 if (Tmp > (NumSrcBits - VTBits))
27681 return Tmp - (NumSrcBits - VTBits);
27682 return 1;
27683 }
27685 case X86ISD::PACKSS: {
27686 // PACKSS is just a truncation if the sign bits extend to the packed size.
27687 // TODO: Add DemandedElts support.
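// Worked example (illustrative): when packing v4i32 to v8i16, sources with
// at least 20 sign bits each keep 20 - (32 - 16) = 4 sign bits per i16.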
27688 unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
27689 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
27690 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
27691 unsigned Tmp = std::min(Tmp0, Tmp1);
27692 if (Tmp > (SrcBits - VTBits))
27693 return Tmp - (SrcBits - VTBits);
27694 return 1;
27695 }
27697 case X86ISD::VSHLI: {
27698 SDValue Src = Op.getOperand(0);
27699 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27700 if (ShiftVal.uge(VTBits))
27701 return VTBits; // Shifted all bits out --> zero.
27702 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
27703 if (ShiftVal.uge(Tmp))
27704 return 1; // Shifted all sign bits out --> unknown.
27705 return Tmp - ShiftVal.getZExtValue();
27706 }
27708 case X86ISD::VSRAI: {
27709 SDValue Src = Op.getOperand(0);
27710 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27711 if (ShiftVal.uge(VTBits - 1))
27712 return VTBits; // Sign splat.
27713 unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
27714 ShiftVal += Tmp;
27715 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27716 }
27718 case X86ISD::PCMPGT:
27719 case X86ISD::PCMPEQ:
27720 case X86ISD::CMPP:
27721 case X86ISD::VPCOM:
27722 case X86ISD::VPCOMU:
27723 // Vector compares return zero/all-bits result values.
27724 return VTBits;
27726 case X86ISD::CMOV: {
27727 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
27728 if (Tmp0 == 1) return 1; // Early out.
27729 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
27730 return std::min(Tmp0, Tmp1);
27731 }
27732 case X86ISD::SDIVREM8_SEXT_HREG:
27733 // TODO: Support more than just the sign extended bits?
27734 if (Op.getResNo() != 1)
27735 break;
27736 // The remainder is sign extended from i8, leaving VTBits - 7 sign bits.
27737 return VTBits - 7;
27738 }
27740 // Fallback case.
27741 return 1;
27742 }
27744 SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
27745 if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
27746 return N->getOperand(0);
27747 return N;
27748 }
27750 /// Returns true (and the GlobalValue and the offset) if the node is a
27751 /// GlobalAddress + offset.
27752 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27753 const GlobalValue* &GA,
27754 int64_t &Offset) const {
27755 if (N->getOpcode() == X86ISD::Wrapper) {
27756 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27757 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27758 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27759 return true;
27760 }
27761 }
27762 return TargetLowering::isGAPlusOffset(N, GA, Offset);
27763 }
27765 // Attempt to match a combined shuffle mask against supported unary shuffle
27766 // instructions.
27767 // TODO: Investigate sharing more of this with shuffle lowering.
27768 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27769 bool AllowFloatDomain, bool AllowIntDomain,
27770 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27771 const X86Subtarget &Subtarget,
27772 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27773 unsigned NumMaskElts = Mask.size();
27774 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27776 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27777 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27778 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27779 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27780 unsigned MaxScale = 64 / MaskEltSize;
27781 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27782 bool Match = true;
27783 unsigned NumDstElts = NumMaskElts / Scale;
27784 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27785 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27786 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27787 }
27788 if (Match) {
27789 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27790 MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
27791 MVT::getIntegerVT(MaskEltSize);
27792 SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
27794 if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
27795 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27796 Shuffle = unsigned(X86ISD::VZEXT);
27797 } else
27798 Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27800 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27801 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27802 return true;
27803 }
27804 }
27805 }
27807 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
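// For example (illustrative): a v4f32 mask <0, Z, Z, Z> (Z = zeroable)
// keeps the low element and zeroes the rest, which is exactly the
// VZEXT_MOVL behaviour (e.g. a MOVSS merge with a zero vector).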
27808 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27809 isUndefOrEqual(Mask[0], 0) &&
27810 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27811 Shuffle = X86ISD::VZEXT_MOVL;
27812 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27813 return true;
27814 }
27816 // Check if we have SSE3, which would let us use MOVDDUP etc. These
27817 // instructions are no slower than UNPCKLPD but have the option to
27818 // fold the input operand into even an unaligned memory load.
27819 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27820 if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
27821 Shuffle = X86ISD::MOVDDUP;
27822 SrcVT = DstVT = MVT::v2f64;
27823 return true;
27824 }
27825 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27826 Shuffle = X86ISD::MOVSLDUP;
27827 SrcVT = DstVT = MVT::v4f32;
27828 return true;
27829 }
27830 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27831 Shuffle = X86ISD::MOVSHDUP;
27832 SrcVT = DstVT = MVT::v4f32;
27833 return true;
27834 }
27835 }
27837 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27838 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27839 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27840 Shuffle = X86ISD::MOVDDUP;
27841 SrcVT = DstVT = MVT::v4f64;
27842 return true;
27843 }
27844 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27845 Shuffle = X86ISD::MOVSLDUP;
27846 SrcVT = DstVT = MVT::v8f32;
27847 return true;
27848 }
27849 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27850 Shuffle = X86ISD::MOVSHDUP;
27851 SrcVT = DstVT = MVT::v8f32;
27852 return true;
27853 }
27854 }
27856 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27857 assert(Subtarget.hasAVX512() &&
27858 "AVX512 required for 512-bit vector shuffles");
27859 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27860 Shuffle = X86ISD::MOVDDUP;
27861 SrcVT = DstVT = MVT::v8f64;
27862 return true;
27863 }
27864 if (isTargetShuffleEquivalent(
27865 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27866 Shuffle = X86ISD::MOVSLDUP;
27867 SrcVT = DstVT = MVT::v16f32;
27868 return true;
27869 }
27870 if (isTargetShuffleEquivalent(
27871 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27872 Shuffle = X86ISD::MOVSHDUP;
27873 SrcVT = DstVT = MVT::v16f32;
27874 return true;
27875 }
27876 }
27878 // Attempt to match against broadcast-from-vector.
27879 if (Subtarget.hasAVX2()) {
27880 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27881 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27882 SrcVT = DstVT = MaskVT;
27883 Shuffle = X86ISD::VBROADCAST;
27884 return true;
27885 }
27886 }
27888 return false;
27889 }
27891 // Attempt to match a combined shuffle mask against supported unary immediate
27892 // permute instructions.
27893 // TODO: Investigate sharing more of this with shuffle lowering.
27894 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27895 const APInt &Zeroable,
27896 bool AllowFloatDomain,
27897 bool AllowIntDomain,
27898 const X86Subtarget &Subtarget,
27899 unsigned &Shuffle, MVT &ShuffleVT,
27900 unsigned &PermuteImm) {
27901 unsigned NumMaskElts = Mask.size();
27902 unsigned InputSizeInBits = MaskVT.getSizeInBits();
27903 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27904 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27906 bool ContainsZeros =
27907 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27909 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
27910 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27911 // Check for lane crossing permutes.
27912 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27913 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27914 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27915 Shuffle = X86ISD::VPERMI;
27916 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27917 PermuteImm = getV4X86ShuffleImm(Mask);
27918 return true;
27919 }
27920 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27921 SmallVector<int, 4> RepeatedMask;
27922 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27923 Shuffle = X86ISD::VPERMI;
27924 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27925 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27926 return true;
27927 }
27928 }
27929 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27930 // VPERMILPD can permute with a non-repeating shuffle.
27931 Shuffle = X86ISD::VPERMILPI;
27932 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27933 PermuteImm = 0;
27934 for (int i = 0, e = Mask.size(); i != e; ++i) {
27935 int M = Mask[i];
27936 if (M == SM_SentinelUndef)
27937 continue;
27938 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27939 PermuteImm |= (M & 1) << i;
27940 }
27941 return true;
27942 }
27943 }
27945 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27946 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
27947 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27948 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27949 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27950 SmallVector<int, 4> RepeatedMask;
27951 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27952 // Narrow the repeated mask to create 32-bit element permutes.
27953 SmallVector<int, 4> WordMask = RepeatedMask;
27954 if (MaskScalarSizeInBits == 64)
27955 scaleShuffleMask<int>(2, RepeatedMask, WordMask);
27957 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27958 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27959 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27960 PermuteImm = getV4X86ShuffleImm(WordMask);
27961 return true;
27962 }
27963 }
27965 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27966 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27967 SmallVector<int, 4> RepeatedMask;
27968 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27969 ArrayRef<int> LoMask(Mask.data() + 0, 4);
27970 ArrayRef<int> HiMask(Mask.data() + 4, 4);
27972 // PSHUFLW: permute lower 4 elements only.
27973 if (isUndefOrInRange(LoMask, 0, 4) &&
27974 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27975 Shuffle = X86ISD::PSHUFLW;
27976 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27977 PermuteImm = getV4X86ShuffleImm(LoMask);
27978 return true;
27979 }
27981 // PSHUFHW: permute upper 4 elements only.
27982 if (isUndefOrInRange(HiMask, 4, 8) &&
27983 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27984 // Offset the HiMask so that we can create the shuffle immediate.
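// Worked example (illustrative): HiMask <5, 4, 7, 6> offsets to
// <1, 0, 3, 2>, which getV4X86ShuffleImm packs two bits per element into
// 1 | (0 << 2) | (3 << 4) | (2 << 6) = 0xB1.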
27985 int OffsetHiMask[4];
27986 for (int i = 0; i != 4; ++i)
27987 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27989 Shuffle = X86ISD::PSHUFHW;
27990 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27991 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27992 return true;
27993 }
27994 }
27995 }
27997 // Attempt to match against byte/bit shifts.
27998 // FIXME: Add 512-bit support.
27999 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28000 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28001 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
28002 MaskScalarSizeInBits, Mask,
28003 0, Zeroable, Subtarget);
28004 if (0 < ShiftAmt) {
28005 PermuteImm = (unsigned)ShiftAmt;
28006 return true;
28007 }
28008 }
28010 return false;
28011 }
28013 // Attempt to match a combined unary shuffle mask against supported binary
28014 // shuffle instructions.
28015 // TODO: Investigate sharing more of this with shuffle lowering.
28016 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28017 bool AllowFloatDomain, bool AllowIntDomain,
28018 SDValue &V1, SDValue &V2, SDLoc &DL,
28019 SelectionDAG &DAG,
28020 const X86Subtarget &Subtarget,
28021 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
28022 bool IsUnary) {
28023 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28025 if (MaskVT.is128BitVector()) {
28026 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
28027 V2 = V1;
28028 Shuffle = X86ISD::MOVLHPS;
28029 SrcVT = DstVT = MVT::v4f32;
28030 return true;
28031 }
28032 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
28033 V2 = V1;
28034 Shuffle = X86ISD::MOVHLPS;
28035 SrcVT = DstVT = MVT::v4f32;
28036 return true;
28037 }
28038 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
28039 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28040 std::swap(V1, V2);
28041 Shuffle = X86ISD::MOVSD;
28042 SrcVT = DstVT = MaskVT;
28043 return true;
28044 }
28045 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
28046 (AllowFloatDomain || !Subtarget.hasSSE41())) {
28047 Shuffle = X86ISD::MOVSS;
28048 SrcVT = DstVT = MaskVT;
28049 return true;
28050 }
28051 }
28053 // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
28054 // TODO add support for 256/512-bit types.
28055 if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
28056 if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
28057 Subtarget)) {
28058 DstVT = MaskVT;
28059 return true;
28060 }
28061 }
28063 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
28064 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
28065 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28066 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
28067 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
28068 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
28069 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
28070 DAG, Subtarget)) {
28071 SrcVT = DstVT = MaskVT;
28072 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
28073 SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
28074 return true;
28075 }
28076 }
28078 return false;
28079 }
28081 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
28082 const APInt &Zeroable,
28083 bool AllowFloatDomain,
28084 bool AllowIntDomain,
28085 SDValue &V1, SDValue &V2, SDLoc &DL,
28086 SelectionDAG &DAG,
28087 const X86Subtarget &Subtarget,
28088 unsigned &Shuffle, MVT &ShuffleVT,
28089 unsigned &PermuteImm) {
28090 unsigned NumMaskElts = Mask.size();
28091 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
28093 // Attempt to match against PALIGNR byte rotate.
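// For example (illustrative): a v16i8 mask <1, 2, ..., 15, 16> reads the
// trailing 15 bytes of one source followed by the first byte of the other:
// a rotate of the concatenation by one byte, i.e. PALIGNR with immediate 1.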
28094 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28095 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
28096 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
28097 if (0 < ByteRotation) {
28098 Shuffle = X86ISD::PALIGNR;
28099 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
28100 PermuteImm = ByteRotation;
28101 return true;
28102 }
28103 }
28105 // Attempt to combine to X86ISD::BLENDI.
28106 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
28107 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
28108 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
28109 uint64_t BlendMask = 0;
28110 bool ForceV1Zero = false, ForceV2Zero = false;
28111 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
28112 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
28113 BlendMask)) {
28114 if (MaskVT == MVT::v16i16) {
28115 // We can only use v16i16 PBLENDW if the lanes are repeated.
28116 SmallVector<int, 8> RepeatedMask;
28117 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
28118 RepeatedMask)) {
28119 assert(RepeatedMask.size() == 8 &&
28120 "Repeated mask size doesn't match!");
28122 for (int i = 0; i < 8; ++i)
28123 if (RepeatedMask[i] >= 8)
28124 PermuteImm |= 1 << i;
28125 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28126 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28127 Shuffle = X86ISD::BLENDI;
28128 ShuffleVT = MaskVT;
28129 return true;
28130 }
28131 } else {
28132 // Determine a type compatible with X86ISD::BLENDI.
28133 ShuffleVT = MaskVT;
28134 if (Subtarget.hasAVX2()) {
28135 if (ShuffleVT == MVT::v4i64)
28136 ShuffleVT = MVT::v8i32;
28137 else if (ShuffleVT == MVT::v2i64)
28138 ShuffleVT = MVT::v4i32;
28139 } else {
28140 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
28141 ShuffleVT = MVT::v8i16;
28142 else if (ShuffleVT == MVT::v4i64)
28143 ShuffleVT = MVT::v4f64;
28144 else if (ShuffleVT == MVT::v8i32)
28145 ShuffleVT = MVT::v8f32;
28146 }
28148 if (!ShuffleVT.isFloatingPoint()) {
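// Worked example (illustrative): a v2i64 blend mask 0b01 scaled by 2 for a
// v4i32 blend becomes 0b0011, i.e. each mask bit is repeated once per
// split subelement.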
28149 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
28150 BlendMask =
28151 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
28152 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
28153 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
28154 }
28156 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
28157 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
28158 PermuteImm = (unsigned)BlendMask;
28159 Shuffle = X86ISD::BLENDI;
28160 return true;
28161 }
28162 }
28163 }
28165 // Attempt to combine to INSERTPS.
28166 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
28167 MaskVT.is128BitVector()) {
28168 if (Zeroable.getBoolValue() &&
28169 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
28170 Shuffle = X86ISD::INSERTPS;
28171 ShuffleVT = MVT::v4f32;
28172 return true;
28173 }
28174 }
28176 // Attempt to combine to SHUFPD.
28177 if (AllowFloatDomain && EltSizeInBits == 64 &&
28178 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
28179 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28180 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28181 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
28182 Shuffle = X86ISD::SHUFP;
28183 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
28184 return true;
28185 }
28186 }
28188 // Attempt to combine to SHUFPS.
28189 if (AllowFloatDomain && EltSizeInBits == 32 &&
28190 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
28191 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
28192 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
28193 SmallVector<int, 4> RepeatedMask;
28194 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
28195 // Match each half of the repeated mask, to determine if it's just
28196 // referencing one of the vectors, is zeroable or entirely undef.
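// Worked example (illustrative): repeated mask <0, 1, 7, 5> takes its low
// half from V1 (elements 0,1) and its high half from V2 (elements 3,1),
// giving SHUFPS with immediate 0 | (1 << 2) | (3 << 4) | (1 << 6) = 0x74.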
28197 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
28198 int M0 = RepeatedMask[Offset];
28199 int M1 = RepeatedMask[Offset + 1];
28201 if (isUndefInRange(RepeatedMask, Offset, 2)) {
28202 return DAG.getUNDEF(MaskVT);
28203 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
28204 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
28205 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
28206 return getZeroVector(MaskVT, Subtarget, DAG, DL);
28207 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
28208 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28209 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28210 return V1;
28211 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
28212 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
28213 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
28214 return V2;
28215 }
28217 return SDValue();
28218 };
28220 int ShufMask[4] = {-1, -1, -1, -1};
28221 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
28222 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
28224 if (Lo && Hi) {
28225 V1 = Lo;
28226 V2 = Hi;
28227 Shuffle = X86ISD::SHUFP;
28228 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
28229 PermuteImm = getV4X86ShuffleImm(ShufMask);
28230 return true;
28231 }
28232 }
28233 }
28235 return false;
28236 }
28238 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
28239 /// possible.
28240 ///
28241 /// This is the leaf of the recursive combine below. When we have found some
28242 /// chain of single-use x86 shuffle instructions and accumulated the combined
28243 /// shuffle mask represented by them, this will try to pattern match that mask
28244 /// into either a single instruction if there is a special purpose instruction
28245 /// for this operation, or into a PSHUFB instruction which is a fully general
28246 /// instruction but should only be used to replace chains over a certain depth.
28247 static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
28248 ArrayRef<int> BaseMask, int Depth,
28249 bool HasVariableMask, SelectionDAG &DAG,
28250 TargetLowering::DAGCombinerInfo &DCI,
28251 const X86Subtarget &Subtarget) {
28252 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
28253 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
28254 "Unexpected number of shuffle inputs!");
28256 // Find the inputs that enter the chain. Note that multiple uses are OK
28257 // here, we're not going to remove the operands we find.
28258 bool UnaryShuffle = (Inputs.size() == 1);
28259 SDValue V1 = peekThroughBitcasts(Inputs[0]);
28260 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
28261 : peekThroughBitcasts(Inputs[1]));
28263 MVT VT1 = V1.getSimpleValueType();
28264 MVT VT2 = V2.getSimpleValueType();
28265 MVT RootVT = Root.getSimpleValueType();
28266 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
28267 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
28268 "Vector size mismatch");
28273 unsigned NumBaseMaskElts = BaseMask.size();
28274 if (NumBaseMaskElts == 1) {
28275 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
28276 return DAG.getBitcast(RootVT, V1);
28277 }
28279 unsigned RootSizeInBits = RootVT.getSizeInBits();
28280 unsigned NumRootElts = RootVT.getVectorNumElements();
28281 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
28282 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
28283 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
28285 // Don't combine if we are an AVX512/EVEX target and the mask element size
28286 // is different from the root element size - this would prevent writemasks
28287 // from being reused.
28288 // TODO - this currently prevents all lane shuffles from occurring.
28289 // TODO - check for writemasks usage instead of always preventing combining.
28290 // TODO - attempt to narrow Mask back to writemask size.
28291 bool IsEVEXShuffle =
28292 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
28293 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
28294 return SDValue();
28296 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
28298 // Handle 128-bit lane shuffles of 256-bit vectors.
28299 // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
28300 // we need to use the zeroing feature.
28301 // TODO - this should support binary shuffles.
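// Encoding example (illustrative): BaseMask <1, SM_SentinelZero> selects
// the high 128-bit lane into the low half and zeroes the upper half, i.e.
// PermMask = (1 << 0) | (0x8 << 4) = 0x81.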
28302 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
28303 !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
28304 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
28305 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
28306 return SDValue(); // Nothing to do!
28307 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
28308 unsigned PermMask = 0;
28309 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
28310 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
28312 Res = DAG.getBitcast(ShuffleVT, V1);
28313 DCI.AddToWorklist(Res.getNode());
28314 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
28315 DAG.getUNDEF(ShuffleVT),
28316 DAG.getConstant(PermMask, DL, MVT::i8));
28317 DCI.AddToWorklist(Res.getNode());
28318 return DAG.getBitcast(RootVT, Res);
28319 }
28321 // For masks that have been widened to 128-bit elements or more,
28322 // narrow back down to 64-bit elements.
28323 SmallVector<int, 64> Mask;
28324 if (BaseMaskEltSizeInBits > 64) {
28325 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
28326 int MaskScale = BaseMaskEltSizeInBits / 64;
28327 scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
28328 } else {
28329 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
28330 }
28332 unsigned NumMaskElts = Mask.size();
28333 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
28335 // Determine the effective mask value type.
28336 FloatDomain &= (32 <= MaskEltSizeInBits);
28337 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
28338 : MVT::getIntegerVT(MaskEltSizeInBits);
28339 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
28341 // Only allow legal mask types.
28342 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
28343 return SDValue();
28345 // Attempt to match the mask against known shuffle patterns.
28346 MVT ShuffleSrcVT, ShuffleVT;
28347 unsigned Shuffle, PermuteImm;
28349 // Which shuffle domains are permitted?
28350 // Permit domain crossing at higher combine depths.
28351 bool AllowFloatDomain = FloatDomain || (Depth > 3);
28352 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
28353 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
28355 // Determine zeroable mask elements.
28356 APInt Zeroable(NumMaskElts, 0);
28357 for (unsigned i = 0; i != NumMaskElts; ++i)
28358 if (isUndefOrZero(Mask[i]))
28359 Zeroable.setBit(i);
28361 if (UnaryShuffle) {
28362 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
28363 // directly if we don't shuffle the lower element and we shuffle the upper
28364 // (zero) elements within themselves.
28365 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
28366 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
28367 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
28368 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
28369 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
28370 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
28371 return DAG.getBitcast(RootVT, V1);
28372 }
28373 }
28375 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28376 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
28377 ShuffleVT)) {
28378 if (Depth == 1 && Root.getOpcode() == Shuffle)
28379 return SDValue(); // Nothing to do!
28380 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28381 return SDValue(); // AVX512 Writemask clash.
28382 Res = DAG.getBitcast(ShuffleSrcVT, V1);
28383 DCI.AddToWorklist(Res.getNode());
28384 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
28385 DCI.AddToWorklist(Res.getNode());
28386 return DAG.getBitcast(RootVT, Res);
28387 }
28389 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28390 AllowIntDomain, Subtarget, Shuffle,
28391 ShuffleVT, PermuteImm)) {
28392 if (Depth == 1 && Root.getOpcode() == Shuffle)
28393 return SDValue(); // Nothing to do!
28394 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28395 return SDValue(); // AVX512 Writemask clash.
28396 Res = DAG.getBitcast(ShuffleVT, V1);
28397 DCI.AddToWorklist(Res.getNode());
28398 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
28399 DAG.getConstant(PermuteImm, DL, MVT::i8));
28400 DCI.AddToWorklist(Res.getNode());
28401 return DAG.getBitcast(RootVT, Res);
28402 }
28405 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
28406 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
28407 ShuffleVT, UnaryShuffle)) {
28408 if (Depth == 1 && Root.getOpcode() == Shuffle)
28409 return SDValue(); // Nothing to do!
28410 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28411 return SDValue(); // AVX512 Writemask clash.
28412 V1 = DAG.getBitcast(ShuffleSrcVT, V1);
28413 DCI.AddToWorklist(V1.getNode());
28414 V2 = DAG.getBitcast(ShuffleSrcVT, V2);
28415 DCI.AddToWorklist(V2.getNode());
28416 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
28417 DCI.AddToWorklist(Res.getNode());
28418 return DAG.getBitcast(RootVT, Res);
28419 }
28421 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
28422 AllowIntDomain, V1, V2, DL, DAG,
28423 Subtarget, Shuffle, ShuffleVT,
28424 PermuteImm)) {
28425 if (Depth == 1 && Root.getOpcode() == Shuffle)
28426 return SDValue(); // Nothing to do!
28427 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
28428 return SDValue(); // AVX512 Writemask clash.
28429 V1 = DAG.getBitcast(ShuffleVT, V1);
28430 DCI.AddToWorklist(V1.getNode());
28431 V2 = DAG.getBitcast(ShuffleVT, V2);
28432 DCI.AddToWorklist(V2.getNode());
28433 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
28434 DAG.getConstant(PermuteImm, DL, MVT::i8));
28435 DCI.AddToWorklist(Res.getNode());
28436 return DAG.getBitcast(RootVT, Res);
28437 }
28439 // Typically from here on, we need an integer version of MaskVT.
28440 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
28441 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
28443 // Annoyingly, SSE4A instructions don't map into the above match helpers.
28444 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
28445 uint64_t BitLen, BitIdx;
28446 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
28447 Zeroable)) {
28448 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
28449 return SDValue(); // Nothing to do!
28450 V1 = DAG.getBitcast(IntMaskVT, V1);
28451 DCI.AddToWorklist(V1.getNode());
28452 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
28453 DAG.getConstant(BitLen, DL, MVT::i8),
28454 DAG.getConstant(BitIdx, DL, MVT::i8));
28455 DCI.AddToWorklist(Res.getNode());
28456 return DAG.getBitcast(RootVT, Res);
28457 }
28459 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
28460 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
28461 return SDValue(); // Nothing to do!
28462 V1 = DAG.getBitcast(IntMaskVT, V1);
28463 DCI.AddToWorklist(V1.getNode());
28464 V2 = DAG.getBitcast(IntMaskVT, V2);
28465 DCI.AddToWorklist(V2.getNode());
28466 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
28467 DAG.getConstant(BitLen, DL, MVT::i8),
28468 DAG.getConstant(BitIdx, DL, MVT::i8));
28469 DCI.AddToWorklist(Res.getNode());
28470 return DAG.getBitcast(RootVT, Res);
28471 }
28472 }
28474 // Don't try to re-form single instruction chains under any circumstances now
28475 // that we've done encoding canonicalization for them.
28476 if (Depth < 2)
28477 return SDValue();
28479 // Depth threshold above which we can efficiently use variable mask shuffles.
28480 // TODO This should probably be target specific.
28481 bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
28483 bool MaskContainsZeros =
28484 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
28486 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
28487 // If we have a single input lane-crossing shuffle then lower to VPERMV.
28488 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28489 ((Subtarget.hasAVX2() &&
28490 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28491 (Subtarget.hasAVX512() &&
28492 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28493 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28494 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28495 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28496 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28497 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28498 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28499 DCI.AddToWorklist(VPermMask.getNode());
28500 Res = DAG.getBitcast(MaskVT, V1);
28501 DCI.AddToWorklist(Res.getNode());
28502 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
28503 DCI.AddToWorklist(Res.getNode());
28504 return DAG.getBitcast(RootVT, Res);
28505 }
28507 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
28508 // vector as the second source.
28509 if (UnaryShuffle && AllowVariableMask &&
28510 ((Subtarget.hasAVX512() &&
28511 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28512 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28513 (Subtarget.hasVLX() &&
28514 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28515 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28516 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28517 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28518 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28519 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28520 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
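// For example (illustrative): with NumMaskElts = 4, a mask <3, Z, 1, Z>
// (Z = SM_SentinelZero) becomes <3, 5, 1, 7>, where indices 4..7 pick
// lanes of the all-zeroes second source.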
28521 for (unsigned i = 0; i != NumMaskElts; ++i)
28522 if (Mask[i] == SM_SentinelZero)
28523 Mask[i] = NumMaskElts + i;
28525 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28526 DCI.AddToWorklist(VPermMask.getNode());
28527 Res = DAG.getBitcast(MaskVT, V1);
28528 DCI.AddToWorklist(Res.getNode());
28529 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
28530 DCI.AddToWorklist(Zero.getNode());
28531 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
28532 DCI.AddToWorklist(Res.getNode());
28533 return DAG.getBitcast(RootVT, Res);
28534 }
28536 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
28537 if (AllowVariableMask && !MaskContainsZeros &&
28538 ((Subtarget.hasAVX512() &&
28539 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
28540 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
28541 (Subtarget.hasVLX() &&
28542 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
28543 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
28544 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
28545 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
28546 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
28547 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
28548 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
28549 DCI.AddToWorklist(VPermMask.getNode());
28550 V1 = DAG.getBitcast(MaskVT, V1);
28551 DCI.AddToWorklist(V1.getNode());
28552 V2 = DAG.getBitcast(MaskVT, V2);
28553 DCI.AddToWorklist(V2.getNode());
28554 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
28555 DCI.AddToWorklist(Res.getNode());
28556 return DAG.getBitcast(RootVT, Res);
28557 }
28558 }
28561 // See if we can combine a single input shuffle with zeros to a bit-mask,
28562 // which is much simpler than any shuffle.
28563 if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
28564 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
28565 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
28566 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
28567 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
28568 APInt UndefElts(NumMaskElts, 0);
28569 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
28570 for (unsigned i = 0; i != NumMaskElts; ++i) {
28571 int M = Mask[i];
28572 if (M == SM_SentinelUndef) {
28573 UndefElts.setBit(i);
28574 continue;
28575 }
28576 if (M == SM_SentinelZero)
28577 continue;
28578 EltBits[i] = AllOnes;
28579 }
28580 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
28581 DCI.AddToWorklist(BitMask.getNode());
28582 Res = DAG.getBitcast(MaskVT, V1);
28583 DCI.AddToWorklist(Res.getNode());
28584 unsigned AndOpcode =
28585 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
28586 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
28587 DCI.AddToWorklist(Res.getNode());
28588 return DAG.getBitcast(RootVT, Res);
28589 }
28591 // If we have a single input shuffle with different shuffle patterns in the
28592 // 128-bit lanes, use the variable mask to VPERMILPS.
28593 // TODO Combine other mask types at higher depths.
28594 if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
28595 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
28596 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
28597 SmallVector<SDValue, 16> VPermIdx;
28598 for (int M : Mask) {
28599 SDValue Idx =
28600 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
28601 VPermIdx.push_back(Idx);
28602 }
28603 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
28604 DCI.AddToWorklist(VPermMask.getNode());
28605 Res = DAG.getBitcast(MaskVT, V1);
28606 DCI.AddToWorklist(Res.getNode());
28607 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
28608 DCI.AddToWorklist(Res.getNode());
28609 return DAG.getBitcast(RootVT, Res);
28610 }
28612 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
28613 // to VPERMIL2PD/VPERMIL2PS.
28614 if (AllowVariableMask && Subtarget.hasXOP() &&
28615 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
28616 MaskVT == MVT::v8f32)) {
28617 // VPERMIL2 Operation.
28618 // Bits[3] - Match Bit.
28619 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
28620 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
28621 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
28622 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
28623 SmallVector<int, 8> VPerm2Idx;
28624 unsigned M2ZImm = 0;
28625 for (int M : Mask) {
28626 if (M == SM_SentinelUndef) {
28627 VPerm2Idx.push_back(-1);
28628 continue;
28629 }
28630 if (M == SM_SentinelZero) {
28631 M2ZImm = 2;
28632 VPerm2Idx.push_back(8);
28633 continue;
28634 }
28635 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
28636 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
28637 VPerm2Idx.push_back(Index);
28638 }
28639 V1 = DAG.getBitcast(MaskVT, V1);
28640 DCI.AddToWorklist(V1.getNode());
28641 V2 = DAG.getBitcast(MaskVT, V2);
28642 DCI.AddToWorklist(V2.getNode());
28643 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
28644 DCI.AddToWorklist(VPerm2MaskOp.getNode());
28645 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
28646 DAG.getConstant(M2ZImm, DL, MVT::i8));
28647 DCI.AddToWorklist(Res.getNode());
28648 return DAG.getBitcast(RootVT, Res);
28649 }
28651 // If we have 3 or more shuffle instructions or a chain involving a variable
28652 // mask, we can replace them with a single PSHUFB instruction profitably.
28653 // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
28654 // instructions, but in practice PSHUFB tends to be *very* fast so we're
28655 // more aggressive.
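// Byte mask example (illustrative): widening a v4i32 mask <1, 0, Z, U> to
// bytes (Ratio = 4) produces the PSHUFB control
// <4,5,6,7, 0,1,2,3, 255,255,255,255, undef x4>; a control byte with
// bit 7 set (255) zeroes the corresponding destination byte.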
28656 if (UnaryShuffle && AllowVariableMask &&
28657 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28658 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
28659 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
28660 SmallVector<SDValue, 16> PSHUFBMask;
28661 int NumBytes = RootVT.getSizeInBits() / 8;
28662 int Ratio = NumBytes / NumMaskElts;
28663 for (int i = 0; i < NumBytes; ++i) {
28664 int M = Mask[i / Ratio];
28665 if (M == SM_SentinelUndef) {
28666 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28667 continue;
28668 }
28669 if (M == SM_SentinelZero) {
28670 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28671 continue;
28672 }
28673 M = Ratio * M + i % Ratio;
28674 assert((M / 16) == (i / 16) && "Lane crossing detected");
28675 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28676 }
28677 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28678 Res = DAG.getBitcast(ByteVT, V1);
28679 DCI.AddToWorklist(Res.getNode());
28680 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28681 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28682 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28683 DCI.AddToWorklist(Res.getNode());
28684 return DAG.getBitcast(RootVT, Res);
28685 }
28687 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28688 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28689 // slower than PSHUFB on targets that support both.
28690 if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
28691 // VPPERM Mask Operation
28692 // Bits[4:0] - Byte Index (0 - 31)
28693 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
28694 SmallVector<SDValue, 16> VPPERMMask;
28695 int NumBytes = 16;
28696 int Ratio = NumBytes / NumMaskElts;
28697 for (int i = 0; i < NumBytes; ++i) {
28698 int M = Mask[i / Ratio];
28699 if (M == SM_SentinelUndef) {
28700 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28701 continue;
28702 }
28703 if (M == SM_SentinelZero) {
28704 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28705 continue;
28706 }
28707 M = Ratio * M + i % Ratio;
28708 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28709 }
28710 MVT ByteVT = MVT::v16i8;
28711 V1 = DAG.getBitcast(ByteVT, V1);
28712 DCI.AddToWorklist(V1.getNode());
28713 V2 = DAG.getBitcast(ByteVT, V2);
28714 DCI.AddToWorklist(V2.getNode());
28715 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28716 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28717 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28718 DCI.AddToWorklist(Res.getNode());
28719 return DAG.getBitcast(RootVT, Res);
28720 }
28722 // Failed to find any combines.
28723 return SDValue();
28724 }
28726 // Attempt to constant fold all of the constant source ops.
28727 // Returns true if the entire shuffle is folded to a constant.
28728 // TODO: Extend this to merge multiple constant Ops and update the mask.
28729 static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28730 ArrayRef<int> Mask, SDValue Root,
28731 bool HasVariableMask,
28732 SelectionDAG &DAG,
28733 TargetLowering::DAGCombinerInfo &DCI,
28734 const X86Subtarget &Subtarget) {
28735 MVT VT = Root.getSimpleValueType();
28737 unsigned SizeInBits = VT.getSizeInBits();
28738 unsigned NumMaskElts = Mask.size();
28739 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28740 unsigned NumOps = Ops.size();
28742 // Extract constant bits from each source op.
28743 bool OneUseConstantOp = false;
28744 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28745 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28746 for (unsigned i = 0; i != NumOps; ++i) {
28747 SDValue SrcOp = Ops[i];
28748 OneUseConstantOp |= SrcOp.hasOneUse();
28749 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28750 RawBitsOps[i]))
28751 return SDValue();
28752 }
28754 // Only fold if at least one of the constants is only used once or
28755 // the combined shuffle has included a variable mask shuffle, this
28756 // is to avoid constant pool bloat.
28757 if (!OneUseConstantOp && !HasVariableMask)
28758 return SDValue();
28760 // Shuffle the constant bits according to the mask.
28761 APInt UndefElts(NumMaskElts, 0);
28762 APInt ZeroElts(NumMaskElts, 0);
28763 APInt ConstantElts(NumMaskElts, 0);
28764 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28765 APInt::getNullValue(MaskSizeInBits));
28766 for (unsigned i = 0; i != NumMaskElts; ++i) {
28767 int M = Mask[i];
28768 if (M == SM_SentinelUndef) {
28769 UndefElts.setBit(i);
28770 continue;
28771 } else if (M == SM_SentinelZero) {
28772 ZeroElts.setBit(i);
28773 continue;
28774 }
28775 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28777 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28778 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28780 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28781 if (SrcUndefElts[SrcMaskIdx]) {
28782 UndefElts.setBit(i);
28786 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28787 APInt &Bits = SrcEltBits[SrcMaskIdx];
28788 if (!Bits) {
28789 ZeroElts.setBit(i);
28790 continue;
28791 }
28793 ConstantElts.setBit(i);
28794 ConstantBitData[i] = Bits;
28795 }
28796 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28798 // Create the constant data.
28799 MVT MaskSVT;
28800 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28801 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28802 else
28803 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28805 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28807 SDLoc DL(Root);
28808 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28809 DCI.AddToWorklist(CstOp.getNode());
28810 return DAG.getBitcast(VT, CstOp);
28811 }
28813 /// \brief Fully generic combining of x86 shuffle instructions.
28814 ///
28815 /// This should be the last combine run over the x86 shuffle instructions. Once
28816 /// they have been fully optimized, this will recursively consider all chains
28817 /// of single-use shuffle instructions, build a generic model of the cumulative
28818 /// shuffle operation, and check for simpler instructions which implement this
28819 /// operation. We use this primarily for two purposes:
28821 /// 1) Collapse generic shuffles to specialized single instructions when
28822 /// equivalent. In most cases, this is just an encoding size win, but
28823 /// sometimes we will collapse multiple generic shuffles into a single
28824 /// special-purpose shuffle.
28825 /// 2) Look for sequences of shuffle instructions with 3 or more total
28826 /// instructions, and replace them with the slightly more expensive SSSE3
28827 /// PSHUFB instruction if available. We do this as the last combining step
28828 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
28829 /// a suitable short sequence of other instructions. The PSHUFB will either
28830 /// use a register or have to read from memory and so is slightly (but only
28831 /// slightly) more expensive than the other shuffle instructions.
28833 /// Because this is inherently a quadratic operation (for each shuffle in
28834 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28835 /// This should never be an issue in practice as the shuffle lowering doesn't
28836 /// produce sequences of more than 8 instructions.
28838 /// FIXME: We will currently miss some cases where the redundant shuffling
28839 /// would simplify under the threshold for PSHUFB formation because of
28840 /// combine-ordering. To fix this, we should do the redundant instruction
28841 /// combining in this recursive walk.
28842 static SDValue combineX86ShufflesRecursively(
28843 ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
28844 ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
28845 bool HasVariableMask, SelectionDAG &DAG,
28846 TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
28847 // Bound the depth of our recursive combine because this is ultimately
28848 // quadratic in nature.
28849 if (Depth > 8)
28850 return SDValue();
28852 // Directly rip through bitcasts to find the underlying operand.
28853 SDValue Op = SrcOps[SrcOpIndex];
28854 Op = peekThroughOneUseBitcasts(Op);
28856 MVT VT = Op.getSimpleValueType();
28857 if (!VT.isVector())
28858 return SDValue(); // Bail if we hit a non-vector.
28860 assert(Root.getSimpleValueType().isVector() &&
28861 "Shuffles operate on vector types!");
28862 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28863 "Can only combine shuffles of the same vector register size.");
28865 // Extract target shuffle mask and resolve sentinels and inputs.
28866 SmallVector<int, 64> OpMask;
28867 SmallVector<SDValue, 2> OpInputs;
28868 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28869 return SDValue();
28871 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28872 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28873 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28875 // Add the inputs to the Ops list, avoiding duplicates.
28876 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28878 int InputIdx0 = -1, InputIdx1 = -1;
28879 for (int i = 0, e = Ops.size(); i < e; ++i) {
28880 SDValue BC = peekThroughBitcasts(Ops[i]);
28881 if (Input0 && BC == peekThroughBitcasts(Input0))
28882 InputIdx0 = i;
28883 if (Input1 && BC == peekThroughBitcasts(Input1))
28884 InputIdx1 = i;
28885 }
28887 if (Input0 && InputIdx0 < 0) {
28888 InputIdx0 = SrcOpIndex;
28889 Ops[SrcOpIndex] = Input0;
28890 }
28891 if (Input1 && InputIdx1 < 0) {
28892 InputIdx1 = Ops.size();
28893 Ops.push_back(Input1);
28894 }
28896 assert(((RootMask.size() > OpMask.size() &&
28897 RootMask.size() % OpMask.size() == 0) ||
28898 (OpMask.size() > RootMask.size() &&
28899 OpMask.size() % RootMask.size() == 0) ||
28900 OpMask.size() == RootMask.size()) &&
28901 "The smaller number of elements must divide the larger.");
28903 // This function can be performance-critical, so we rely on the power-of-2
28904 // knowledge that we have about the mask sizes to replace div/rem ops with
28905 // bit-masks and shifts.
28906 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28907 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28908 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28909 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28911 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28912 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28913 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28914 assert((RootRatio == 1 || OpRatio == 1) &&
28915 "Must not have a ratio for both incoming and op masks!");
28917 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28918 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28919 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28920 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28921 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28923 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28925 // Merge this shuffle operation's mask into our accumulated mask. Note that
28926 // this shuffle's mask will be the first applied to the input, followed by the
28927 // root mask to get us all the way to the root value arrangement. The reason
28928 // for this order is that we are recursing up the operation chain.
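// For example (illustrative): if this op's mask is <2, 3, 0, 1> and the
// accumulated root mask is <1, 0, 3, 2>, the merged mask is
// OpMask[RootMask[i]] = <3, 2, 1, 0>.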
28929 for (unsigned i = 0; i < MaskWidth; ++i) {
28930 unsigned RootIdx = i >> RootRatioLog2;
28931 if (RootMask[RootIdx] < 0) {
28932 // This is a zero or undef lane, we're done.
28933 Mask[i] = RootMask[RootIdx];
28934 continue;
28935 }
28937 unsigned RootMaskedIdx =
28938 RootRatio == 1
28939 ? RootMask[RootIdx]
28940 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28942 // Just insert the scaled root mask value if it references an input other
28943 // than the SrcOp we're currently inserting.
28944 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28945 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28946 Mask[i] = RootMaskedIdx;
28947 continue;
28948 }
28950 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28951 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28952 if (OpMask[OpIdx] < 0) {
28953 // The incoming lanes are zero or undef, it doesn't matter which ones we
28954 // are using.
28955 Mask[i] = OpMask[OpIdx];
28956 continue;
28957 }
28959 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28960 unsigned OpMaskedIdx =
28961 OpRatio == 1
28962 ? OpMask[OpIdx]
28963 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
28965 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28966 if (OpMask[OpIdx] < (int)OpMask.size()) {
28967 assert(0 <= InputIdx0 && "Unknown target shuffle input");
28968 OpMaskedIdx += InputIdx0 * MaskWidth;
28969 } else {
28970 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28971 OpMaskedIdx += InputIdx1 * MaskWidth;
28972 }
28974 Mask[i] = OpMaskedIdx;
28975 }
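// Worked example (values are illustrative): a v2i64 root mask {1,0} over an
// op with v4i32 mask {2,3,0,1} scales the root to {2,3,0,1} and maps it
// through the op mask to the identity {0,1,2,3}, i.e. the two swaps cancel.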
28977 // Handle the all undef/zero cases early.
28978 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
28979 return DAG.getUNDEF(Root.getValueType());
28981 // TODO - should we handle the mixed zero/undef case as well? Just returning
28982 // a zero mask will lose information on undef elements possibly reducing
28983 // future combine possibilities.
28984 if (all_of(Mask, [](int Idx) { return Idx < 0; }))
28985 return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
28986 SDLoc(Root));
28988 // Remove unused shuffle source ops.
28989 resolveTargetShuffleInputsAndMask(Ops, Mask);
28990 assert(!Ops.empty() && "Shuffle with no inputs detected");
28992 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28994 // Update the list of shuffle nodes that have been combined so far.
28995 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28996 SrcNodes.end());
28997 CombinedNodes.push_back(Op.getNode());
28999 // See if we can recurse into each shuffle source op (if it's a target
29000 // shuffle). The source op should only be combined if it either has a
29001 // single use (i.e. current Op) or all its users have already been combined.
29002 for (int i = 0, e = Ops.size(); i < e; ++i)
29003 if (Ops[i].getNode()->hasOneUse() ||
29004 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
29005 if (SDValue Res = combineX86ShufflesRecursively(
29006 Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
29007 DAG, DCI, Subtarget))
29008 return Res;
29010 // Attempt to constant fold all of the constant source ops.
29011 if (SDValue Cst = combineX86ShufflesConstants(
29012 Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
29013 return Cst;
29015 // We can only combine unary and binary shuffle mask cases.
29016 if (Ops.size() > 2)
29017 return SDValue();
29019 // Minor canonicalization of the accumulated shuffle mask to make it easier
29020 // to match below. All this does is detect masks with sequential pairs of
29021 // elements, and shrink them to the half-width mask. It does this in a loop
29022 // so it will reduce the size of the mask to the minimal width mask which
29023 // performs an equivalent shuffle.
29024 SmallVector<int, 64> WidenedMask;
29025 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
29026 Mask = std::move(WidenedMask);
29027 }
29029 // Canonicalization of binary shuffle masks to improve pattern matching by
29030 // commuting the inputs.
29031 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
29032 ShuffleVectorSDNode::commuteMask(Mask);
29033 std::swap(Ops[0], Ops[1]);
29034 }
29036 // Finally, try to combine into a single shuffle instruction.
29037 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
29038 DCI, Subtarget);
29039 }
29041 /// \brief Get the PSHUF-style mask from PSHUF node.
29043 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
29044 /// PSHUF-style masks that can be reused with such instructions.
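/// For example, a v8i16 PSHUFHW with mask <0,1,2,3,7,6,5,4> comes back from
/// this helper as {3,2,1,0}: the identity low half is dropped and the high
/// half indices are rebased by subtracting 4.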
29045 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
29046 MVT VT = N.getSimpleValueType();
29047 SmallVector<int, 4> Mask;
29048 SmallVector<SDValue, 2> Ops;
29049 bool IsUnary;
29050 bool HaveMask =
29051 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
29052 (void)HaveMask;
29053 assert(HaveMask);
29055 // If we have more than 128-bits, only the low 128-bits of shuffle mask
29056 // matter. Check that the upper masks are repeats and remove them.
29057 if (VT.getSizeInBits() > 128) {
29058 int LaneElts = 128 / VT.getScalarSizeInBits();
29059 #ifndef NDEBUG
29060 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
29061 for (int j = 0; j < LaneElts; ++j)
29062 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
29063 "Mask doesn't repeat in high 128-bit lanes!");
29064 #endif
29065 Mask.resize(LaneElts);
29066 }
29068 switch (N.getOpcode()) {
29069 case X86ISD::PSHUFD:
29070 return Mask;
29071 case X86ISD::PSHUFLW:
29072 Mask.resize(4);
29073 return Mask;
29074 case X86ISD::PSHUFHW:
29075 Mask.erase(Mask.begin(), Mask.begin() + 4);
29076 for (int &M : Mask)
29077 M -= 4;
29078 return Mask;
29079 default:
29080 llvm_unreachable("No valid shuffle instruction found!");
29081 }
29082 }
29084 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
29086 /// We walk up the chain and look for a combinable shuffle, skipping over
29087 /// shuffles that we could hoist this shuffle's transformation past without
29088 /// altering anything.
29089 static SDValue
29090 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
29091 SelectionDAG &DAG) {
29092 assert(N.getOpcode() == X86ISD::PSHUFD &&
29093 "Called with something other than an x86 128-bit half shuffle!");
29096 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
29097 // of the shuffles in the chain so that we can form a fresh chain to replace
29099 SmallVector<SDValue, 8> Chain;
29100 SDValue V = N.getOperand(0);
29101 for (; V.hasOneUse(); V = V.getOperand(0)) {
29102 switch (V.getOpcode()) {
29103 default:
29104 return SDValue(); // Nothing combined!
29106 case ISD::BITCAST:
29107 // Skip bitcasts as we always know the type for the target specific
29108 // instructions.
29109 continue;
29111 case X86ISD::PSHUFD:
29112 // Found another dword shuffle.
29113 break;
29115 case X86ISD::PSHUFLW:
29116 // Check that the low words (being shuffled) are the identity in the
29117 // dword shuffle, and the high words are self-contained.
29118 if (Mask[0] != 0 || Mask[1] != 1 ||
29119 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
29120 return SDValue();
29122 Chain.push_back(V);
29123 continue;
29125 case X86ISD::PSHUFHW:
29126 // Check that the high words (being shuffled) are the identity in the
29127 // dword shuffle, and the low words are self-contained.
29128 if (Mask[2] != 2 || Mask[3] != 3 ||
29129 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
29130 return SDValue();
29132 Chain.push_back(V);
29133 continue;
29135 case X86ISD::UNPCKL:
29136 case X86ISD::UNPCKH:
29137 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
29138 // shuffle into a preceding word shuffle.
29139 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
29140 V.getSimpleValueType().getVectorElementType() != MVT::i16)
29141 return SDValue();
29143 // Search for a half-shuffle which we can combine with.
29144 unsigned CombineOp =
29145 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
29146 if (V.getOperand(0) != V.getOperand(1) ||
29147 !V->isOnlyUserOf(V.getOperand(0).getNode()))
29148 return SDValue();
29149 Chain.push_back(V);
29150 V = V.getOperand(0);
29151 do {
29152 switch (V.getOpcode()) {
29153 default:
29154 return SDValue(); // Nothing to combine.
29156 case X86ISD::PSHUFLW:
29157 case X86ISD::PSHUFHW:
29158 if (V.getOpcode() == CombineOp)
29159 break;
29161 Chain.push_back(V);
29163 LLVM_FALLTHROUGH;
29164 case ISD::BITCAST:
29165 V = V.getOperand(0);
29166 continue;
29167 }
29168 break;
29169 } while (V.hasOneUse());
29170 break;
29171 }
29172 // Break out of the loop if we break out of the switch.
29173 break;
29174 }
29176 if (!V.hasOneUse())
29177 // We fell out of the loop without finding a viable combining instruction.
29178 return SDValue();
29180 // Merge this node's mask and our incoming mask.
29181 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29182 for (int &M : Mask)
29183 M = VMask[M];
29184 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
29185 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
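// For example, if the incoming Mask is {2,3,0,1} and V is a PSHUFD with
// VMask {1,0,3,2}, the remapping above produces {3,2,1,0}, a single shuffle
// that performs both permutations at once.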
29187 // Rebuild the chain around this new shuffle.
29188 while (!Chain.empty()) {
29189 SDValue W = Chain.pop_back_val();
29191 if (V.getValueType() != W.getOperand(0).getValueType())
29192 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
29194 switch (W.getOpcode()) {
29195 default:
29196 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
29198 case X86ISD::UNPCKL:
29199 case X86ISD::UNPCKH:
29200 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
29201 break;
29203 case X86ISD::PSHUFD:
29204 case X86ISD::PSHUFLW:
29205 case X86ISD::PSHUFHW:
29206 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
29207 break;
29208 }
29209 }
29210 if (V.getValueType() != N.getValueType())
29211 V = DAG.getBitcast(N.getValueType(), V);
29213 // Return the new chain to replace N.
29214 return V;
29215 }
29217 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
29218 /// pshufhw.
29220 /// We walk up the chain, skipping shuffles of the other half and looking
29221 /// through shuffles which switch halves trying to find a shuffle of the same
29222 /// pair of dwords.
29223 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
29224 SelectionDAG &DAG,
29225 TargetLowering::DAGCombinerInfo &DCI) {
29226 assert(
29227 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
29228 "Called with something other than an x86 128-bit half shuffle!");
29229 SDLoc DL(N);
29230 unsigned CombineOpcode = N.getOpcode();
29232 // Walk up a single-use chain looking for a combinable shuffle.
29233 SDValue V = N.getOperand(0);
29234 for (; V.hasOneUse(); V = V.getOperand(0)) {
29235 switch (V.getOpcode()) {
29236 default:
29237 return false; // Nothing combined!
29239 case ISD::BITCAST:
29240 // Skip bitcasts as we always know the type for the target specific
29241 // instructions.
29242 continue;
29244 case X86ISD::PSHUFLW:
29245 case X86ISD::PSHUFHW:
29246 if (V.getOpcode() == CombineOpcode)
29247 break;
29249 // Other-half shuffles are no-ops.
29250 continue;
29251 }
29252 // Break out of the loop if we break out of the switch.
29253 break;
29254 }
29256 if (!V.hasOneUse())
29257 // We fell out of the loop without finding a viable combining instruction.
29258 return false;
29260 // Combine away the bottom node as its shuffle will be accumulated into
29261 // a preceding shuffle.
29262 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29264 // Record the old value.
29265 SDValue Old = V;
29267 // Merge this node's mask and our incoming mask (adjusted to account for all
29268 // the pshufd instructions encountered).
29269 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29270 for (int &M : Mask)
29271 M = VMask[M];
29272 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
29273 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
29275 // Check that the shuffles didn't cancel each other out. If not, we need to
29276 // combine to the new one.
29277 if (Old != V)
29278 // Replace the combinable shuffle with the combined one, updating all users
29279 // so that we re-evaluate the chain here.
29280 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
29282 return true;
29283 }
29285 /// \brief Try to combine x86 target specific shuffles.
29286 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
29287 TargetLowering::DAGCombinerInfo &DCI,
29288 const X86Subtarget &Subtarget) {
29289 SDLoc DL(N);
29290 MVT VT = N.getSimpleValueType();
29291 SmallVector<int, 4> Mask;
29292 unsigned Opcode = N.getOpcode();
29294 // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
29295 // single instruction.
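// For example (v2f64): UNPCKL (bitcast (FHADD A, B)), (bitcast (FHADD C, D))
// reads lane 0 of each horizontal add, which is exactly FHADD A, C.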
29296 if (VT.getScalarSizeInBits() == 64 &&
29297 (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
29298 Opcode == X86ISD::UNPCKL)) {
29299 auto BC0 = peekThroughBitcasts(N.getOperand(0));
29300 auto BC1 = peekThroughBitcasts(N.getOperand(1));
29301 EVT VT0 = BC0.getValueType();
29302 EVT VT1 = BC1.getValueType();
29303 unsigned Opcode0 = BC0.getOpcode();
29304 unsigned Opcode1 = BC1.getOpcode();
29305 if (Opcode0 == Opcode1 && VT0 == VT1 &&
29306 (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
29307 Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
29308 Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
29309 SDValue Lo, Hi;
29310 if (Opcode == X86ISD::MOVSD) {
29311 Lo = BC1.getOperand(0);
29312 Hi = BC0.getOperand(1);
29313 } else {
29314 Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29315 Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
29316 }
29317 SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
29318 DCI.AddToWorklist(Horiz.getNode());
29319 return DAG.getBitcast(VT, Horiz);
29320 }
29321 }
29323 switch (Opcode) {
29324 case X86ISD::PSHUFD:
29325 case X86ISD::PSHUFLW:
29326 case X86ISD::PSHUFHW:
29327 Mask = getPSHUFShuffleMask(N);
29328 assert(Mask.size() == 4);
29329 break;
29330 case X86ISD::UNPCKL: {
29331 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
29332 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
29333 // moves upper half elements into the lower half part. For example:
29335 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
29336 // undef:v16i8
29337 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
29339 // will be combined to:
29341 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
29343 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
29344 // happen due to advanced instructions.
29345 if (!VT.is128BitVector())
29346 return SDValue();
29348 auto Op0 = N.getOperand(0);
29349 auto Op1 = N.getOperand(1);
29350 if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
29351 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
29353 unsigned NumElts = VT.getVectorNumElements();
29354 SmallVector<int, 8> ExpectedMask(NumElts, -1);
29355 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
29356 NumElts / 2);
29358 auto ShufOp = Op1.getOperand(0);
29359 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
29360 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
29361 }
29362 return SDValue();
29363 }
29364 case X86ISD::BLENDI: {
29365 SDValue V0 = N->getOperand(0);
29366 SDValue V1 = N->getOperand(1);
29367 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
29368 "Unexpected input vector types");
29370 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
29371 // operands and changing the mask to 1. This saves us a bunch of
29372 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
29373 // x86InstrInfo knows how to commute this back after instruction selection
29374 // if it would help register allocation.
29376 // TODO: If optimizing for size or a processor that doesn't suffer from
29377 // partial register update stalls, this should be transformed into a MOVSD
29378 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
29380 if (VT == MVT::v2f64)
29381 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
29382 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
29383 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
29384 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
29385 }
29387 return SDValue();
29388 }
29389 case X86ISD::MOVSD:
29390 case X86ISD::MOVSS: {
29391 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
29392 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
29393 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
29394 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
29395 if (isZero0 && isZero1)
29396 return SDValue();
29398 // We often lower to MOVSD/MOVSS from integer as well as native float
29399 // types; remove unnecessary domain-crossing bitcasts if we can to make it
29400 // easier to combine shuffles later on. We've already accounted for the
29401 // domain switching cost when we decided to lower with it.
29402 bool isFloat = VT.isFloatingPoint();
29403 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
29404 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
29405 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
29406 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
29407 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
29408 V0 = DAG.getBitcast(NewVT, V0);
29409 V1 = DAG.getBitcast(NewVT, V1);
29410 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
29411 }
29413 return SDValue();
29414 }
29415 case X86ISD::INSERTPS: {
29416 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
29417 SDValue Op0 = N.getOperand(0);
29418 SDValue Op1 = N.getOperand(1);
29419 SDValue Op2 = N.getOperand(2);
29420 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
29421 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
29422 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
29423 unsigned ZeroMask = InsertPSMask & 0xF;
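// For example, an insertps immediate of 0x4A decodes as SrcIdx = 1,
// DstIdx = 0 and ZeroMask = 0xA: insert element 1 of Op1 into element 0 of
// Op0, then zero elements 1 and 3 of the result.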
29425 // If we zero out all elements from Op0 then we don't need to reference it.
29426 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
29427 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
29428 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29430 // If we zero out the element from Op1 then we don't need to reference it.
29431 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
29432 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29433 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29435 // Attempt to merge insertps Op1 with an inner target shuffle node.
29436 SmallVector<int, 8> TargetMask1;
29437 SmallVector<SDValue, 2> Ops1;
29438 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
29439 int M = TargetMask1[SrcIdx];
29440 if (isUndefOrZero(M)) {
29441 // Zero/UNDEF insertion - zero out element and remove dependency.
29442 InsertPSMask |= (1u << DstIdx);
29443 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
29444 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29445 }
29446 // Update insertps mask srcidx and reference the source input directly.
29447 assert(0 <= M && M < 8 && "Shuffle index out of range");
29448 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
29449 Op1 = Ops1[M < 4 ? 0 : 1];
29450 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29451 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29452 }
29454 // Attempt to merge insertps Op0 with an inner target shuffle node.
29455 SmallVector<int, 8> TargetMask0;
29456 SmallVector<SDValue, 2> Ops0;
29457 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
29458 return SDValue();
29460 bool Updated = false;
29461 bool UseInput00 = false;
29462 bool UseInput01 = false;
29463 for (int i = 0; i != 4; ++i) {
29464 int M = TargetMask0[i];
29465 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
29466 // No change if element is already zero or the inserted element.
29467 continue;
29468 } else if (isUndefOrZero(M)) {
29469 // If the target mask is undef/zero then we must zero the element.
29470 InsertPSMask |= (1u << i);
29471 Updated = true;
29472 continue;
29473 }
29475 // The input vector element must be inline.
29476 if (M != i && M != (i + 4))
29477 return SDValue();
29479 // Determine which inputs of the target shuffle we're using.
29480 UseInput00 |= (0 <= M && M < 4);
29481 UseInput01 |= (4 <= M);
29482 }
29484 // If we're not using both inputs of the target shuffle then use the
29485 // referenced input directly.
29486 if (UseInput00 && !UseInput01) {
29487 Updated = true;
29488 Op0 = Ops0[0];
29489 } else if (!UseInput00 && UseInput01) {
29490 Updated = true;
29491 Op0 = Ops0[1];
29492 }
29494 if (Updated)
29495 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
29496 DAG.getConstant(InsertPSMask, DL, MVT::i8));
29498 return SDValue();
29499 }
29500 default:
29501 return SDValue();
29502 }
29504 // Nuke no-op shuffles that show up after combining.
29505 if (isNoopShuffleMask(Mask))
29506 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
29508 // Look for simplifications involving one or two shuffle instructions.
29509 SDValue V = N.getOperand(0);
29510 switch (N.getOpcode()) {
29513 case X86ISD::PSHUFLW:
29514 case X86ISD::PSHUFHW:
29515 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
29517 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
29518 return SDValue(); // We combined away this shuffle, so we're done.
29520 // See if this reduces to a PSHUFD which is no more expensive and can
29521 // combine with more operations. Note that it has to at least flip the
29522 // dwords as otherwise it would have been removed as a no-op.
29523 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
29524 int DMask[] = {0, 1, 2, 3};
29525 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
29526 DMask[DOffset + 0] = DOffset + 1;
29527 DMask[DOffset + 1] = DOffset + 0;
29528 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29529 V = DAG.getBitcast(DVT, V);
29530 DCI.AddToWorklist(V.getNode());
29531 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
29532 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
29533 DCI.AddToWorklist(V.getNode());
29534 return DAG.getBitcast(VT, V);
29535 }
29537 // Look for shuffle patterns which can be implemented as a single unpack.
29538 // FIXME: This doesn't handle the location of the PSHUFD generically, and
29539 // only works when we have a PSHUFD followed by two half-shuffles.
29540 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
29541 (V.getOpcode() == X86ISD::PSHUFLW ||
29542 V.getOpcode() == X86ISD::PSHUFHW) &&
29543 V.getOpcode() != N.getOpcode() &&
29544 V.hasOneUse()) {
29545 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
29546 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
29547 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
29548 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
29549 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29550 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
29551 int WordMask[8];
29552 for (int i = 0; i < 4; ++i) {
29553 WordMask[i + NOffset] = Mask[i] + NOffset;
29554 WordMask[i + VOffset] = VMask[i] + VOffset;
29555 }
29556 // Map the word mask through the DWord mask.
29557 int MappedMask[8];
29558 for (int i = 0; i < 8; ++i)
29559 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
29560 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
29561 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
29562 // We can replace all three shuffles with an unpack.
29563 V = DAG.getBitcast(VT, D.getOperand(0));
29564 DCI.AddToWorklist(V.getNode());
29565 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
29566 : X86ISD::UNPCKH,
29567 DL, VT, V, V);
29568 }
29569 }
29570 }
29572 break;
29574 case X86ISD::PSHUFD:
29575 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
29576 return NewN;
29578 break;
29579 }
29581 return SDValue();
29582 }
29584 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
29585 /// operation. If true is returned then the operands of the ADDSUB operation
29586 /// are written to the parameters \p Opnd0 and \p Opnd1.
29588 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
29589 /// so it is easier to generically match. We also insert dummy vector shuffle
29590 /// nodes for the operands which explicitly discard the lanes which are unused
29591 /// by this operation, so that the fact that they're unused flows through
29592 /// the rest of the combiner.
29593 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
29594 SDValue &Opnd0, SDValue &Opnd1) {
29596 EVT VT = N->getValueType(0);
29597 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
29598 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
29599 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
29600 return false;
29602 // We only handle target-independent shuffles.
29603 // FIXME: It would be easy and harmless to use the target shuffle mask
29604 // extraction tool to support more.
29605 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
29606 return false;
29608 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
29609 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
29611 SDValue V1 = N->getOperand(0);
29612 SDValue V2 = N->getOperand(1);
29614 // We require the first shuffle operand to be the FSUB node, and the second to
29615 // be the FADD node.
29616 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
29617 ShuffleVectorSDNode::commuteMask(Mask);
29618 std::swap(V1, V2);
29619 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
29620 return false;
29622 // If there are other uses of these operations we can't fold them.
29623 if (!V1->hasOneUse() || !V2->hasOneUse())
29624 return false;
29626 // Ensure that both operations have the same operands. Note that we can
29627 // commute the FADD operands.
29628 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
29629 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
29630 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
29631 return false;
29633 // We're looking for blends between FADD and FSUB nodes. We insist on these
29634 // nodes being lined up in a specific expected pattern.
29635 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
29636 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
29637 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
29638 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
29639 8, 25, 10, 27, 12, 29, 14, 31})))
29640 return false;
29642 Opnd0 = V1;
29643 Opnd1 = V2;
29645 return true;
29646 }
29647 /// \brief Try to combine a shuffle into a target-specific add-sub or
29648 /// mul-add-sub node.
29649 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
29650 const X86Subtarget &Subtarget,
29651 SelectionDAG &DAG) {
29652 SDValue Opnd0, Opnd1;
29653 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
29654 return SDValue();
29656 EVT VT = N->getValueType(0);
29657 SDLoc DL(N);
29659 // Try to generate X86ISD::FMADDSUB node here.
29660 SDValue Opnd2;
29661 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
29662 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
29664 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
29665 // the ADDSUB idiom has been successfully recognized. There are no known
29666 // X86 targets with 512-bit ADDSUB instructions!
29667 if (VT.is512BitVector())
29668 return SDValue();
29670 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29671 }
29673 // We are looking for a shuffle where both sources are concatenated with undef
29674 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
29675 // if we can express this as a single-source shuffle, that's preferable.
29676 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29677 const X86Subtarget &Subtarget) {
29678 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29679 return SDValue();
29681 EVT VT = N->getValueType(0);
29683 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29684 if (!VT.is128BitVector() && !VT.is256BitVector())
29685 return SDValue();
29687 if (VT.getVectorElementType() != MVT::i32 &&
29688 VT.getVectorElementType() != MVT::i64 &&
29689 VT.getVectorElementType() != MVT::f32 &&
29690 VT.getVectorElementType() != MVT::f64)
29691 return SDValue();
29693 SDValue N0 = N->getOperand(0);
29694 SDValue N1 = N->getOperand(1);
29696 // Check that both sources are concats with undef.
29697 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29698 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29699 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29700 !N1.getOperand(1).isUndef())
29701 return SDValue();
29703 // Construct the new shuffle mask. Elements from the first source retain their
29704 // index, but elements from the second source no longer need to skip an undef.
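// For example (v4i32): shuffle <0,4,1,5> (concat t1, undef), (concat t2, undef)
// becomes shuffle <0,2,1,3> (concat t1, t2), undef, a single-source shuffle.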
29705 SmallVector<int, 8> Mask;
29706 int NumElts = VT.getVectorNumElements();
29708 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29709 for (int Elt : SVOp->getMask())
29710 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29712 SDLoc DL(N);
29713 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29714 N1.getOperand(0));
29715 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29716 }
29718 /// Eliminate a redundant shuffle of a horizontal math op.
29719 static SDValue foldShuffleOfHorizOp(SDNode *N) {
29720 if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
29721 return SDValue();
29723 SDValue HOp = N->getOperand(0);
29724 if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
29725 HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
29726 return SDValue();
29728 // 128-bit horizontal math instructions are defined to operate on adjacent
29729 // lanes of each operand as:
29730 // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
29731 // ...similarly for v2f64 and v8i16.
29732 // TODO: 256-bit is not the same because...x86.
29733 if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
29734 return SDValue();
29736 // When the operands of a horizontal math op are identical, the low half of
29737 // the result is the same as the high half. If the shuffle is also replicating
29738 // low and high halves, we don't need the shuffle.
29739 // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
29740 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
29741 // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
29742 // but this should be tied to whatever horizontal op matching and shuffle
29743 // canonicalization are producing.
29744 if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
29745 isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
29746 isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
29747 return HOp;
29749 return SDValue();
29750 }
29752 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29753 TargetLowering::DAGCombinerInfo &DCI,
29754 const X86Subtarget &Subtarget) {
29755 SDLoc dl(N);
29756 EVT VT = N->getValueType(0);
29757 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29758 // If we have legalized the vector types, look for blends of FADD and FSUB
29759 // nodes that we can fuse into an ADDSUB node.
29760 if (TLI.isTypeLegal(VT)) {
29761 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29762 return AddSub;
29764 if (SDValue HAddSub = foldShuffleOfHorizOp(N))
29765 return HAddSub;
29766 }
29768 // During Type Legalization, when promoting illegal vector types,
29769 // the backend might introduce new shuffle dag nodes and bitcasts.
29771 // This code performs the following transformation:
29772 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29773 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29775 // We do this only if both the bitcast and the BINOP dag nodes have
29776 // one use. Also, perform this transformation only if the new binary
29777 // operation is legal. This is to avoid introducing dag nodes that
29778 // potentially need to be further expanded (or custom lowered) into a
29779 // less optimal sequence of dag nodes.
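// For example, a v8i16 shuffle <0,2,4,6,u,u,u,u> of (bitcast (v4i32 add A, B))
// becomes the same shuffle of (v8i16 add (bitcast A), (bitcast B)).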
29780 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29781 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29782 N->getOperand(0).getOpcode() == ISD::BITCAST &&
29783 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29784 SDValue N0 = N->getOperand(0);
29785 SDValue N1 = N->getOperand(1);
29787 SDValue BC0 = N0.getOperand(0);
29788 EVT SVT = BC0.getValueType();
29789 unsigned Opcode = BC0.getOpcode();
29790 unsigned NumElts = VT.getVectorNumElements();
29792 if (BC0.hasOneUse() && SVT.isVector() &&
29793 SVT.getVectorNumElements() * 2 == NumElts &&
29794 TLI.isOperationLegal(Opcode, VT)) {
29795 bool CanFold = false;
29796 switch (Opcode) {
29797 default : break;
29798 case ISD::ADD:
29799 case ISD::SUB:
29800 case ISD::MUL:
29801 // isOperationLegal lies for integer ops on floating point types.
29802 CanFold = VT.isInteger();
29803 break;
29804 case ISD::FADD:
29805 case ISD::FSUB:
29806 case ISD::FMUL:
29807 // isOperationLegal lies for floating point ops on integer types.
29808 CanFold = VT.isFloatingPoint();
29809 break;
29810 }
29812 unsigned SVTNumElts = SVT.getVectorNumElements();
29813 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29814 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29815 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29816 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29817 CanFold = SVOp->getMaskElt(i) < 0;
29819 if (CanFold) {
29820 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29821 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29822 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29823 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29824 }
29825 }
29826 }
29828 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29829 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29830 // consecutive, non-overlapping, and in the right order.
29831 SmallVector<SDValue, 16> Elts;
29832 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29833 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29834 Elts.push_back(Elt);
29835 continue;
29836 }
29837 Elts.clear();
29838 break;
29839 }
29841 if (Elts.size() == VT.getVectorNumElements())
29842 if (SDValue LD =
29843 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29844 return LD;
29846 // For AVX2, we sometimes want to combine
29847 // (vector_shuffle <mask> (concat_vectors t1, undef)
29848 // (concat_vectors t2, undef))
29849 // into:
29850 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29851 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29852 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29853 return ShufConcat;
29855 if (isTargetShuffle(N->getOpcode())) {
29856 SDValue Op(N, 0);
29857 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29858 return Shuffle;
29860 // Try recursively combining arbitrary sequences of x86 shuffle
29861 // instructions into higher-order shuffles. We do this after combining
29862 // specific PSHUF instruction sequences into their minimal form so that we
29863 // can evaluate how many specialized shuffle instructions are involved in
29864 // a particular chain.
29865 if (SDValue Res = combineX86ShufflesRecursively(
29866 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
29867 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
29868 DCI.CombineTo(N, Res);
29869 return SDValue();
29870 }
29871 }
29873 return SDValue();
29874 }
29876 /// Check if a vector extract from a target-specific shuffle of a load can be
29877 /// folded into a single element load.
29878 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29879 /// shuffles have been custom lowered so we need to handle those here.
29880 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29881 TargetLowering::DAGCombinerInfo &DCI) {
29882 if (DCI.isBeforeLegalizeOps())
29883 return SDValue();
29885 SDValue InVec = N->getOperand(0);
29886 SDValue EltNo = N->getOperand(1);
29887 EVT EltVT = N->getValueType(0);
29889 if (!isa<ConstantSDNode>(EltNo))
29890 return SDValue();
29892 EVT OriginalVT = InVec.getValueType();
29894 // Peek through bitcasts, don't duplicate a load with other uses.
29895 InVec = peekThroughOneUseBitcasts(InVec);
29897 EVT CurrentVT = InVec.getValueType();
29898 if (!CurrentVT.isVector() ||
29899 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29900 return SDValue();
29902 if (!isTargetShuffle(InVec.getOpcode()))
29903 return SDValue();
29905 // Don't duplicate a load with other uses.
29906 if (!InVec.hasOneUse())
29907 return SDValue();
29909 SmallVector<int, 16> ShuffleMask;
29910 SmallVector<SDValue, 2> ShuffleOps;
29911 bool UnaryShuffle;
29912 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29913 ShuffleOps, ShuffleMask, UnaryShuffle))
29914 return SDValue();
29916 // Select the input vector, guarding against out of range extract vector.
29917 unsigned NumElems = CurrentVT.getVectorNumElements();
29918 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29919 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29921 if (Idx == SM_SentinelZero)
29922 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29923 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29924 if (Idx == SM_SentinelUndef)
29925 return DAG.getUNDEF(EltVT);
29927 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29928 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29929 : ShuffleOps[1];
29931 // If inputs to shuffle are the same for both ops, then allow 2 uses
29932 unsigned AllowedUses =
29933 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29935 if (LdNode.getOpcode() == ISD::BITCAST) {
29936 // Don't duplicate a load with other uses.
29937 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29938 return SDValue();
29940 AllowedUses = 1; // only allow 1 load use if we have a bitcast
29941 LdNode = LdNode.getOperand(0);
29942 }
29944 if (!ISD::isNormalLoad(LdNode.getNode()))
29945 return SDValue();
29947 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29949 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29950 return SDValue();
29952 // If there's a bitcast before the shuffle, check if the load type and
29953 // alignment is valid.
29954 unsigned Align = LN0->getAlignment();
29955 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29956 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29957 EltVT.getTypeForEVT(*DAG.getContext()));
29959 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
29960 return SDValue();
29962 // All checks match so transform back to vector_shuffle so that DAG combiner
29963 // can finish the job
29964 SDLoc dl(N);
29966 // Create shuffle node taking into account the case that it's a unary shuffle
29967 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29968 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29969 ShuffleMask);
29970 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29971 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29972 EltNo);
29973 }
29975 // Try to match patterns such as
29976 // (i16 bitcast (v16i1 x))
29977 // ->
29978 // (i16 movmsk (16i8 sext (v16i1 x)))
29979 // before the illegal vector is scalarized on subtargets that don't have legal
29980 // vxi1 types.
29981 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29982 const X86Subtarget &Subtarget) {
29983 EVT VT = BitCast.getValueType();
29984 SDValue N0 = BitCast.getOperand(0);
29985 EVT VecVT = N0->getValueType(0);
29987 if (!VT.isScalarInteger() || !VecVT.isSimple())
29988 return SDValue();
29990 // With AVX512 vxi1 types are legal and we prefer using k-regs.
29991 // MOVMSK is supported in SSE2 or later.
29992 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29993 return SDValue();
29995 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29996 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29997 // v8i16 and v16i16.
29998 // For these two cases, we can shuffle the upper element bytes to a
29999 // consecutive sequence at the start of the vector and treat the results as
30000 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
30001 // for v16i16 this is not the case, because the shuffle is expensive, so we
30002 // avoid sign-extending to this type entirely.
30003 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
30004 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
30005 MVT SExtVT;
30006 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
30007 switch (VecVT.getSimpleVT().SimpleTy) {
30008 default:
30009 return SDValue();
30010 case MVT::v2i1:
30011 SExtVT = MVT::v2i64;
30012 FPCastVT = MVT::v2f64;
30013 break;
30014 case MVT::v4i1:
30015 SExtVT = MVT::v4i32;
30016 FPCastVT = MVT::v4f32;
30017 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
30018 // sign-extend to a 256-bit operation to avoid truncation.
30019 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30020 N0->getOperand(0)->getValueType(0).is256BitVector()) {
30021 SExtVT = MVT::v4i64;
30022 FPCastVT = MVT::v4f64;
30023 }
30024 break;
30025 case MVT::v8i1:
30026 SExtVT = MVT::v8i16;
30027 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
30028 // sign-extend to a 256-bit operation to match the compare.
30029 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
30030 // 256-bit because the shuffle is cheaper than sign extending the result of
30031 // the compare.
30032 if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
30033 (N0->getOperand(0)->getValueType(0).is256BitVector() ||
30034 N0->getOperand(0)->getValueType(0).is512BitVector())) {
30035 SExtVT = MVT::v8i32;
30036 FPCastVT = MVT::v8f32;
30037 }
30038 break;
30039 case MVT::v16i1:
30040 SExtVT = MVT::v16i8;
30041 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
30042 // it is not profitable to sign-extend to 256-bit because this will
30043 // require an extra cross-lane shuffle which is more expensive than
30044 // truncating the result of the compare to 128-bits.
30045 break;
30046 case MVT::v32i1:
30047 SExtVT = MVT::v32i8;
30048 break;
30049 }
30051 SDLoc DL(BitCast);
30052 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
30054 if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
30055 // Handle pre-AVX2 cases by splitting to two v16i1's.
30056 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30057 MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
30058 SDValue Lo = extract128BitVector(V, 0, DAG, DL);
30059 SDValue Hi = extract128BitVector(V, 16, DAG, DL);
30060 Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
30061 Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
30062 Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
30063 DAG.getConstant(16, DL, ShiftTy));
30064 V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
30065 return DAG.getZExtOrTrunc(V, DL, VT);
30066 }
30068 if (SExtVT == MVT::v8i16) {
30069 assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
30070 V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
30071 DAG.getUNDEF(MVT::v8i16));
30072 } else
30073 assert(SExtVT.getScalarType() != MVT::i16 &&
30074 "Vectors of i16 must be packed");
30075 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
30076 V = DAG.getBitcast(FPCastVT, V);
30077 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
30078 return DAG.getZExtOrTrunc(V, DL, VT);
30079 }
30081 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
30082 TargetLowering::DAGCombinerInfo &DCI,
30083 const X86Subtarget &Subtarget) {
30084 SDValue N0 = N->getOperand(0);
30085 EVT VT = N->getValueType(0);
30086 EVT SrcVT = N0.getValueType();
30088 // Try to match patterns such as
30089 // (i16 bitcast (v16i1 x))
30090 // ->
30091 // (i16 movmsk (16i8 sext (v16i1 x)))
30092 // before the setcc result is scalarized on subtargets that don't have legal
30093 // vxi1 types.
30094 if (DCI.isBeforeLegalize())
30095 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
30096 return V;
30097 // Since MMX types are special and don't usually play with other vector types,
30098 // it's better to handle them early to be sure we emit efficient code by
30099 // avoiding store-load conversions.
30101 // Detect bitcasts between i32 to x86mmx low word.
30102 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
30103 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
30104 SDValue N00 = N0->getOperand(0);
30105 if (N00.getValueType() == MVT::i32)
30106 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
30107 }
30109 // Detect bitcasts between element or subvector extraction to x86mmx.
30110 if (VT == MVT::x86mmx &&
30111 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
30112 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
30113 isNullConstant(N0.getOperand(1))) {
30114 SDValue N00 = N0->getOperand(0);
30115 if (N00.getValueType().is128BitVector())
30116 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
30117 DAG.getBitcast(MVT::v2i64, N00));
30118 }
30120 // Detect bitcasts from FP_TO_SINT to x86mmx.
30121 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
30122 N0.getOpcode() == ISD::FP_TO_SINT) {
30123 SDLoc DL(N0);
30124 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
30125 DAG.getUNDEF(MVT::v2i32));
30126 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
30127 DAG.getBitcast(MVT::v2i64, Res));
30128 }
30130 // Convert a bitcasted integer logic operation that has one bitcasted
30131 // floating-point operand into a floating-point logic operation. This may
30132 // create a load of a constant, but that is cheaper than materializing the
30133 // constant in an integer register and transferring it to an SSE register or
30134 // transferring the SSE operand to integer register and back.
30135 unsigned FPOpcode;
30136 switch (N0.getOpcode()) {
30137 case ISD::AND: FPOpcode = X86ISD::FAND; break;
30138 case ISD::OR: FPOpcode = X86ISD::FOR; break;
30139 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
30140 default: return SDValue();
30141 }
30143 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
30144 (Subtarget.hasSSE2() && VT == MVT::f64)))
30145 return SDValue();
30147 SDValue LogicOp0 = N0.getOperand(0);
30148 SDValue LogicOp1 = N0.getOperand(1);
30149 SDLoc DL0(N0);
30151 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
30152 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
30153 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
30154 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
30155 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
30156 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
30157 }
30158 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
30159 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
30160 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
30161 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
30162 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
30163 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
30164 }
30166 return SDValue();
30167 }
30169 // Match a binop + shuffle pyramid that represents a horizontal reduction over
30170 // the elements of a vector.
30171 // Returns the vector that is being reduced on, or SDValue() if a reduction
30172 // was not matched.
30173 static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
30174 ArrayRef<ISD::NodeType> CandidateBinOps) {
30175 // The pattern must end in an extract from index 0.
30176 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
30177 !isNullConstant(Extract->getOperand(1)))
30178 return SDValue();
30180 SDValue Op = Extract->getOperand(0);
30181 unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
30183 // Match against one of the candidate binary ops.
30184 if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
30185 return Op.getOpcode() == BinOp;
30186 }))
30187 return SDValue();
30189 // At each stage, we're looking for something that looks like:
30190 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
30191 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
30192 // i32 undef, i32 undef, i32 undef, i32 undef>
30193 // %a = binop <8 x i32> %op, %s
30194 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
30195 // we expect something like:
30196 // <4,5,6,7,u,u,u,u>
30197 // <2,3,u,u,u,u,u,u>
30198 // <1,u,u,u,u,u,u,u>
30199 unsigned CandidateBinOp = Op.getOpcode();
30200 for (unsigned i = 0; i < Stages; ++i) {
30201 if (Op.getOpcode() != CandidateBinOp)
30202 return SDValue();
30204 ShuffleVectorSDNode *Shuffle =
30205 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
30206 if (Shuffle) {
30207 Op = Op.getOperand(1);
30208 } else {
30209 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
30210 Op = Op.getOperand(0);
30211 }
30213 // The first operand of the shuffle should be the same as the other operand
30214 // of the binop.
30215 if (!Shuffle || Shuffle->getOperand(0) != Op)
30216 return SDValue();
30218 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
30219 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
30220 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
30221 return SDValue();
30222 }
30224 BinOp = CandidateBinOp;
30225 return Op;
30226 }
30228 // Given a select, detect the following pattern:
30229 // 1: %2 = zext <N x i8> %0 to <N x i32>
30230 // 2: %3 = zext <N x i8> %1 to <N x i32>
30231 // 3: %4 = sub nsw <N x i32> %2, %3
30232 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
30233 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
30234 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
30235 // This is useful as it is the input into a SAD pattern.
30236 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
30237 SDValue &Op1) {
30238 // Check the condition of the select instruction is greater-than.
30239 SDValue SetCC = Select->getOperand(0);
30240 if (SetCC.getOpcode() != ISD::SETCC)
30241 return false;
30242 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
30243 if (CC != ISD::SETGT && CC != ISD::SETLT)
30244 return false;
30246 SDValue SelectOp1 = Select->getOperand(1);
30247 SDValue SelectOp2 = Select->getOperand(2);
30249 // The following instructions assume SelectOp1 is the subtraction operand
30250 // and SelectOp2 is the negation operand.
30251 // In the case of SETLT this is the other way around.
30252 if (CC == ISD::SETLT)
30253 std::swap(SelectOp1, SelectOp2);
30255 // The second operand of the select should be the negation of the first
30256 // operand, which is implemented as 0 - SelectOp1.
30257 if (!(SelectOp2.getOpcode() == ISD::SUB &&
30258 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
30259 SelectOp2.getOperand(1) == SelectOp1))
30260 return false;
30262 // The first operand of SetCC is the first operand of the select, which is the
30263 // difference between the two input vectors.
30264 if (SetCC.getOperand(0) != SelectOp1)
30265 return false;
30267 // In SetLT case, the second operand of the comparison can be either 1 or 0.
30268 APInt SplatVal;
30269 if ((CC == ISD::SETLT) &&
30270 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
30271 SplatVal.isOneValue()) ||
30272 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
30273 return false;
30275 // In SetGT case, the second operand of the comparison can be either -1 or 0.
30276 if ((CC == ISD::SETGT) &&
30277 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
30278 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
30279 return false;
30281 // The first operand of the select is the difference between the two input
30282 // vectors.
30283 if (SelectOp1.getOpcode() != ISD::SUB)
30284 return false;
30286 Op0 = SelectOp1.getOperand(0);
30287 Op1 = SelectOp1.getOperand(1);
30289 // Check if the operands of the sub are zero-extended from vectors of i8.
30290 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
30291 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
30292 Op1.getOpcode() != ISD::ZERO_EXTEND ||
30293 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
30294 return false;
30296 return true;
30297 }
30299 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
30300 // to these zexts.
30301 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
30302 const SDValue &Zext1, const SDLoc &DL) {
30304 // Find the appropriate width for the PSADBW.
30305 EVT InVT = Zext0.getOperand(0).getValueType();
30306 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
30308 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
30309 // fill in the missing vector elements with 0.
30310 unsigned NumConcat = RegSize / InVT.getSizeInBits();
30311 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
30312 Ops[0] = Zext0.getOperand(0);
30313 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
30314 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30315 Ops[0] = Zext1.getOperand(0);
30316 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
30318 // Actually build the SAD
30319 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
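// Note: for two v4i8 inputs the concats above pad each source with zeros up
// to v16i8, so the PSADBW below yields a v2i64 whose low element is the sum
// of absolute differences of the four real byte pairs (the padding adds 0).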
30320 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
30321 }
30323 // Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
30324 static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
30325 const X86Subtarget &Subtarget) {
30326 // Bail without SSE41.
30327 if (!Subtarget.hasSSE41())
30328 return SDValue();
30330 EVT ExtractVT = Extract->getValueType(0);
30331 if (ExtractVT != MVT::i16)
30332 return SDValue();
30334 // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
30335 unsigned BinOp;
30336 SDValue Src = matchBinOpReduction(
30337 Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
30338 if (!Src)
30339 return SDValue();
30341 EVT SrcVT = Src.getValueType();
30342 EVT SrcSVT = SrcVT.getScalarType();
30343 if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
30344 return SDValue();
30346 SDLoc DL(Extract);
30347 SDValue MinPos = Src;
30349 // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
30350 while (SrcVT.getSizeInBits() > 128) {
30351 unsigned NumElts = SrcVT.getVectorNumElements();
30352 unsigned NumSubElts = NumElts / 2;
30353 SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
30354 unsigned SubSizeInBits = SrcVT.getSizeInBits();
30355 SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
30356 SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
30357 MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
30358 }
30359 assert(SrcVT == MVT::v8i16 && "Unexpected value type");
30361 // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask
30362 // to flip the value accordingly.
30363 SDValue Mask;
30364 if (BinOp == ISD::SMAX)
30365 Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
30366 else if (BinOp == ISD::SMIN)
30367 Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
30368 else if (BinOp == ISD::UMAX)
30369 Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
30371 if (Mask)
30372 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
30374 MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
30376 if (Mask)
30377 MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
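// The XOR trick above works because the chosen constant maps the wanted
// ordering onto unsigned-min: e.g. for SMAX, x ^ 0x7FFF reverses signed
// order into unsigned order, so PHMINPOSUW finds the flipped maximum and
// the second XOR restores the original value.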
30379 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
30380 DAG.getIntPtrConstant(0, DL));
30381 }
30383 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
30384 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
30386 const X86Subtarget &Subtarget) {
30387 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
30388 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
30389 return SDValue();
30391 EVT ExtractVT = Extract->getValueType(0);
30392 unsigned BitWidth = ExtractVT.getSizeInBits();
30393 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
30394 ExtractVT != MVT::i8)
30395 return SDValue();
30397 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
30398 unsigned BinOp = 0;
30399 SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
30400 if (!Match)
30401 return SDValue();
30403 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
30404 // which we can't support here for now.
30405 if (Match.getScalarValueSizeInBits() != BitWidth)
30406 return SDValue();
30408 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
30409 unsigned MatchSizeInBits = Match.getValueSizeInBits();
30410 if (!(MatchSizeInBits == 128 ||
30411 (MatchSizeInBits == 256 &&
30412 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
30413 return SDValue();
30415 // Don't bother performing this for 2-element vectors.
30416 if (Match.getValueType().getVectorNumElements() <= 2)
30417 return SDValue();
30419 // Check that we are extracting a reduction of all sign bits.
30420 if (DAG.ComputeNumSignBits(Match) != BitWidth)
30421 return SDValue();
30423 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
30424 MVT MaskVT;
30425 if (64 == BitWidth || 32 == BitWidth)
30426 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
30427 MatchSizeInBits / BitWidth);
30429 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
30431 APInt CompareBits;
30432 ISD::CondCode CondCode;
30433 if (BinOp == ISD::OR) {
30434 // any_of -> MOVMSK != 0
30435 CompareBits = APInt::getNullValue(32);
30436 CondCode = ISD::CondCode::SETNE;
30437 } else {
30438 // all_of -> MOVMSK == ((1 << NumElts) - 1)
30439 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
30440 CondCode = ISD::CondCode::SETEQ;
30441 }
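// For example, an all_of reduction over v4i32 compares MOVMSKPS against
// 0xF (all four sign bits set), while any_of just tests MOVMSK != 0.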
30443 // Perform the select as i32/i64 and then truncate to avoid partial register
30444 // stalls.
30445 unsigned ResWidth = std::max(BitWidth, 32u);
30446 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
30447 SDLoc DL(Extract);
30448 SDValue Zero = DAG.getConstant(0, DL, ResVT);
30449 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
30450 SDValue Res = DAG.getBitcast(MaskVT, Match);
30451 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
30452 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
30453 Ones, Zero, CondCode);
30454 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
30455 }
30457 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
30458 const X86Subtarget &Subtarget) {
30459 // PSADBW is only supported on SSE2 and up.
30460 if (!Subtarget.hasSSE2())
30461 return SDValue();
30463 // Verify the type we're extracting from is any integer type above i16.
30464 EVT VT = Extract->getOperand(0).getValueType();
30465 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
30466 return SDValue();
30468 unsigned RegSize = 128;
30469 if (Subtarget.hasBWI())
30470 RegSize = 512;
30471 else if (Subtarget.hasAVX2())
30472 RegSize = 256;
30474 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
30475 // TODO: We should be able to handle larger vectors by splitting them before
30476 // feeding them into several SADs, and then reducing over those.
30477 if (RegSize / VT.getVectorNumElements() < 8)
30478 return SDValue();
30480 // Match shuffle + add pyramid.
30481 unsigned BinOp = 0;
30482 SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
30484 // The operand is expected to be zero extended from i8
30485 // (verified in detectZextAbsDiff).
30486 // In order to convert to i64 and above, additional any/zero/sign
30487 // extend is expected.
30488 // The zero extend from 32 bit has no mathematical effect on the result.
30489 // Also the sign extend is basically zero extend
30490 // (extends the sign bit which is zero).
30491 // So it is correct to skip the sign/zero extend instruction.
30492 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
30493 Root.getOpcode() == ISD::ZERO_EXTEND ||
30494 Root.getOpcode() == ISD::ANY_EXTEND))
30495 Root = Root.getOperand(0);
30497 // If there was a match, we want Root to be a select that is the root of an
30498 // abs-diff pattern.
30499 if (!Root || (Root.getOpcode() != ISD::VSELECT))
30500 return SDValue();
30502 // Check whether we have an abs-diff pattern feeding into the select.
30503 SDValue Zext0, Zext1;
30504 if (!detectZextAbsDiff(Root, Zext0, Zext1))
30505 return SDValue();
30507 // Create the SAD instruction.
30508 SDLoc DL(Extract);
30509 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
30511 // If the original vector was wider than 8 elements, sum over the results
30512 // in the SAD vector.
30513 unsigned Stages = Log2_32(VT.getVectorNumElements());
30514 MVT SadVT = SAD.getSimpleValueType();
30515 if (Stages > 3) {
30516 unsigned SadElems = SadVT.getVectorNumElements();
30518 for (unsigned i = Stages - 3; i > 0; --i) {
30519 SmallVector<int, 16> Mask(SadElems, -1);
30520 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
30521 Mask[j] = MaskEnd + j;
30523 SDValue Shuffle =
30524 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
30525 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
30526 }
30527 }
30529 MVT Type = Extract->getSimpleValueType(0);
30530 unsigned TypeSizeInBits = Type.getSizeInBits();
30531 // Return the lowest TypeSizeInBits bits.
30532 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
30533 SAD = DAG.getBitcast(ResVT, SAD);
30534 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
30535 Extract->getOperand(1));
30538 // Attempt to peek through a target shuffle and extract the scalar from the
30539 // source.
30540 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
30541 TargetLowering::DAGCombinerInfo &DCI,
30542 const X86Subtarget &Subtarget) {
30543 if (DCI.isBeforeLegalizeOps())
30546 SDValue Src = N->getOperand(0);
30547 SDValue Idx = N->getOperand(1);
30549 EVT VT = N->getValueType(0);
30550 EVT SrcVT = Src.getValueType();
30551 EVT SrcSVT = SrcVT.getVectorElementType();
30552 unsigned NumSrcElts = SrcVT.getVectorNumElements();
30554 // Don't attempt this for boolean mask vectors or unknown extraction indices.
30555 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
30558 // Resolve the target shuffle inputs and mask.
30559 SmallVector<int, 16> Mask;
30560 SmallVector<SDValue, 2> Ops;
30561 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
30564 // Attempt to narrow/widen the shuffle mask to the correct size.
30565 if (Mask.size() != NumSrcElts) {
30566 if ((NumSrcElts % Mask.size()) == 0) {
30567 SmallVector<int, 16> ScaledMask;
30568 int Scale = NumSrcElts / Mask.size();
30569 scaleShuffleMask<int>(Scale, Mask, ScaledMask);
30570 Mask = std::move(ScaledMask);
30571 } else if ((Mask.size() % NumSrcElts) == 0) {
30572 SmallVector<int, 16> WidenedMask;
30573 while (Mask.size() > NumSrcElts &&
30574 canWidenShuffleElements(Mask, WidenedMask))
30575 Mask = std::move(WidenedMask);
30576 // TODO - investigate support for wider shuffle masks with known upper
30577 // undef/zero elements for implicit zero-extension.
30581 // Check if narrowing/widening failed.
30582 if (Mask.size() != NumSrcElts)
30585 int SrcIdx = Mask[N->getConstantOperandVal(1)];
30588 // If the shuffle source element is undef/zero then we can just accept it.
30589 if (SrcIdx == SM_SentinelUndef)
30590 return DAG.getUNDEF(VT);
30592 if (SrcIdx == SM_SentinelZero)
30593 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
30594 : DAG.getConstant(0, dl, VT);
30596 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
30597 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
30598 SrcIdx = SrcIdx % Mask.size();
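// Worked example (illustrative): extracting lane 5 from
//   shuffle(v8i16 A, v8i16 B, <0,8,1,9,2,10,3,11>)
// gives SrcIdx = Mask[5] = 10, so SrcOp = Ops[10 / 8] = B and the residual
// index is 10 % 8 = 2, i.e. the extract reads lane 2 of B directly.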
30600 // We can only extract other elements from 128-bit vectors, and only in
30601 // certain circumstances depending on the SSE level.
30602 // TODO: Investigate using extract_subvector for larger vectors.
30603 // TODO: Investigate float/double extraction if it will be just stored.
30604 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
30605 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
30606 assert(SrcSVT == VT && "Unexpected extraction type");
30607 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
30608 DAG.getIntPtrConstant(SrcIdx, dl));
30611 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
30612 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
30613 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
30614 "Unexpected extraction type");
30615 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
30616 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
30617 DAG.getIntPtrConstant(SrcIdx, dl));
30618 return DAG.getZExtOrTrunc(ExtOp, dl, VT);
30624 /// Detect vector gather/scatter index generation and convert it from being a
30625 /// bunch of shuffles and extracts into a somewhat faster sequence.
30626 /// For i686, the best sequence is apparently storing the value and loading
30627 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
30628 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
30629 TargetLowering::DAGCombinerInfo &DCI,
30630 const X86Subtarget &Subtarget) {
30631 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
30634 // TODO - Remove this once we can handle the implicit zero-extension of
30635 // X86ISD::PEXTRW/X86ISD::PEXTRB in:
30636 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30637 // combineBasicSADPattern.
30638 if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30641 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
30644 SDValue InputVector = N->getOperand(0);
30645 SDValue EltIdx = N->getOperand(1);
30647 EVT SrcVT = InputVector.getValueType();
30648 EVT VT = N->getValueType(0);
30649 SDLoc dl(InputVector);
30651 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
30652 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30653 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
30654 SDValue MMXSrc = InputVector.getOperand(0);
30656 // The bitcast source is a direct mmx result.
30657 if (MMXSrc.getValueType() == MVT::x86mmx)
30658 return DAG.getBitcast(VT, InputVector);
30661 // Detect mmx to i32 conversion through a v2i32 elt extract.
30662 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
30663 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
30664 SDValue MMXSrc = InputVector.getOperand(0);
30666 // The bitcast source is a direct mmx result.
30667 if (MMXSrc.getValueType() == MVT::x86mmx)
30668 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
30671 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
30672 isa<ConstantSDNode>(EltIdx) &&
30673 isa<ConstantSDNode>(InputVector.getOperand(0))) {
30674 uint64_t ExtractedElt = N->getConstantOperandVal(1);
30675 uint64_t InputValue = InputVector.getConstantOperandVal(0);
30676 uint64_t Res = (InputValue >> ExtractedElt) & 1;
30677 return DAG.getConstant(Res, dl, MVT::i1);
30680 // Check whether this extract is the root of a sum of absolute differences
30681 // pattern. This has to be done here because we really want it to happen
30682 // pre-legalization.
30683 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
30686 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
30687 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
30690 // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
30691 if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
30694 // Only operate on vectors of 4 elements, where the alternative shuffling
30695 // gets to be more expensive.
30696 if (SrcVT != MVT::v4i32)
30699 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
30700 // single use which is a sign-extend or zero-extend, and all elements are
30701 // used.
30702 SmallVector<SDNode *, 4> Uses;
30703 unsigned ExtractedElements = 0;
30704 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
30705 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
30706 if (UI.getUse().getResNo() != InputVector.getResNo())
30709 SDNode *Extract = *UI;
30710 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
30713 if (Extract->getValueType(0) != MVT::i32)
30715 if (!Extract->hasOneUse())
30717 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
30718 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
30720 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
30723 // Record which element was extracted.
30724 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
30725 Uses.push_back(Extract);
30728 // If not all the elements were used, this may not be worthwhile.
30729 if (ExtractedElements != 15)
30732 // Ok, we've now decided to do the transformation.
30733 // If 64-bit shifts are legal, use the extract-shift sequence,
30734 // otherwise bounce the vector off the cache.
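// Sketch of the extract-shift sequence (little-endian lane order assumed):
// for InputVector = <a, b, c, d>, BottomHalf holds (b << 32) | a and
// TopHalf holds (d << 32) | c, so Vals becomes { a, b, c, d } using two
// extracts, two shifts, and four truncates -- no stack round-trip.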
30735 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30738 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
30739 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
30740 auto &DL = DAG.getDataLayout();
30741 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
30742 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30743 DAG.getConstant(0, dl, VecIdxTy));
30744 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
30745 DAG.getConstant(1, dl, VecIdxTy));
30747 SDValue ShAmt = DAG.getConstant(
30748 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
30749 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
30750 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30751 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
30752 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
30753 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
30754 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
30756 // Store the value to a temporary stack slot.
30757 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
30758 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
30759 MachinePointerInfo());
30761 EVT ElementType = SrcVT.getVectorElementType();
30762 unsigned EltSize = ElementType.getSizeInBits() / 8;
30764 // Replace each use (extract) with a load of the appropriate element.
30765 for (unsigned i = 0; i < 4; ++i) {
30766 uint64_t Offset = EltSize * i;
30767 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
30768 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
30770 SDValue ScalarAddr =
30771 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
30773 // Load the scalar.
30775 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
30779 // Replace the extracts
30780 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30781 UE = Uses.end(); UI != UE; ++UI) {
30782 SDNode *Extract = *UI;
30784 uint64_t IdxVal = Extract->getConstantOperandVal(1);
30785 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30788 // The replacement was made in place; don't return anything.
30792 /// If a vector select has an operand that is -1 or 0, try to simplify the
30793 /// select to a bitwise logic operation.
30794 /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
30796 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30797 TargetLowering::DAGCombinerInfo &DCI,
30798 const X86Subtarget &Subtarget) {
30799 SDValue Cond = N->getOperand(0);
30800 SDValue LHS = N->getOperand(1);
30801 SDValue RHS = N->getOperand(2);
30802 EVT VT = LHS.getValueType();
30803 EVT CondVT = Cond.getValueType();
30805 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30807 if (N->getOpcode() != ISD::VSELECT)
30810 assert(CondVT.isVector() && "Vector select expects a vector selector!");
30812 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30813 // Check if the first operand is all zeros and Cond type is vXi1.
30814 // This situation only applies to avx512.
30815 if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30816 CondVT.getVectorElementType() == MVT::i1) {
30817 // Invert the cond to not(cond) : xor(op,allones)=not(op)
30818 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30819 DAG.getAllOnesConstant(DL, CondVT));
30820 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30821 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30824 // To use the condition operand as a bitwise mask, it must have elements that
30825 // are the same size as the select elements. I.e., the condition operand must
30826 // have already been promoted from the IR select condition type <N x i1>.
30827 // Don't check if the types themselves are equal because that excludes
30828 // vector floating-point selects.
30829 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30832 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30833 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30835 // Try to invert the condition if true value is not all 1s and false value is
30837 if (!TValIsAllOnes && !FValIsAllZeros &&
30838 // Check if the selector will be produced by CMPP*/PCMP*.
30839 Cond.getOpcode() == ISD::SETCC &&
30840 // Check if SETCC has already been promoted.
30841 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30843 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30845 if (TValIsAllZeros || FValIsAllOnes) {
30846 SDValue CC = Cond.getOperand(2);
30847 ISD::CondCode NewCC =
30848 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30849 Cond.getOperand(0).getValueType().isInteger());
30850 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30852 std::swap(LHS, RHS);
30853 TValIsAllOnes = FValIsAllOnes;
30854 FValIsAllZeros = TValIsAllZeros;
30858 // Cond value must be 'sign splat' to be converted to a logical op.
30859 if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
30862 // vselect Cond, 111..., 000... -> Cond
30863 if (TValIsAllOnes && FValIsAllZeros)
30864 return DAG.getBitcast(VT, Cond);
30866 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30869 // vselect Cond, 111..., X -> or Cond, X
30870 if (TValIsAllOnes) {
30871 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30872 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30873 return DAG.getBitcast(VT, Or);
30876 // vselect Cond, X, 000... -> and Cond, X
30877 if (FValIsAllZeros) {
30878 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30879 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30880 return DAG.getBitcast(VT, And);
30883 // vselect Cond, 000..., X -> andn Cond, X
30884 if (TValIsAllZeros) {
30885 MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
30886 SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
30887 SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
30888 SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
30889 return DAG.getBitcast(VT, AndN);
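// E.g. (illustrative): with a sign-splat condition C,
//   vselect C, 0, X
// becomes ANDNP(C, X) == (~C) & X, which maps onto pandn/andnps and
// mirrors the OR/AND cases above.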
30895 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
30896 SDValue Cond = N->getOperand(0);
30897 SDValue LHS = N->getOperand(1);
30898 SDValue RHS = N->getOperand(2);
30901 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30902 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30903 if (!TrueC || !FalseC)
30906 // Don't do this for crazy integer types.
30907 EVT VT = N->getValueType(0);
30908 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
30911 // We're going to use the condition bit in math or logic ops. We could allow
30912 // this with a wider condition value (post-legalization it becomes an i8),
30913 // but if nothing is creating selects that late, it doesn't matter.
30914 if (Cond.getValueType() != MVT::i1)
30917 // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
30918 // 3, 5, or 9 with i32/i64, so those get transformed too.
30919 // TODO: For constants that overflow or do not differ by power-of-2 or small
30920 // multiplier, convert to 'and' + 'add'.
30921 const APInt &TrueVal = TrueC->getAPIntValue();
30922 const APInt &FalseVal = FalseC->getAPIntValue();
30924 APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
30928 APInt AbsDiff = Diff.abs();
30929 if (AbsDiff.isPowerOf2() ||
30930 ((VT == MVT::i32 || VT == MVT::i64) &&
30931 (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
30933 // We need a positive multiplier constant for shift/LEA codegen. The 'not'
30934 // of the condition can usually be folded into a compare predicate, but even
30935 // without that, the sequence should be cheaper than a CMOV alternative.
30936 if (TrueVal.slt(FalseVal)) {
30937 Cond = DAG.getNOT(DL, Cond, MVT::i1);
30938 std::swap(TrueC, FalseC);
30941 // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
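// Worked example (illustrative): select i1 %c, i32 7, i32 4 has
// AbsDiff == 3 and FalseC == 4, so it lowers to (zext %c) * 3 + 4, where
// the multiply-and-add can collapse to one LEA (base + index*2 + 4)
// instead of materializing both constants for a CMOV.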
30942 SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
30944 // Multiply condition by the difference if non-one.
30945 if (!AbsDiff.isOneValue())
30946 R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
30948 // Add the base if non-zero.
30949 if (!FalseC->isNullValue())
30950 R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
30958 // If this is a bitcasted op that can be represented as another type, push
30959 // the bitcast to the inputs. This allows more opportunities for pattern
30960 // matching masked instructions. This is called when we know that the operation
30961 // is used as one of the inputs of a vselect.
30962 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
30963 TargetLowering::DAGCombinerInfo &DCI) {
30964 // Make sure we have a bitcast.
30965 if (OrigOp.getOpcode() != ISD::BITCAST)
30968 SDValue Op = OrigOp.getOperand(0);
30970 // If the operation is used by anything other than the bitcast, we shouldn't
30971 // do this combine as that would replicate the operation.
30972 if (!Op.hasOneUse())
30975 MVT VT = OrigOp.getSimpleValueType();
30976 MVT EltVT = VT.getVectorElementType();
30977 SDLoc DL(Op.getNode());
30979 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
30981 Op0 = DAG.getBitcast(VT, Op0);
30982 DCI.AddToWorklist(Op0.getNode());
30983 Op1 = DAG.getBitcast(VT, Op1);
30984 DCI.AddToWorklist(Op1.getNode());
30985 DCI.CombineTo(OrigOp.getNode(),
30986 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
30990 unsigned Opcode = Op.getOpcode();
30992 case X86ISD::SHUF128: {
30993 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
30995 // Only change element size, not type.
30996 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30998 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
31001 case X86ISD::SUBV_BROADCAST: {
31002 unsigned EltSize = EltVT.getSizeInBits();
31003 if (EltSize != 32 && EltSize != 64)
31005 // Only change element size, not type.
31006 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
31008 SDValue Op0 = Op.getOperand(0);
31009 MVT Op0VT = MVT::getVectorVT(EltVT,
31010 Op0.getSimpleValueType().getSizeInBits() / EltSize);
31011 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
31012 DCI.AddToWorklist(Op0.getNode());
31013 DCI.CombineTo(OrigOp.getNode(),
31014 DAG.getNode(Opcode, DL, VT, Op0));
31022 /// Do target-specific dag combines on SELECT and VSELECT nodes.
31023 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
31024 TargetLowering::DAGCombinerInfo &DCI,
31025 const X86Subtarget &Subtarget) {
31027 SDValue Cond = N->getOperand(0);
31028 // Get the LHS/RHS of the select.
31029 SDValue LHS = N->getOperand(1);
31030 SDValue RHS = N->getOperand(2);
31031 EVT VT = LHS.getValueType();
31032 EVT CondVT = Cond.getValueType();
31033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31035 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
31036 // instructions match the semantics of the common C idiom x<y?x:y but not
31037 // x<=y?x:y, because of how they handle negative zero (which can be
31038 // ignored in unsafe-math mode).
31039 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
31040 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
31041 VT != MVT::f80 && VT != MVT::f128 &&
31042 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
31043 (Subtarget.hasSSE2() ||
31044 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
31045 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31047 unsigned Opcode = 0;
31048 // Check for x CC y ? x : y.
31049 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31050 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31054 // Converting this to a min would handle NaNs incorrectly, and swapping
31055 // the operands would cause it to handle comparisons between positive
31056 // and negative zero incorrectly.
31057 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31058 if (!DAG.getTarget().Options.UnsafeFPMath &&
31059 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31061 std::swap(LHS, RHS);
31063 Opcode = X86ISD::FMIN;
31066 // Converting this to a min would handle comparisons between positive
31067 // and negative zero incorrectly.
31068 if (!DAG.getTarget().Options.UnsafeFPMath &&
31069 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31071 Opcode = X86ISD::FMIN;
31074 // Converting this to a min would handle both negative zeros and NaNs
31075 // incorrectly, but we can swap the operands to fix both.
31076 std::swap(LHS, RHS);
31081 Opcode = X86ISD::FMIN;
31085 // Converting this to a max would handle comparisons between positive
31086 // and negative zero incorrectly.
31087 if (!DAG.getTarget().Options.UnsafeFPMath &&
31088 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
31090 Opcode = X86ISD::FMAX;
31093 // Converting this to a max would handle NaNs incorrectly, and swapping
31094 // the operands would cause it to handle comparisons between positive
31095 // and negative zero incorrectly.
31096 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
31097 if (!DAG.getTarget().Options.UnsafeFPMath &&
31098 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
31100 std::swap(LHS, RHS);
31102 Opcode = X86ISD::FMAX;
31105 // Converting this to a max would handle both negative zeros and NaNs
31106 // incorrectly, but we can swap the operands to fix both.
31107 std::swap(LHS, RHS);
31112 Opcode = X86ISD::FMAX;
31115 // Check for x CC y ? y : x -- a min/max with reversed arms.
31116 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
31117 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
31121 // Converting this to a min would handle comparisons between positive
31122 // and negative zero incorrectly, and swapping the operands would
31123 // cause it to handle NaNs incorrectly.
31124 if (!DAG.getTarget().Options.UnsafeFPMath &&
31125 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
31126 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31128 std::swap(LHS, RHS);
31130 Opcode = X86ISD::FMIN;
31133 // Converting this to a min would handle NaNs incorrectly.
31134 if (!DAG.getTarget().Options.UnsafeFPMath &&
31135 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
31137 Opcode = X86ISD::FMIN;
31140 // Converting this to a min would handle both negative zeros and NaNs
31141 // incorrectly, but we can swap the operands to fix both.
31142 std::swap(LHS, RHS);
31147 Opcode = X86ISD::FMIN;
31151 // Converting this to a max would handle NaNs incorrectly.
31152 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31154 Opcode = X86ISD::FMAX;
31157 // Converting this to a max would handle comparisons between positive
31158 // and negative zero incorrectly, and swapping the operands would
31159 // cause it to handle NaNs incorrectly.
31160 if (!DAG.getTarget().Options.UnsafeFPMath &&
31161 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
31162 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
31164 std::swap(LHS, RHS);
31166 Opcode = X86ISD::FMAX;
31169 // Converting this to a max would handle both negative zeros and NaNs
31170 // incorrectly, but we can swap the operands to fix both.
31171 std::swap(LHS, RHS);
31176 Opcode = X86ISD::FMAX;
31182 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
31185 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
31186 // lowering on KNL. In this case we convert it to
31187 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
31188 // The same applies to all 128- and 256-bit vectors of i8 and i16.
31189 // Starting with SKX, these selects have a proper lowering.
31190 if (Subtarget.hasAVX512() && CondVT.isVector() &&
31191 CondVT.getVectorElementType() == MVT::i1 &&
31192 (VT.is128BitVector() || VT.is256BitVector()) &&
31193 (VT.getVectorElementType() == MVT::i8 ||
31194 VT.getVectorElementType() == MVT::i16) &&
31195 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
31196 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
31197 DCI.AddToWorklist(Cond.getNode());
31198 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
31201 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
31204 // Canonicalize max and min:
31205 // (x > y) ? x : y -> (x >= y) ? x : y
31206 // (x < y) ? x : y -> (x <= y) ? x : y
31207 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
31208 // the need for an extra compare
31209 // against zero. e.g.
31210 // (x - y) > 0 ? (x - y) : 0  ->  (x - y) >= 0 ? (x - y) : 0
31211 //   subl   %esi, %edi
31212 //   testl  %edi, %edi
31213 //   movl   $0, %eax
31214 //   cmovgl %edi, %eax
31215 // =>
31216 //   xorl   %eax, %eax
31217 //   subl   %esi, %edi
31218 //   cmovsl %eax, %edi
31219 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
31220 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
31221 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
31222 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31227 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
31228 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
31229 Cond.getOperand(0), Cond.getOperand(1), NewCC);
31230 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
31235 // Early exit check
31236 if (!TLI.isTypeLegal(VT))
31239 // Match VSELECTs into subs with unsigned saturation.
31240 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
31241 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
31242 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
31243 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
31244 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
31246 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
31247 // left side, invert the predicate to simplify the logic below.
31249 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
31251 CC = ISD::getSetCCInverse(CC, true);
31252 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
31256 if (Other.getNode() && Other->getNumOperands() == 2 &&
31257 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
31258 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
31259 SDValue CondRHS = Cond->getOperand(1);
31261 // Look for a general sub with unsigned saturation first.
31262 // x >= y ? x-y : 0 --> subus x, y
31263 // x > y ? x-y : 0 --> subus x, y
31264 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
31265 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
31266 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
31268 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
31269 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
31270 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
31271 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
31272 // If the RHS is a constant we have to reverse the const
31273 // canonicalization.
31274 // x > C-1 ? x+(-C) : 0 --> subus x, C
31275 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
31276 CondRHSConst->getAPIntValue() ==
31277 (-OpRHSConst->getAPIntValue() - 1))
31278 return DAG.getNode(
31279 X86ISD::SUBUS, DL, VT, OpLHS,
31280 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
31282 // Another special case: If C was a sign bit, the sub has been
31283 // canonicalized into a xor.
31284 // FIXME: Would it be better to use computeKnownBits to determine
31285 // whether it's safe to decanonicalize the xor?
31286 // x s< 0 ? x^C : 0 --> subus x, C
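// Sanity check (illustrative, v8i16 with C == 0x8000): lanes with
// x u>= 0x8000 (i.e. x s< 0) give x - 0x8000 == x ^ 0x8000, and all other
// lanes saturate to 0, so subus(x, 0x8000) matches the select exactly.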
31287 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
31288 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
31289 OpRHSConst->getAPIntValue().isSignMask())
31290 // Note that we have to rebuild the RHS constant here to ensure we
31291 // don't rely on particular values of undef lanes.
31292 return DAG.getNode(
31293 X86ISD::SUBUS, DL, VT, OpLHS,
31294 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
31299 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
31302 // If this is a *dynamic* select (non-constant condition) and we can match
31303 // this node with one of the variable blend instructions, restructure the
31304 // condition so that blends can use the high (sign) bit of each element and
31305 // use SimplifyDemandedBits to simplify the condition operand.
31306 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
31307 !DCI.isBeforeLegalize() &&
31308 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
31309 unsigned BitWidth = Cond.getScalarValueSizeInBits();
31311 // Don't optimize vector selects that map to mask-registers.
31315 // We can only handle the cases where VSELECT is directly legal on the
31316 // subtarget. We custom lower VSELECT nodes with constant conditions and
31317 // this makes it hard to see whether a dynamic VSELECT will correctly
31318 // lower, so we both check the operation's status and explicitly handle the
31319 // cases where a *dynamic* blend will fail even though a constant-condition
31320 // blend could be custom lowered.
31321 // FIXME: We should find a better way to handle this class of problems.
31322 // Potentially, we should combine constant-condition vselect nodes
31323 // pre-legalization into shuffles and not mark as many types as custom
31325 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
31327 // FIXME: We don't support i16-element blends currently. We could and
31328 // should support them by making *all* the bits in the condition be set
31329 // rather than just the high bit and using an i8-element blend.
31330 if (VT.getVectorElementType() == MVT::i16)
31332 // Dynamic blending was only available from SSE4.1 onward.
31333 if (VT.is128BitVector() && !Subtarget.hasSSE41())
31335 // Byte blends are only available in AVX2
31336 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
31338 // There are no 512-bit blend instructions that use sign bits.
31339 if (VT.is512BitVector())
31342 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
31343 APInt DemandedMask(APInt::getSignMask(BitWidth));
31345 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
31346 !DCI.isBeforeLegalizeOps());
31347 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
31348 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
31349 // If we changed the computation somewhere in the DAG, this change will
31350 // affect all users of Cond. Make sure it is fine and update all the nodes
31351 // so that we do not use the generic VSELECT anymore. Otherwise, we may
31352 // perform wrong optimizations as we messed with the actual expectation
31353 // for the vector boolean values.
31354 if (Cond != TLO.Old) {
31355 // Check all uses of the condition operand to check whether it will be
31356 // consumed by non-BLEND instructions. Those may require that all bits
31357 // are set properly.
31358 for (SDNode *U : Cond->uses()) {
31359 // TODO: Add other opcodes eventually lowered into BLEND.
31360 if (U->getOpcode() != ISD::VSELECT)
31364 // Update all users of the condition before committing the change, so
31365 // that the VSELECT optimizations that expect the correct vector boolean
31366 // value will not be triggered.
31367 for (SDNode *U : Cond->uses()) {
31368 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
31369 U->getValueType(0), Cond, U->getOperand(1),
31371 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
31373 DCI.CommitTargetLoweringOpt(TLO);
31376 // Only Cond (rather than other nodes in the computation chain) was
31377 // changed. Change the condition just for N to keep the opportunity to
31378 // optimize all other users their own way.
31379 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
31380 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
31385 // Look for vselects with LHS/RHS being bitcasted from an operation that
31386 // can be executed on another type. Push the bitcast to the inputs of
31387 // the operation. This exposes opportunities for using masking instructions.
31388 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
31389 CondVT.getVectorElementType() == MVT::i1) {
31390 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
31391 return SDValue(N, 0);
31392 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
31393 return SDValue(N, 0);
31396 // Custom action for SELECT MMX
31397 if (VT == MVT::x86mmx) {
31398 LHS = DAG.getBitcast(MVT::i64, LHS);
31399 RHS = DAG.getBitcast(MVT::i64, RHS);
31400 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
31401 return DAG.getBitcast(VT, newSelect);
31407 /// Combine:
31408 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
31409 /// to:
31410 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
31411 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
31412 /// Note that this is only legal for some op/cc combinations.
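/// For example (an illustrative sketch, not the only shape handled):
///   %old = atomicrmw add i32* %p, i32 1
///   %cmp = icmp slt i32 %old, 0
/// can reuse the flags of 'lock incl (%p)': COND_LE on the incremented
/// value is equivalent to COND_S on the old value, so no separate re-load
/// and 'cmp' against zero is needed.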
31413 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
31415 const X86Subtarget &Subtarget) {
31416 // This combine only operates on CMP-like nodes.
31417 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31418 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31421 // Can't replace the cmp if it has more uses than the one we're looking at.
31422 // FIXME: We would like to be able to handle this, but would need to make sure
31423 // all uses were updated.
31424 if (!Cmp.hasOneUse())
31427 // This only applies to variations of the common case:
31428 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
31429 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
31430 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
31431 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
31432 // Using the proper condcodes (see below), overflow is checked for.
31434 // FIXME: We can generalize both constraints:
31435 // - XOR/OR/AND (if they were made to survive AtomicExpand)
31437 // if the result is compared.
31439 SDValue CmpLHS = Cmp.getOperand(0);
31440 SDValue CmpRHS = Cmp.getOperand(1);
31442 if (!CmpLHS.hasOneUse())
31445 unsigned Opc = CmpLHS.getOpcode();
31446 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
31449 SDValue OpRHS = CmpLHS.getOperand(2);
31450 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
31454 APInt Addend = OpRHSC->getAPIntValue();
31455 if (Opc == ISD::ATOMIC_LOAD_SUB)
31458 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
31462 APInt Comparison = CmpRHSC->getAPIntValue();
31464 // If the addend is the negation of the comparison value, then we can do
31465 // a full comparison by emitting the atomic arithmetic as a locked sub.
31466 if (Comparison == -Addend) {
31467 // The CC is fine, but we need to rewrite the LHS of the comparison as an
31469 auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
31470 auto AtomicSub = DAG.getAtomic(
31471 ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
31472 /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
31473 /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
31474 AN->getMemOperand());
31475 // If the comparison uses the CF flag we can't use INC/DEC instructions.
31476 bool NeedCF = false;
31479 case X86::COND_A: case X86::COND_AE:
31480 case X86::COND_B: case X86::COND_BE:
31484 auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
31485 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31486 DAG.getUNDEF(CmpLHS.getValueType()));
31487 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
31491 // We can handle comparisons with zero in a number of cases by manipulating
31492 // the CC used.
31493 if (!Comparison.isNullValue())
31496 if (CC == X86::COND_S && Addend == 1)
31498 else if (CC == X86::COND_NS && Addend == 1)
31500 else if (CC == X86::COND_G && Addend == -1)
31502 else if (CC == X86::COND_LE && Addend == -1)
31507 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
31508 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
31509 DAG.getUNDEF(CmpLHS.getValueType()));
31510 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
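// Sanity check on the CC rewrites above (illustrative): 'x s< 0' (COND_S)
// is equivalent to '(x + 1) s<= 0' (COND_LE on the LADD flags) even when
// x + 1 overflows: for x == INT_MAX both sides are false, since the add
// leaves SF == OF == 1 and ZF == 0, and COND_LE tests ZF | (SF != OF).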
31514 // Check whether a boolean test is testing a boolean value generated by
31515 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper
31516 // condition flag.
31518 // Simplify the following patterns:
31519 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
31520 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
31521 // to (Op EFLAGS Cond)
31523 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
31524 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
31525 // to (Op EFLAGS !Cond)
31527 // where Op could be BRCOND or CMOV.
31529 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
31530 // This combine only operates on CMP-like nodes.
31531 if (!(Cmp.getOpcode() == X86ISD::CMP ||
31532 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
31535 // Quit if not used as a boolean value.
31536 if (CC != X86::COND_E && CC != X86::COND_NE)
31539 // Check CMP operands. One of them should be 0 or 1 and the other should be
31540 // a SETCC or a value extended from it.
31541 SDValue Op1 = Cmp.getOperand(0);
31542 SDValue Op2 = Cmp.getOperand(1);
31545 const ConstantSDNode* C = nullptr;
31546 bool needOppositeCond = (CC == X86::COND_E);
31547 bool checkAgainstTrue = false; // Is it a comparison against 1?
31549 if ((C = dyn_cast<ConstantSDNode>(Op1)))
31551 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
31553 else // Quit if neither operand is a constant.
31556 if (C->getZExtValue() == 1) {
31557 needOppositeCond = !needOppositeCond;
31558 checkAgainstTrue = true;
31559 } else if (C->getZExtValue() != 0)
31560 // Quit if the constant is neither 0 nor 1.
31563 bool truncatedToBoolWithAnd = false;
31564 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
31565 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
31566 SetCC.getOpcode() == ISD::TRUNCATE ||
31567 SetCC.getOpcode() == ISD::AND) {
31568 if (SetCC.getOpcode() == ISD::AND) {
31570 if (isOneConstant(SetCC.getOperand(0)))
31572 if (isOneConstant(SetCC.getOperand(1)))
31576 SetCC = SetCC.getOperand(OpIdx);
31577 truncatedToBoolWithAnd = true;
31579 SetCC = SetCC.getOperand(0);
31582 switch (SetCC.getOpcode()) {
31583 case X86ISD::SETCC_CARRY:
31584 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
31585 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
31586 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
31587 // truncated to i1 using 'and'.
31588 if (checkAgainstTrue && !truncatedToBoolWithAnd)
31590 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
31591 "Invalid use of SETCC_CARRY!");
31593 case X86ISD::SETCC:
31594 // Set the condition code or opposite one if necessary.
31595 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
31596 if (needOppositeCond)
31597 CC = X86::GetOppositeBranchCondition(CC);
31598 return SetCC.getOperand(1);
31599 case X86ISD::CMOV: {
31600 // Check whether the false/true values are canonical, i.e. 0 or 1.
31601 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
31602 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
31603 // Quit if true value is not a constant.
31606 // Quit if false value is not a constant.
31608 SDValue Op = SetCC.getOperand(0);
31609 // Skip 'zext' or 'trunc' node.
31610 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
31611 Op.getOpcode() == ISD::TRUNCATE)
31612 Op = Op.getOperand(0);
31613 // A special case for rdrand/rdseed, where 0 is set if the false cond is
31614 // found.
31615 if ((Op.getOpcode() != X86ISD::RDRAND &&
31616 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
31619 // Quit if false value is not the constant 0 or 1.
31620 bool FValIsFalse = true;
31621 if (FVal && FVal->getZExtValue() != 0) {
31622 if (FVal->getZExtValue() != 1)
31624 // If FVal is 1, opposite cond is needed.
31625 needOppositeCond = !needOppositeCond;
31626 FValIsFalse = false;
31628 // Quit if TVal is not the constant opposite of FVal.
31629 if (FValIsFalse && TVal->getZExtValue() != 1)
31631 if (!FValIsFalse && TVal->getZExtValue() != 0)
31633 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
31634 if (needOppositeCond)
31635 CC = X86::GetOppositeBranchCondition(CC);
31636 return SetCC.getOperand(3);
31643 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
31644 /// Match:
31645 /// (X86or (X86setcc) (X86setcc))
31646 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
31647 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
31648 X86::CondCode &CC1, SDValue &Flags,
31650 if (Cond->getOpcode() == X86ISD::CMP) {
31651 if (!isNullConstant(Cond->getOperand(1)))
31654 Cond = Cond->getOperand(0);
31659 SDValue SetCC0, SetCC1;
31660 switch (Cond->getOpcode()) {
31661 default: return false;
31668 SetCC0 = Cond->getOperand(0);
31669 SetCC1 = Cond->getOperand(1);
31673 // Make sure we have SETCC nodes, using the same flags value.
31674 if (SetCC0.getOpcode() != X86ISD::SETCC ||
31675 SetCC1.getOpcode() != X86ISD::SETCC ||
31676 SetCC0->getOperand(1) != SetCC1->getOperand(1))
31679 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
31680 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
31681 Flags = SetCC0->getOperand(1);
31685 // When legalizing carry, we create carries via add X, -1
31686 // If that comes from an actual carry, via setcc, we use the
31687 // carry directly.
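// E.g. (an illustrative sketch): for
//   EFLAGS = (X86ISD::ADD (zext (setcc COND_B, Flags)), -1)
// the add's carry-out is set exactly when the setcc produced 1, so a CF
// consumer can be handed 'Flags' directly and the add/zext pair goes dead.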
31688 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
31689 if (EFLAGS.getOpcode() == X86ISD::ADD) {
31690 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
31691 SDValue Carry = EFLAGS.getOperand(0);
31692 while (Carry.getOpcode() == ISD::TRUNCATE ||
31693 Carry.getOpcode() == ISD::ZERO_EXTEND ||
31694 Carry.getOpcode() == ISD::SIGN_EXTEND ||
31695 Carry.getOpcode() == ISD::ANY_EXTEND ||
31696 (Carry.getOpcode() == ISD::AND &&
31697 isOneConstant(Carry.getOperand(1))))
31698 Carry = Carry.getOperand(0);
31699 if (Carry.getOpcode() == X86ISD::SETCC ||
31700 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
31701 if (Carry.getConstantOperandVal(0) == X86::COND_B)
31702 return Carry.getOperand(1);
31710 /// Optimize an EFLAGS definition used according to the condition code \p CC
31711 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
31712 /// uses of chain values.
31713 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
31715 const X86Subtarget &Subtarget) {
31716 if (CC == X86::COND_B)
31717 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
31720 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
31722 return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
31725 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
31726 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
31727 TargetLowering::DAGCombinerInfo &DCI,
31728 const X86Subtarget &Subtarget) {
31731 SDValue FalseOp = N->getOperand(0);
31732 SDValue TrueOp = N->getOperand(1);
31733 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31734 SDValue Cond = N->getOperand(3);
31736 if (CC == X86::COND_E || CC == X86::COND_NE) {
31737 switch (Cond.getOpcode()) {
31741 // If the operand of BSR/BSF is proven never zero, then ZF cannot be set.
31742 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
31743 return (CC == X86::COND_E) ? FalseOp : TrueOp;
31747 // Try to simplify the EFLAGS and condition code operands.
31748 // We can't always do this as FCMOV only supports a subset of X86 cond.
31749 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
31750 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
31751 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
31753 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31757 // If this is a select between two integer constants, try to do some
31758 // optimizations. Note that the operands are ordered the opposite of SELECT
31760 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
31761 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
31762 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31763 // larger than FalseC (the false value).
31764 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
31765 CC = X86::GetOppositeBranchCondition(CC);
31766 std::swap(TrueC, FalseC);
31767 std::swap(TrueOp, FalseOp);
31770 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
31771 // This is efficient for any integer data type (including i8/i16) and any
31772 // shift amount.
31773 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31774 Cond = getSETCC(CC, Cond, DL, DAG);
31776 // Zero extend the condition if needed.
31777 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31779 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31780 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31781 DAG.getConstant(ShAmt, DL, MVT::i8));
31785 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
31786 // for any integer data type, including i8/i16.
31787 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31788 Cond = getSETCC(CC, Cond, DL, DAG);
31790 // Zero extend the condition if needed.
31791 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31792 FalseC->getValueType(0), Cond);
31793 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31794 SDValue(FalseC, 0));
31798 // Optimize cases that will turn into an LEA instruction. This requires
31799 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31800 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31801 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31802 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31804 bool isFastMultiplier = false;
31806 switch ((unsigned char)Diff) {
31808 case 1: // result = add base, cond
31809 case 2: // result = lea base( , cond*2)
31810 case 3: // result = lea base(cond, cond*2)
31811 case 4: // result = lea base( , cond*4)
31812 case 5: // result = lea base(cond, cond*4)
31813 case 8: // result = lea base( , cond*8)
31814 case 9: // result = lea base(cond, cond*8)
31815 isFastMultiplier = true;
31820 if (isFastMultiplier) {
31821 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31822 Cond = getSETCC(CC, Cond, DL ,DAG);
31823 // Zero extend the condition if needed.
31824 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31826 // Scale the condition by the difference.
31828 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31829 DAG.getConstant(Diff, DL, Cond.getValueType()));
31831 // Add the base if non-zero.
31832 if (FalseC->getAPIntValue() != 0)
31833 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31834 SDValue(FalseC, 0));
31841 // Handle these cases:
31842 // (select (x != c), e, c) -> (select (x != c), e, x),
31843 // (select (x == c), c, e) -> (select (x == c), x, e)
31844 // where the c is an integer constant, and the "select" is the combination
31845 // of CMOV and CMP.
31847 // The rationale for this change is that the conditional-move from a constant
31848 // needs two instructions, whereas a conditional-move from a register
31849 // needs only one instruction.
31851 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31852 // some instruction-combining opportunities. This opt needs to be
31853 // postponed as late as possible.
31855 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31856 // the DCI.xxxx conditions are provided to postpone the optimization as
31857 // late as possible.
31859 ConstantSDNode *CmpAgainst = nullptr;
31860 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31861 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31862 !isa<ConstantSDNode>(Cond.getOperand(0))) {
31864 if (CC == X86::COND_NE &&
31865 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31866 CC = X86::GetOppositeBranchCondition(CC);
31867 std::swap(TrueOp, FalseOp);
31870 if (CC == X86::COND_E &&
31871 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31872 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31873 DAG.getConstant(CC, DL, MVT::i8), Cond };
31874 return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31879 // Fold and/or of setcc's to double CMOV:
31880 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31881 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31883 // This combine lets us generate:
31884 // cmovcc1 (jcc1 if we don't have CMOV)
31890 // cmovne (jne if we don't have CMOV)
31891 // When we can't use the CMOV instruction, it might increase branch
31892 // mispredicts.
31893 // When we can use CMOV, or when there is no mispredict, this improves
31894 // throughput and reduces register pressure.
31896 if (CC == X86::COND_NE) {
31898 X86::CondCode CC0, CC1;
31900 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31902 std::swap(FalseOp, TrueOp);
31903 CC0 = X86::GetOppositeBranchCondition(CC0);
31904 CC1 = X86::GetOppositeBranchCondition(CC1);
31907 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31909 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
31910 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31911 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
31919 /// Different mul shrinking modes.
31920 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
31922 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31923 EVT VT = N->getOperand(0).getValueType();
31924 if (VT.getScalarSizeInBits() != 32)
31927 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31928 unsigned SignBits[2] = {1, 1};
31929 bool IsPositive[2] = {false, false};
31930 for (unsigned i = 0; i < 2; i++) {
31931 SDValue Opd = N->getOperand(i);
31933 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
31934 // compute signbits for it separately.
31935 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
31936 // For anyextend, it is safe to assume an appropriate number of leading
31937 // sign/zero bits.
31938 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
31940 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
31945 IsPositive[i] = true;
31946 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31947 // All the operands of BUILD_VECTOR need to be integer constants.
31948 // Find the smallest value range which all the operands belong to.
31950 IsPositive[i] = true;
31951 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
31952 if (SubOp.isUndef())
31954 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
31957 APInt IntVal = CN->getAPIntValue();
31958 if (IntVal.isNegative())
31959 IsPositive[i] = false;
31960 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31963 SignBits[i] = DAG.ComputeNumSignBits(Opd);
31964 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31965 IsPositive[i] = true;
31969 bool AllPositive = IsPositive[0] && IsPositive[1];
31970 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
31971 // When ranges are from -128 ~ 127, use MULS8 mode.
31972 if (MinSignBits >= 25)
31974 // When ranges are from 0 ~ 255, use MULU8 mode.
31975 else if (AllPositive && MinSignBits >= 24)
31977 // When ranges are from -32768 ~ 32767, use MULS16 mode.
31978 else if (MinSignBits >= 17)
31980 // When ranges are from 0 ~ 65535, use MULU16 mode.
31981 else if (AllPositive && MinSignBits >= 16)
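// (Illustrative arithmetic behind the thresholds above: 25 sign bits on an
// i32 lane leave 32 - 25 + 1 = 8 significant signed bits, i.e. [-128, 127];
// 24 sign bits on a known-non-negative lane bound it to [0, 255]; 17 and 16
// are the analogous i16 bounds.)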
31988 /// When the operands of vector mul are extended from smaller size values,
31989 /// like i8 and i16, the type of the mul may be shrunk to generate more
31990 /// efficient code. Two typical patterns are handled:
31992 /// %2 = sext/zext <N x i8> %1 to <N x i32>
31993 /// %4 = sext/zext <N x i8> %3 to <N x i32>
31994 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31995 /// %5 = mul <N x i32> %2, %4
31998 /// %2 = zext/sext <N x i16> %1 to <N x i32>
31999 /// %4 = zext/sext <N x i16> %3 to <N x i32>
32000 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
32001 /// %5 = mul <N x i32> %2, %4
32003 /// There are four mul shrinking modes:
32004 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
32005 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
32006 /// generate pmullw+sext32 for it (MULS8 mode).
32007 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
32008 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
32009 /// generate pmullw+zext32 for it (MULU8 mode).
32010 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
32011 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
32012 /// generate pmullw+pmulhw for it (MULS16 mode).
32013 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
32014 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
32015 /// generate pmullw+pmulhuw for it (MULU16 mode).
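/// Worked example (illustrative):
///   %5 = mul <8 x i32> (zext <8 x i8> %a), (zext <8 x i8> %b)
/// falls into MULU8: both ranges are 0..255, so every product fits in 16
/// bits and the mul becomes a single pmullw on <8 x i16> followed by a
/// zero extension of the 16-bit products back to <8 x i32>.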
32016 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
32017 const X86Subtarget &Subtarget) {
32018 // Check for legality
32019 // pmullw/pmulhw are not supported before SSE2.
32020 if (!Subtarget.hasSSE2())
32023 // Check for profitability
32024 // pmulld is supported since SSE4.1. It is better to use pmulld
32025 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
32026 // the expanded sequence.
32027 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
32028 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
32032 if (!canReduceVMulWidth(N, DAG, Mode))
32036 SDValue N0 = N->getOperand(0);
32037 SDValue N1 = N->getOperand(1);
32038 EVT VT = N->getOperand(0).getValueType();
32039 unsigned NumElts = VT.getVectorNumElements();
32040 if ((NumElts % 2) != 0)
32043 unsigned RegSize = 128;
32044 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
32045 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
32047 // Shrink the operands of mul.
32048 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
32049 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
32051 if (NumElts >= OpsVT.getVectorNumElements()) {
32052 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
32053 // lower part is needed.
32054 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
32055 if (Mode == MULU8 || Mode == MULS8) {
32056 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
32059 MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
32060 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
32061 // the higher part is also needed.
32062 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32063 ReducedVT, NewN0, NewN1);
32065 // Repack the lower part and higher part result of mul into a wider
32066 // result.
32067 // Generate shuffle functioning as punpcklwd.
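// E.g. (illustrative) for NumElts == 8 the loop below builds the mask
// <0, 8, 1, 9, 2, 10, 3, 11>, interleaving MulLo/MulHi like punpcklwd so
// that, after the bitcast, each i32 lane holds (hi16 << 16) | lo16 of the
// corresponding full product.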
32068 SmallVector<int, 16> ShuffleMask(NumElts);
32069 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32070 ShuffleMask[2 * i] = i;
32071 ShuffleMask[2 * i + 1] = i + NumElts;
32074 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32075 ResLo = DAG.getBitcast(ResVT, ResLo);
32076 // Generate shuffle functioning as punpckhwd.
32077 for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
32078 ShuffleMask[2 * i] = i + NumElts / 2;
32079 ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
32082 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
32083 ResHi = DAG.getBitcast(ResVT, ResHi);
32084 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
32087 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
32088 // to legalize the mul explicitly because implicit legalization for type
32089 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
32090 // instructions which will not exist when we explicitly legalize it by
32091 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
32092 // <4 x i16> undef).
32094 // Legalize the operands of mul.
32095 // FIXME: We may be able to handle non-concatenated vectors by insertion.
32096 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
32097 if ((RegSize % ReducedSizeInBits) != 0)
32100 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
32101 DAG.getUNDEF(ReducedVT));
32103 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32105 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
32107 if (Mode == MULU8 || Mode == MULS8) {
32108 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
32110 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32112 // Convert the type of the mul result to VT.
32113 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32114 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
32115 : ISD::SIGN_EXTEND_VECTOR_INREG,
32117 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32118 DAG.getIntPtrConstant(0, DL));
32120 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
32121 // MULU16/MULS16, both parts are needed.
32122 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
32123 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
32124 OpsVT, NewN0, NewN1);
32126 // Repack the lower part and higher part result of mul into a wider
32127 // result. Make sure the type of mul result is VT.
32128 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
32129 SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
32130 Res = DAG.getBitcast(ResVT, Res);
32131 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
32132 DAG.getIntPtrConstant(0, DL));
32137 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
32138 EVT VT, SDLoc DL) {
32140 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
32141 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32142 DAG.getConstant(Mult, DL, VT));
32143 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
32144 DAG.getConstant(Shift, DL, MVT::i8));
32145 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32146 N->getOperand(0));
32147 return Result;
32148 };
32150 auto combineMulMulAddOrSub = [&](bool isAdd) {
32151 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32152 DAG.getConstant(9, DL, VT));
32153 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
32154 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
32155 N->getOperand(0));
32156 return Result;
32157 };
32159 switch (MulAmt) {
32160 default:
32161 break;
32162 case 11:
32163 // mul x, 11 => add ((shl (mul x, 5), 1), x)
32164 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
32165 case 21:
32166 // mul x, 21 => add ((shl (mul x, 5), 2), x)
32167 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
32168 case 22:
32169 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
32170 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32171 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
32172 case 19:
32173 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
32174 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
32175 case 13:
32176 // mul x, 13 => add ((shl (mul x, 3), 2), x)
32177 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
32178 case 23:
32179 // mul x, 23 => sub ((shl (mul x, 3), 3), x)
32180 return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
32181 case 14:
32182 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
32183 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32184 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
32185 case 26:
32186 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
32187 return combineMulMulAddOrSub(/*isAdd*/ false);
32188 case 28:
32189 // mul x, 28 => add ((mul (mul x, 9), 3), x)
32190 return combineMulMulAddOrSub(/*isAdd*/ true);
32191 case 29:
32192 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
32193 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
32194 combineMulMulAddOrSub(/*isAdd*/ true));
32195 case 30:
32196 // mul x, 30 => sub (sub ((shl x, 5), x), x)
32197 return DAG.getNode(
32198 ISD::SUB, DL, VT,
32199 DAG.getNode(ISD::SUB, DL, VT,
32200 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32201 DAG.getConstant(5, DL, MVT::i8)),
32202 N->getOperand(0)),
32203 N->getOperand(0));
32204 }
32205 return SDValue();
32206 }
32208 /// Optimize a single multiply with constant into two operations in order to
32209 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
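/// For example, (mul x, 45) can be rewritten as (mul (mul x, 9), 5), where
/// each multiply by 3, 5, or 9 is a single LEA-style instruction.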
32210 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
32211 TargetLowering::DAGCombinerInfo &DCI,
32212 const X86Subtarget &Subtarget) {
32213 EVT VT = N->getValueType(0);
32214 if (DCI.isBeforeLegalize() && VT.isVector())
32215 return reduceVMULWidth(N, DAG, Subtarget);
32217 if (!MulConstantOptimization)
32218 return SDValue();
32219 // An imul is usually smaller than the alternative sequence.
32220 if (DAG.getMachineFunction().getFunction()->optForMinSize())
32221 return SDValue();
32223 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
32224 return SDValue();
32226 if (VT != MVT::i64 && VT != MVT::i32)
32227 return SDValue();
32229 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
32230 if (!C)
32231 return SDValue();
32232 uint64_t MulAmt = C->getZExtValue();
32233 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
32234 return SDValue();
32236 uint64_t MulAmt1 = 0;
32237 uint64_t MulAmt2 = 0;
32238 if ((MulAmt % 9) == 0) {
32239 MulAmt1 = 9;
32240 MulAmt2 = MulAmt / 9;
32241 } else if ((MulAmt % 5) == 0) {
32242 MulAmt1 = 5;
32243 MulAmt2 = MulAmt / 5;
32244 } else if ((MulAmt % 3) == 0) {
32245 MulAmt1 = 3;
32246 MulAmt2 = MulAmt / 3;
32247 }
32249 SDLoc DL(N);
32250 SDValue NewMul;
32251 if (MulAmt2 &&
32252 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
32254 if (isPowerOf2_64(MulAmt2) &&
32255 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
32256 // If the second multiplier is pow2, issue it first. We want the multiply
32257 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use
32258 // is an add.
32259 std::swap(MulAmt1, MulAmt2);
32261 if (isPowerOf2_64(MulAmt1))
32262 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32263 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
32264 else
32265 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
32266 DAG.getConstant(MulAmt1, DL, VT));
32268 if (isPowerOf2_64(MulAmt2))
32269 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
32270 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
32271 else
32272 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
32273 DAG.getConstant(MulAmt2, DL, VT));
32274 } else if (!Subtarget.slowLEA())
32275 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
32277 if (!NewMul) {
32278 assert(MulAmt != 0 &&
32279 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
32280 "Both cases that could cause potential overflows should have "
32281 "already been handled.");
32282 int64_t SignMulAmt = C->getSExtValue();
32283 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
32284 (SignMulAmt != -INT64_MAX)) {
32285 int NumSign = SignMulAmt > 0 ? 1 : -1;
32286 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
32287 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
32288 if (IsPowerOf2_64PlusOne) {
32289 // (mul x, 2^N + 1) => (add (shl x, N), x)
32290 NewMul = DAG.getNode(
32291 ISD::ADD, DL, VT, N->getOperand(0),
32292 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32293 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
32294 MVT::i8)));
32295 } else if (IsPowerOf2_64MinusOne) {
32296 // (mul x, 2^N - 1) => (sub (shl x, N), x)
32297 NewMul = DAG.getNode(
32298 ISD::SUB, DL, VT,
32299 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
32300 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
32301 MVT::i8)),
32302 N->getOperand(0));
32303 }
32304 // To negate, subtract the number from zero
32305 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
32306 NewMul =
32307 DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
32308 }
32309 }
32311 if (NewMul)
32312 // Do not add new nodes to DAG combiner worklist.
32313 DCI.CombineTo(N, NewMul, false);
32315 return SDValue();
32316 }
32318 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
32319 SDValue N0 = N->getOperand(0);
32320 SDValue N1 = N->getOperand(1);
32321 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
32322 EVT VT = N0.getValueType();
32324 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
32325 // since the result of setcc_c is all zero's or all ones.
32326 if (VT.isInteger() && !VT.isVector() &&
32327 N1C && N0.getOpcode() == ISD::AND &&
32328 N0.getOperand(1).getOpcode() == ISD::Constant) {
32329 SDValue N00 = N0.getOperand(0);
32330 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
32331 Mask <<= N1C->getAPIntValue();
32332 bool MaskOK = false;
32333 // We can handle cases concerning bit-widening nodes containing setcc_c if
32334 // we carefully interrogate the mask to make sure we are semantics
32335 // preserving.
32336 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
32337 // of the underlying setcc_c operation if the setcc_c was zero extended.
32338 // Consider the following example:
32339 // zext(setcc_c) -> i32 0x0000FFFF
32340 // c1 -> i32 0x0000FFFF
32341 // c2 -> i32 0x00000001
32342 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
32343 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
32344 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32345 MaskOK = true;
32346 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
32347 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32348 MaskOK = true;
32349 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
32350 N00.getOpcode() == ISD::ANY_EXTEND) &&
32351 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
32352 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
32353 }
32354 if (MaskOK && Mask != 0) {
32355 SDLoc DL(N);
32356 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
32357 }
32358 }
32360 // Hardware support for vector shifts is sparse which makes us scalarize the
32361 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than
32362 // shl.
32363 // (shl V, 1) -> add V,V
32364 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
32365 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
32366 assert(N0.getValueType().isVector() && "Invalid vector shift type");
32367 // We shift all of the values by one. In many cases we do not have
32368 // hardware support for this operation. This is better expressed as an ADD
32369 // of two values.
32370 if (N1SplatC->getAPIntValue() == 1)
32371 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
32372 }
32374 return SDValue();
32375 }
32377 static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
32378 SDValue N0 = N->getOperand(0);
32379 SDValue N1 = N->getOperand(1);
32380 EVT VT = N0.getValueType();
32381 unsigned Size = VT.getSizeInBits();
32383 // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
32384 // into (shl (sext a), [56,48,32,24,16] - SarConst) or
32385 // into (lshr (sext a), SarConst - [56,48,32,24,16])
32386 // depending on the sign of (SarConst - [56,48,32,24,16])
32388 // sexts in X86 are MOVs. The MOVs have the same code size
32389 // as above SHIFTs (only SHIFT by 1 has lower code size).
32390 // However the MOVs have two advantages over a SHIFT:
32391 // 1. MOVs can write to a register that differs from source
32392 // 2. MOVs accept memory operands
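// For example, with i64 operands, (ashr (shl x, 56), 58) becomes
// (ashr (sext_inreg x, i8), 2): ShlConst == 56 selects the i8 sub-type and
// SarConst - (64 - 8) == 2 leaves a residual arithmetic shift of 2.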
32394 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
32395 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
32396 N0.getOperand(1).getOpcode() != ISD::Constant)
32397 return SDValue();
32399 SDValue N00 = N0.getOperand(0);
32400 SDValue N01 = N0.getOperand(1);
32401 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
32402 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
32403 EVT CVT = N1.getValueType();
32405 if (SarConst.isNegative())
32406 return SDValue();
32408 for (MVT SVT : MVT::integer_valuetypes()) {
32409 unsigned ShiftSize = SVT.getSizeInBits();
32410 // skipping types without corresponding sext/zext and
32411 // ShlConst that is not one of [56,48,32,24,16]
32412 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
32413 continue;
32414 SDLoc DL(N);
32415 SDValue NN =
32416 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
32417 SarConst = SarConst - (Size - ShiftSize);
32418 if (SarConst == 0)
32419 return NN;
32420 else if (SarConst.isNegative())
32421 return DAG.getNode(ISD::SHL, DL, VT, NN,
32422 DAG.getConstant(-SarConst, DL, CVT));
32423 else
32424 return DAG.getNode(ISD::SRA, DL, VT, NN,
32425 DAG.getConstant(SarConst, DL, CVT));
32426 }
32427 return SDValue();
32428 }
32430 static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
32431 SDValue N0 = N->getOperand(0);
32432 SDValue N1 = N->getOperand(1);
32433 EVT VT = N0.getValueType();
32435 // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
32436 // TODO: This is a generic DAG combine that became an x86-only combine to
32437 // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
32438 // and-not ('andn').
32439 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
32440 return SDValue();
32442 auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
32443 auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32444 if (!ShiftC || !AndC)
32445 return SDValue();
32447 // If we can shrink the constant mask below 8-bits or 32-bits, then this
32448 // transform should reduce code size. It may also enable secondary transforms
32449 // from improved known-bits analysis or instruction selection.
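// For example, (srl (and X, 0x7f00), 8) becomes (and (srl X, 8), 0x7f); the
// shifted mask now fits in a sign-extended 8-bit immediate.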
32450 APInt MaskVal = AndC->getAPIntValue();
32451 APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
32452 unsigned OldMaskSize = MaskVal.getMinSignedBits();
32453 unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
32454 if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
32455 (OldMaskSize > 32 && NewMaskSize <= 32)) {
32456 // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
32457 SDLoc DL(N);
32458 SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
32459 SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
32460 return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
32461 }
32463 return SDValue();
32464 }
32465 /// \brief Returns a vector of 0s if the input node is a vector logical
32466 /// shift by a constant amount which is known to be bigger than or equal
32467 /// to the vector element size in bits.
32468 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
32469 const X86Subtarget &Subtarget) {
32470 EVT VT = N->getValueType(0);
32472 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
32473 (!Subtarget.hasInt256() ||
32474 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
32475 return SDValue();
32477 SDValue Amt = N->getOperand(1);
32478 SDLoc DL(N);
32479 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
32480 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
32481 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
32482 unsigned MaxAmount =
32483 VT.getSimpleVT().getScalarSizeInBits();
32485 // SSE2/AVX2 logical shifts always return a vector of 0s
32486 // if the shift amount is bigger than or equal to
32487 // the element size. The constant shift amount will be
32488 // encoded as an 8-bit immediate.
32489 if (ShiftAmt.trunc(8).uge(MaxAmount))
32490 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
32491 }
32493 return SDValue();
32494 }
32496 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
32497 TargetLowering::DAGCombinerInfo &DCI,
32498 const X86Subtarget &Subtarget) {
32499 if (N->getOpcode() == ISD::SHL)
32500 if (SDValue V = combineShiftLeft(N, DAG))
32501 return V;
32503 if (N->getOpcode() == ISD::SRA)
32504 if (SDValue V = combineShiftRightArithmetic(N, DAG))
32505 return V;
32507 if (N->getOpcode() == ISD::SRL)
32508 if (SDValue V = combineShiftRightLogical(N, DAG))
32509 return V;
32511 // Try to fold this logical shift into a zero vector.
32512 if (N->getOpcode() != ISD::SRA)
32513 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
32514 return V;
32516 return SDValue();
32517 }
32519 static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
32520 TargetLowering::DAGCombinerInfo &DCI,
32521 const X86Subtarget &Subtarget) {
32522 unsigned Opcode = N->getOpcode();
32523 assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
32524 "Unexpected shift opcode");
32526 EVT VT = N->getValueType(0);
32527 SDValue N0 = N->getOperand(0);
32528 SDValue N1 = N->getOperand(1);
32529 unsigned DstBitsPerElt = VT.getScalarSizeInBits();
32530 unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
32531 assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
32532 N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
32533 "Unexpected PACKSS/PACKUS input type");
32535 // Constant Folding.
32536 APInt UndefElts0, UndefElts1;
32537 SmallVector<APInt, 32> EltBits0, EltBits1;
32538 if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
32539 (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
32540 getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
32541 getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
32542 unsigned NumLanes = VT.getSizeInBits() / 128;
32543 unsigned NumDstElts = VT.getVectorNumElements();
32544 unsigned NumSrcElts = NumDstElts / 2;
32545 unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
32546 unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
32547 bool IsSigned = (X86ISD::PACKSS == Opcode);
32549 APInt Undefs(NumDstElts, 0);
32550 SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
32551 for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
32552 for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
32553 unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
32554 auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
32555 auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
32557 if (UndefElts[SrcIdx]) {
32558 Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
32559 continue;
32560 }
32562 APInt &Val = EltBits[SrcIdx];
32563 if (IsSigned) {
32564 // PACKSS: Truncate signed value with signed saturation.
32565 // Source values less than dst minint are saturated to minint.
32566 // Source values greater than dst maxint are saturated to maxint.
32567 if (Val.isSignedIntN(DstBitsPerElt))
32568 Val = Val.trunc(DstBitsPerElt);
32569 else if (Val.isNegative())
32570 Val = APInt::getSignedMinValue(DstBitsPerElt);
32571 else
32572 Val = APInt::getSignedMaxValue(DstBitsPerElt);
32573 } else {
32574 // PACKUS: Truncate signed value with unsigned saturation.
32575 // Source values less than zero are saturated to zero.
32576 // Source values greater than dst maxuint are saturated to maxuint.
32577 if (Val.isIntN(DstBitsPerElt))
32578 Val = Val.trunc(DstBitsPerElt);
32579 else if (Val.isNegative())
32580 Val = APInt::getNullValue(DstBitsPerElt);
32581 else
32582 Val = APInt::getAllOnesValue(DstBitsPerElt);
32583 }
32584 Bits[Lane * NumDstEltsPerLane + Elt] = Val;
32585 }
32586 }
32588 return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
32589 }
32591 // Attempt to combine as shuffle.
32592 SDValue Op(N, 0);
32593 if (SDValue Res = combineX86ShufflesRecursively(
32594 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32595 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32596 DCI.CombineTo(N, Res);
32597 return SDValue(N, 0);
32598 }
32600 return SDValue();
32601 }
32603 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
32604 TargetLowering::DAGCombinerInfo &DCI,
32605 const X86Subtarget &Subtarget) {
32606 unsigned Opcode = N->getOpcode();
32607 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
32608 X86ISD::VSRLI == Opcode) &&
32609 "Unexpected shift opcode");
32610 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
32611 EVT VT = N->getValueType(0);
32612 SDValue N0 = N->getOperand(0);
32613 SDValue N1 = N->getOperand(1);
32614 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
32615 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
32616 "Unexpected value type");
32618 // Out of range logical bit shifts are guaranteed to be zero.
32619 // Out of range arithmetic bit shifts splat the sign bit.
32620 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
32621 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
32622 if (LogicalShift)
32623 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
32624 else
32625 ShiftVal = NumBitsPerElt - 1;
32626 }
32628 // Shift N0 by zero -> N0.
32629 if (!ShiftVal)
32630 return N0;
32632 // Shift zero -> zero.
32633 if (ISD::isBuildVectorAllZeros(N0.getNode()))
32634 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
32636 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
32637 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
32638 // TODO - support other sra opcodes as needed.
32639 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
32640 N0.getOpcode() == X86ISD::VSRAI)
32641 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
32643 // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
32644 if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
32645 N1 == N0.getOperand(1)) {
32646 SDValue N00 = N0.getOperand(0);
32647 unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
32648 if (ShiftVal.ult(NumSignBits))
32649 return N00;
32650 }
32652 // We can decode 'whole byte' logical bit shifts as shuffles.
32653 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
32654 SDValue Op(N, 0);
32655 if (SDValue Res = combineX86ShufflesRecursively(
32656 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32657 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32658 DCI.CombineTo(N, Res);
32659 return SDValue(N, 0);
32660 }
32661 }
32663 // Constant Folding.
32664 APInt UndefElts;
32665 SmallVector<APInt, 32> EltBits;
32666 if (N->isOnlyUserOf(N0.getNode()) &&
32667 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
32668 assert(EltBits.size() == VT.getVectorNumElements() &&
32669 "Unexpected shift value type");
32670 unsigned ShiftImm = ShiftVal.getZExtValue();
32671 for (APInt &Elt : EltBits) {
32672 if (X86ISD::VSHLI == Opcode)
32673 Elt <<= ShiftImm;
32674 else if (X86ISD::VSRAI == Opcode)
32675 Elt.ashrInPlace(ShiftImm);
32676 else
32677 Elt.lshrInPlace(ShiftImm);
32678 }
32679 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
32680 }
32682 return SDValue();
32683 }
32685 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
32686 TargetLowering::DAGCombinerInfo &DCI,
32687 const X86Subtarget &Subtarget) {
32688 assert(
32689 ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
32690 (N->getOpcode() == X86ISD::PINSRW &&
32691 N->getValueType(0) == MVT::v8i16)) &&
32692 "Unexpected vector insertion");
32694 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
32695 SDValue Op(N, 0);
32696 if (SDValue Res = combineX86ShufflesRecursively(
32697 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
32698 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
32699 DCI.CombineTo(N, Res);
32700 return SDValue(N, 0);
32701 }
32703 return SDValue();
32704 }
32706 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
32707 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
32708 /// OR -> CMPNEQSS.
32709 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
32710 TargetLowering::DAGCombinerInfo &DCI,
32711 const X86Subtarget &Subtarget) {
32712 unsigned opcode;
32714 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
32715 // we're requiring SSE2 for both.
32716 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
32717 SDValue N0 = N->getOperand(0);
32718 SDValue N1 = N->getOperand(1);
32719 SDValue CMP0 = N0->getOperand(1);
32720 SDValue CMP1 = N1->getOperand(1);
32721 SDLoc DL(N);
32723 // The SETCCs should both refer to the same CMP.
32724 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
32725 return SDValue();
32727 SDValue CMP00 = CMP0->getOperand(0);
32728 SDValue CMP01 = CMP0->getOperand(1);
32729 EVT VT = CMP00.getValueType();
32731 if (VT == MVT::f32 || VT == MVT::f64) {
32732 bool ExpectingFlags = false;
32733 // Check for any users that want flags:
32734 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
32735 !ExpectingFlags && UI != UE; ++UI)
32736 switch (UI->getOpcode()) {
32737 default:
32738 case ISD::BR_CC:
32739 case ISD::BRCOND:
32740 case ISD::SELECT:
32741 ExpectingFlags = true;
32742 break;
32743 case ISD::CopyToReg:
32744 case ISD::SIGN_EXTEND:
32745 case ISD::ZERO_EXTEND:
32746 case ISD::ANY_EXTEND:
32747 break;
32748 }
32750 if (!ExpectingFlags) {
32751 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
32752 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
32754 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
32755 X86::CondCode tmp = cc0;
32756 cc0 = cc1;
32757 cc1 = tmp;
32758 }
32760 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
32761 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
32762 // FIXME: need symbolic constants for these magic numbers.
32763 // See X86ATTInstPrinter.cpp:printSSECC().
32764 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
32765 if (Subtarget.hasAVX512()) {
32766 SDValue FSetCC =
32767 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
32768 DAG.getConstant(x86cc, DL, MVT::i8));
32769 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
32770 FSetCC, DAG.getIntPtrConstant(0, DL));
32771 }
32772 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
32773 CMP00.getValueType(), CMP00, CMP01,
32774 DAG.getConstant(x86cc, DL,
32775 MVT::i8));
32777 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
32778 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
32780 if (is64BitFP && !Subtarget.is64Bit()) {
32781 // On a 32-bit target, we cannot bitcast the 64-bit float to a
32782 // 64-bit integer, since that's not a legal type. Since
32783 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
32784 // bits, but can do this little dance to extract the lowest 32 bits
32785 // and work with those going forward.
32786 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
32787 OnesOrZeroesF);
32788 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
32789 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
32790 Vector32, DAG.getIntPtrConstant(0, DL));
32791 IntVT = MVT::i32;
32792 }
32794 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
32795 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
32796 DAG.getConstant(1, DL, IntVT));
32797 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
32798 ANDed);
32799 return OneBitOfTruth;
32800 }
32801 }
32802 }
32803 }
32804 return SDValue();
32805 }
32807 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
32808 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
32809 assert(N->getOpcode() == ISD::AND);
32811 EVT VT = N->getValueType(0);
32812 SDValue N0 = N->getOperand(0);
32813 SDValue N1 = N->getOperand(1);
32814 SDLoc DL(N);
32816 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
32817 return SDValue();
32819 if (N0.getOpcode() == ISD::XOR &&
32820 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
32821 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
32823 if (N1.getOpcode() == ISD::XOR &&
32824 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
32825 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
32827 return SDValue();
32828 }
32830 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
32831 // register. In most cases we actually compare or select YMM-sized registers
32832 // and mixing the two types creates horrible code. This method optimizes
32833 // some of the transition sequences.
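// For example, (zext (and (trunc X), (trunc Y))) with 256-bit X and Y is
// rebuilt here as the wide (and X, Y) followed by a mask, so the values
// never leave YMM registers.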
32834 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
32835 TargetLowering::DAGCombinerInfo &DCI,
32836 const X86Subtarget &Subtarget) {
32837 EVT VT = N->getValueType(0);
32838 if (!VT.is256BitVector())
32839 return SDValue();
32841 assert((N->getOpcode() == ISD::ANY_EXTEND ||
32842 N->getOpcode() == ISD::ZERO_EXTEND ||
32843 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
32845 SDValue Narrow = N->getOperand(0);
32846 EVT NarrowVT = Narrow->getValueType(0);
32847 if (!NarrowVT.is128BitVector())
32848 return SDValue();
32850 if (Narrow->getOpcode() != ISD::XOR &&
32851 Narrow->getOpcode() != ISD::AND &&
32852 Narrow->getOpcode() != ISD::OR)
32853 return SDValue();
32855 SDValue N0 = Narrow->getOperand(0);
32856 SDValue N1 = Narrow->getOperand(1);
32857 SDLoc DL(N);
32859 // The left side has to be a trunc.
32860 if (N0.getOpcode() != ISD::TRUNCATE)
32861 return SDValue();
32863 // The type of the truncated inputs.
32864 EVT WideVT = N0->getOperand(0)->getValueType(0);
32865 if (WideVT != VT)
32866 return SDValue();
32868 // The right side has to be a 'trunc' or a constant vector.
32869 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
32870 ConstantSDNode *RHSConstSplat = nullptr;
32871 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
32872 RHSConstSplat = RHSBV->getConstantSplatNode();
32873 if (!RHSTrunc && !RHSConstSplat)
32874 return SDValue();
32876 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32878 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
32879 return SDValue();
32881 // Set N0 and N1 to hold the inputs to the new wide operation.
32882 N0 = N0->getOperand(0);
32883 if (RHSConstSplat) {
32884 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
32885 SDValue(RHSConstSplat, 0));
32886 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
32887 } else if (RHSTrunc) {
32888 N1 = N1->getOperand(0);
32889 }
32891 // Generate the wide operation.
32892 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
32893 unsigned Opcode = N->getOpcode();
32894 switch (Opcode) {
32895 case ISD::ANY_EXTEND:
32896 return Op;
32897 case ISD::ZERO_EXTEND: {
32898 unsigned InBits = NarrowVT.getScalarSizeInBits();
32899 APInt Mask = APInt::getAllOnesValue(InBits);
32900 Mask = Mask.zext(VT.getScalarSizeInBits());
32901 return DAG.getNode(ISD::AND, DL, VT,
32902 Op, DAG.getConstant(Mask, DL, VT));
32903 }
32904 case ISD::SIGN_EXTEND:
32905 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32906 Op, DAG.getValueType(NarrowVT));
32907 default:
32908 llvm_unreachable("Unexpected opcode");
32909 }
32910 }
32912 /// If both input operands of a logic op are being cast from floating point
32913 /// types, try to convert this into a floating point logic node to avoid
32914 /// unnecessary moves from SSE to integer registers.
32915 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32916 const X86Subtarget &Subtarget) {
32917 unsigned FPOpcode = ISD::DELETED_NODE;
32918 if (N->getOpcode() == ISD::AND)
32919 FPOpcode = X86ISD::FAND;
32920 else if (N->getOpcode() == ISD::OR)
32921 FPOpcode = X86ISD::FOR;
32922 else if (N->getOpcode() == ISD::XOR)
32923 FPOpcode = X86ISD::FXOR;
32925 assert(FPOpcode != ISD::DELETED_NODE &&
32926 "Unexpected input node for FP logic conversion");
32928 EVT VT = N->getValueType(0);
32929 SDValue N0 = N->getOperand(0);
32930 SDValue N1 = N->getOperand(1);
32931 SDLoc DL(N);
32932 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32933 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32934 (Subtarget.hasSSE2() && VT == MVT::i64))) {
32935 SDValue N00 = N0.getOperand(0);
32936 SDValue N10 = N1.getOperand(0);
32937 EVT N00Type = N00.getValueType();
32938 EVT N10Type = N10.getValueType();
32939 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32940 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32941 return DAG.getBitcast(VT, FPLogic);
32942 }
32943 }
32944 return SDValue();
32945 }
32947 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
32948 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32949 /// with a shift-right to eliminate loading the vector constant mask value.
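/// For example, with v4i32 operands where Op0 is known to be all-ones or
/// all-zeros per element, (and Op0, 0x0000ffff) becomes (vsrli Op0, 16),
/// saving the constant-pool load for the mask.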
32950 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32951 const X86Subtarget &Subtarget) {
32952 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32953 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32954 EVT VT0 = Op0.getValueType();
32955 EVT VT1 = Op1.getValueType();
32957 if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
32958 return SDValue();
32960 APInt SplatVal;
32961 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
32962 !SplatVal.isMask())
32963 return SDValue();
32965 if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
32966 return SDValue();
32968 unsigned EltBitWidth = VT0.getScalarSizeInBits();
32969 if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
32970 return SDValue();
32972 SDLoc DL(N);
32973 unsigned ShiftVal = SplatVal.countTrailingOnes();
32974 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32975 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32976 return DAG.getBitcast(N->getValueType(0), Shift);
32977 }
32979 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32980 TargetLowering::DAGCombinerInfo &DCI,
32981 const X86Subtarget &Subtarget) {
32982 EVT VT = N->getValueType(0);
32984 // If this is SSE1 only convert to FAND to avoid scalarization.
32985 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
32986 return DAG.getBitcast(
32987 MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
32988 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
32989 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
32990 }
32992 if (DCI.isBeforeLegalizeOps())
32993 return SDValue();
32995 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32996 return R;
32998 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32999 return FPLogic;
33001 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
33002 return R;
33004 if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
33005 return ShiftRight;
33007 // Attempt to recursively combine a bitmask AND with shuffles.
33008 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
33009 SDValue Op(N, 0);
33010 if (SDValue Res = combineX86ShufflesRecursively(
33011 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
33012 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
33013 DCI.CombineTo(N, Res);
33014 return SDValue(N, 0);
33015 }
33016 }
33018 // Attempt to combine a scalar bitmask AND with an extracted shuffle.
33019 if ((VT.getScalarSizeInBits() % 8) == 0 &&
33020 N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33021 isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
33022 SDValue BitMask = N->getOperand(1);
33023 SDValue SrcVec = N->getOperand(0).getOperand(0);
33024 EVT SrcVecVT = SrcVec.getValueType();
33026 // Check that the constant bitmask masks whole bytes.
33027 APInt UndefElts;
33028 SmallVector<APInt, 64> EltBits;
33029 if (VT == SrcVecVT.getScalarType() &&
33030 N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
33031 getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
33032 llvm::all_of(EltBits, [](APInt M) {
33033 return M.isNullValue() || M.isAllOnesValue();
33034 })) {
33035 unsigned NumElts = SrcVecVT.getVectorNumElements();
33036 unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
33037 unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
33039 // Create a root shuffle mask from the byte mask and the extracted index.
33040 SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
33041 for (unsigned i = 0; i != Scale; ++i) {
33042 if (UndefElts[i])
33043 continue;
33044 int VecIdx = Scale * Idx + i;
33045 ShuffleMask[VecIdx] =
33046 EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
33047 }
33049 if (SDValue Shuffle = combineX86ShufflesRecursively(
33050 {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
33051 /*HasVarMask*/ false, DAG, DCI, Subtarget))
33052 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
33053 N->getOperand(0).getOperand(1));
33054 }
33055 }
33057 return SDValue();
33058 }
33060 // Try to fold:
33061 // (or (and (m, y), (pandn m, x)))
33062 // into:
33063 // (vselect m, x, y)
33064 // As a special case, try to fold:
33065 // (or (and (m, (sub 0, x)), (pandn m, x)))
33066 // into:
33067 // (sub (xor X, M), M)
33068 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
33069 const X86Subtarget &Subtarget) {
33070 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
33072 SDValue N0 = N->getOperand(0);
33073 SDValue N1 = N->getOperand(1);
33074 EVT VT = N->getValueType(0);
33076 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
33077 (VT.is256BitVector() && Subtarget.hasInt256())))
33078 return SDValue();
33080 // Canonicalize AND to LHS.
33081 if (N1.getOpcode() == ISD::AND)
33082 std::swap(N0, N1);
33084 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
33085 // ANDNP combine allows other combines to happen that prevent matching.
33086 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
33087 return SDValue();
33089 SDValue Mask = N1.getOperand(0);
33090 SDValue X = N1.getOperand(1);
33091 SDValue Y;
33092 if (N0.getOperand(0) == Mask)
33093 Y = N0.getOperand(1);
33094 if (N0.getOperand(1) == Mask)
33095 Y = N0.getOperand(0);
33097 // Check to see if the mask appeared in both the AND and ANDNP.
33098 if (!Y)
33099 return SDValue();
33101 // Validate that X, Y, and Mask are bitcasts, and see through them.
33102 Mask = peekThroughBitcasts(Mask);
33103 X = peekThroughBitcasts(X);
33104 Y = peekThroughBitcasts(Y);
33106 EVT MaskVT = Mask.getValueType();
33107 unsigned EltBits = MaskVT.getScalarSizeInBits();
33109 // TODO: Attempt to handle floating point cases as well?
33110 if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
33111 return SDValue();
33113 SDLoc DL(N);
33115 // Try to match:
33116 // (or (and (M, (sub 0, X)), (pandn M, X)))
33117 // which is a special case of vselect:
33118 // (vselect M, (sub 0, X), X)
33119 // Per:
33120 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
33121 // We know that, if fNegate is 0 or 1:
33122 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
33123 //
33124 // Here, we have a mask, M (all 1s or 0s), and, similarly, we know that:
33125 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
33126 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
33127 // This lets us transform our vselect to:
33128 // (add (xor X, M), (and M, 1))
33129 // And further to:
33130 // (sub (xor X, M), M)
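// Checking the two mask values: M == all-ones gives (sub (xor X, -1), -1)
// == (~X + 1) == -X, and M == 0 gives (sub (xor X, 0), 0) == X, matching
// (vselect M, (sub 0, X), X).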
33131 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
33132 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
33133 auto IsNegV = [](SDNode *N, SDValue V) {
33134 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
33135 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
33136 };
33137 SDValue V;
33138 if (IsNegV(Y.getNode(), X))
33139 V = X;
33140 else if (IsNegV(X.getNode(), Y))
33141 V = Y;
33143 if (V) {
33144 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
33145 SDValue SubOp2 = Mask;
33147 // If the negate was on the false side of the select, then
33148 // the operands of the SUB need to be swapped. PR 27251.
33149 // This is because the pattern being matched above is
33150 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
33151 // but if the pattern matched was
33152 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
33153 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
33154 // pattern also needs to be a negation of the replacement pattern above.
33155 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
33156 // sub accomplishes the negation of the replacement pattern.
33157 if (V == Y)
33158 std::swap(SubOp1, SubOp2);
33160 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
33161 return DAG.getBitcast(VT, Res);
33162 }
33163 }
33165 // PBLENDVB is only available on SSE 4.1.
33166 if (!Subtarget.hasSSE41())
33167 return SDValue();
33169 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
33171 X = DAG.getBitcast(BlendVT, X);
33172 Y = DAG.getBitcast(BlendVT, Y);
33173 Mask = DAG.getBitcast(BlendVT, Mask);
33174 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
33175 return DAG.getBitcast(VT, Mask);
33176 }
33178 // Helper function for combineOrCmpEqZeroToCtlzSrl
33179 // Transforms:
33180 // seteq(cmp x, 0)
33181 // into:
33182 // srl(ctlz x), log2(bitsize(x))
33183 // Input pattern is checked by caller.
33184 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
33185 SelectionDAG &DAG) {
33186 SDValue Cmp = Op.getOperand(1);
33187 EVT VT = Cmp.getOperand(0).getValueType();
33188 unsigned Log2b = Log2_32(VT.getSizeInBits());
33189 SDLoc dl(Op);
33190 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
33191 // The result of the shift is true or false, and on X86, the 32-bit
33192 // encoding of shr and lzcnt is more desirable.
33193 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
33194 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
33195 DAG.getConstant(Log2b, dl, VT));
33196 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
33197 }
33199 // Try to transform:
33200 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
33201 // into:
33202 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
33203 // Will also attempt to match more generic cases, eg:
33204 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
33205 // Only applies if the target supports the FastLZCNT feature.
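// For an i32 compare, (x == 0) maps to srl(ctlz(x), 5): ctlz(x) is 32
// (0b100000) exactly when x is zero, so bit 5 of the count is the result.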
33206 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
33207 TargetLowering::DAGCombinerInfo &DCI,
33208 const X86Subtarget &Subtarget) {
33209 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
33210 return SDValue();
33212 auto isORCandidate = [](SDValue N) {
33213 return (N->getOpcode() == ISD::OR && N->hasOneUse());
33214 };
33216 // Check the zero extend is extending to 32-bit or more. The code generated by
33217 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
33218 // instructions to clear the upper bits.
33219 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
33220 !isORCandidate(N->getOperand(0)))
33221 return SDValue();
33223 // Check the node matches: setcc(eq, cmp 0)
33224 auto isSetCCCandidate = [](SDValue N) {
33225 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
33226 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
33227 N->getOperand(1).getOpcode() == X86ISD::CMP &&
33228 isNullConstant(N->getOperand(1).getOperand(1)) &&
33229 N->getOperand(1).getValueType().bitsGE(MVT::i32);
33230 };
33232 SDNode *OR = N->getOperand(0).getNode();
33233 SDValue LHS = OR->getOperand(0);
33234 SDValue RHS = OR->getOperand(1);
33236 // Save nodes matching or(or, setcc(eq, cmp 0)).
33237 SmallVector<SDNode *, 2> ORNodes;
33238 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
33239 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
33240 ORNodes.push_back(OR);
33241 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
33242 LHS = OR->getOperand(0);
33243 RHS = OR->getOperand(1);
33244 }
33246 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
33247 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
33248 !isORCandidate(SDValue(OR, 0)))
33249 return SDValue();
33251 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
33252 // to
33253 // or(srl(ctlz),srl(ctlz)).
33254 // The dag combiner can then fold it into:
33255 // srl(or(ctlz, ctlz)).
33256 EVT VT = OR->getValueType(0);
33257 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
33258 SDValue Ret, NewRHS;
33259 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
33260 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
33262 if (!Ret)
33263 return SDValue();
33265 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
33266 while (ORNodes.size() > 0) {
33267 OR = ORNodes.pop_back_val();
33268 LHS = OR->getOperand(0);
33269 RHS = OR->getOperand(1);
33270 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
33271 if (RHS->getOpcode() == ISD::OR)
33272 std::swap(LHS, RHS);
33273 EVT VT = OR->getValueType(0);
33274 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
33275 if (!NewRHS)
33276 return SDValue();
33277 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
33278 }
33280 if (Ret)
33281 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
33283 return Ret;
33284 }
33286 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
33287 TargetLowering::DAGCombinerInfo &DCI,
33288 const X86Subtarget &Subtarget) {
33289 SDValue N0 = N->getOperand(0);
33290 SDValue N1 = N->getOperand(1);
33291 EVT VT = N->getValueType(0);
33293 // If this is SSE1 only convert to FOR to avoid scalarization.
33294 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
33295 return DAG.getBitcast(MVT::v4i32,
33296 DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
33297 DAG.getBitcast(MVT::v4f32, N0),
33298 DAG.getBitcast(MVT::v4f32, N1)));
33299 }
33301 if (DCI.isBeforeLegalizeOps())
33302 return SDValue();
33304 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
33305 return R;
33307 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
33308 return FPLogic;
33310 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
33311 return R;
33313 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
33314 return SDValue();
33316 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
33317 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
33319 // SHLD/SHRD instructions have lower register pressure, but on some
33320 // platforms they have higher latency than the equivalent
33321 // series of shifts/or that would otherwise be generated.
33322 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
33323 // have higher latencies and we are not optimizing for size.
33324 if (!OptForSize && Subtarget.isSHLDSlow())
33325 return SDValue();
33327 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
33328 std::swap(N0, N1);
33329 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
33330 return SDValue();
33331 if (!N0.hasOneUse() || !N1.hasOneUse())
33332 return SDValue();
33334 SDValue ShAmt0 = N0.getOperand(1);
33335 if (ShAmt0.getValueType() != MVT::i8)
33336 return SDValue();
33337 SDValue ShAmt1 = N1.getOperand(1);
33338 if (ShAmt1.getValueType() != MVT::i8)
33339 return SDValue();
33340 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
33341 ShAmt0 = ShAmt0.getOperand(0);
33342 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
33343 ShAmt1 = ShAmt1.getOperand(0);
33345 SDLoc DL(N);
33346 unsigned Opc = X86ISD::SHLD;
33347 SDValue Op0 = N0.getOperand(0);
33348 SDValue Op1 = N1.getOperand(0);
33349 if (ShAmt0.getOpcode() == ISD::SUB ||
33350 ShAmt0.getOpcode() == ISD::XOR) {
33351 Opc = X86ISD::SHRD;
33352 std::swap(Op0, Op1);
33353 std::swap(ShAmt0, ShAmt1);
33354 }
33356 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
33357 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
33358 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
33359 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
33360 unsigned Bits = VT.getSizeInBits();
33361 if (ShAmt1.getOpcode() == ISD::SUB) {
33362 SDValue Sum = ShAmt1.getOperand(0);
33363 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
33364 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
33365 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
33366 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
33367 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
33368 return DAG.getNode(Opc, DL, VT,
33369 Op0, Op1,
33370 DAG.getNode(ISD::TRUNCATE, DL,
33371 MVT::i8, ShAmt0));
33372 }
33373 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
33374 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
33375 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
33376 return DAG.getNode(Opc, DL, VT,
33377 N0.getOperand(0), N1.getOperand(0),
33378 DAG.getNode(ISD::TRUNCATE, DL,
33379 MVT::i8, ShAmt0));
33380 } else if (ShAmt1.getOpcode() == ISD::XOR) {
33381 SDValue Mask = ShAmt1.getOperand(1);
33382 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
33383 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
33384 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
33385 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
33386 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
33387 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
33388 if (Op1.getOpcode() == InnerShift &&
33389 isa<ConstantSDNode>(Op1.getOperand(1)) &&
33390 Op1.getConstantOperandVal(1) == 1) {
33391 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33392 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33393 }
33394 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
33395 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
33396 Op1.getOperand(0) == Op1.getOperand(1)) {
33397 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
33398 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
33399 }
33400 }
33401 }
33402 }
33404 return SDValue();
33405 }
33407 /// Try to turn tests against the signbit in the form of:
33408 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
33409 /// into:
33410 /// SETGT(X, -1)
33411 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
33412 // This is only worth doing if the output type is i8 or i1.
33413 EVT ResultType = N->getValueType(0);
33414 if (ResultType != MVT::i8 && ResultType != MVT::i1)
33415 return SDValue();
33417 SDValue N0 = N->getOperand(0);
33418 SDValue N1 = N->getOperand(1);
33420 // We should be performing an xor against a truncated shift.
33421 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
33422 return SDValue();
33424 // Make sure we are performing an xor against one.
33425 if (!isOneConstant(N1))
33426 return SDValue();
33428 // SetCC on x86 zero extends so only act on this if it's a logical shift.
33429 SDValue Shift = N0.getOperand(0);
33430 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
33431 return SDValue();
33433 // Make sure we are truncating from one of i16, i32 or i64.
33434 EVT ShiftTy = Shift.getValueType();
33435 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
33436 return SDValue();
33438 // Make sure the shift amount extracts the sign bit.
33439 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
33440 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
33441 return SDValue();
33443 // Create a greater-than comparison against -1.
33444 // N.B. Using SETGE against 0 works but we want a canonical looking
33445 // comparison, using SETGT matches up with what TranslateX86CC does.
33446 SDLoc DL(N);
33447 SDValue ShiftOp = Shift.getOperand(0);
33448 EVT ShiftOpTy = ShiftOp.getValueType();
33449 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33450 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
33451 *DAG.getContext(), ResultType);
33452 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
33453 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
33454 if (SetCCResultType != ResultType)
33455 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
33456 return Cond;
33457 }
33459 /// Turn vector tests of the signbit in the form of:
33460 /// xor (sra X, elt_size(X)-1), -1
33461 /// into:
33462 /// pcmpgt X, -1
33463 ///
33464 /// This should be called before type legalization because the pattern may not
33465 /// persist after that.
33466 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
33467 const X86Subtarget &Subtarget) {
33468 EVT VT = N->getValueType(0);
33469 if (!VT.isSimple())
33470 return SDValue();
33472 switch (VT.getSimpleVT().SimpleTy) {
33473 default: return SDValue();
33474 case MVT::v16i8:
33475 case MVT::v8i16:
33476 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
33477 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
33478 case MVT::v32i8:
33479 case MVT::v16i16:
33480 case MVT::v8i32:
33481 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
33482 }
33484 // There must be a shift right algebraic before the xor, and the xor must be a
33485 // 'not' operation.
33486 SDValue Shift = N->getOperand(0);
33487 SDValue Ones = N->getOperand(1);
33488 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
33489 !ISD::isBuildVectorAllOnes(Ones.getNode()))
33490 return SDValue();
33492 // The shift should be smearing the sign bit across each vector element.
33493 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
33494 if (!ShiftBV)
33495 return SDValue();
33497 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
33498 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
33499 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
33500 return SDValue();
33502 // Create a greater-than comparison against -1. We don't use the more obvious
33503 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
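// "X > -1" is the same predicate as "X >= 0", i.e. the complement of the
// sign bit that (xor (sra X, elt_size-1), -1) computes per element.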
33504 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
33505 }
33507 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
33508 /// is valid for the given \p Subtarget.
33509 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
33510 const X86Subtarget &Subtarget) {
33511 if (!Subtarget.hasAVX512())
33512 return false;
33514 // FIXME: Scalar type may be supported if we move it to vector register.
33515 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
33516 return false;
33518 EVT SrcElVT = SrcVT.getScalarType();
33519 EVT DstElVT = DstVT.getScalarType();
33520 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
33521 return false;
33522 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
33523 return false;
33524 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
33525 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
33527 return false;
33528 }
33529 /// Detect a pattern of truncation with saturation:
33530 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
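/// For example, for a v16i8 destination, (truncate (umin x, 255)) matches
/// and x is returned as the value to truncate with unsigned saturation.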
33531 /// Return the source value to be truncated or SDValue() if the pattern was not
33532 /// detected.
33533 static SDValue detectUSatPattern(SDValue In, EVT VT) {
33534 if (In.getOpcode() != ISD::UMIN)
33535 return SDValue();
33537 // Saturation with truncation. We truncate from InVT to VT.
33538 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
33539 "Unexpected types for truncate operation");
33542 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
33543 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
33544 // the element size of the destination type.
33545 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
33551 /// Detect a pattern of truncation with saturation:
33552 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
33553 /// The types should allow to use VPMOVUS* instruction on AVX512.
33554 /// Return the source value to be truncated or SDValue() if the pattern was not
33555 /// detected.
33556 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
33557 const X86Subtarget &Subtarget) {
33558 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33559 return SDValue();
33560 return detectUSatPattern(In, VT);
33561 }
33563 static SDValue
33564 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
33565 const X86Subtarget &Subtarget) {
33566 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33567 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
33568 return SDValue();
33569 if (auto USatVal = detectUSatPattern(In, VT))
33570 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
33571 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
33573 return SDValue();
33574 }
33575 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
33576 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
33577 /// X86ISD::AVG instruction.
33578 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
33579 const X86Subtarget &Subtarget,
33580 const SDLoc &DL) {
33581 if (!VT.isVector() || !VT.isSimple())
33582 return SDValue();
33583 EVT InVT = In.getValueType();
33584 unsigned NumElems = VT.getVectorNumElements();
33586 EVT ScalarVT = VT.getVectorElementType();
33587 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
33588 isPowerOf2_32(NumElems)))
33589 return SDValue();
33591 // InScalarVT is the intermediate type in AVG pattern and it should be greater
33592 // than the original input type (i8/i16).
33593 EVT InScalarVT = InVT.getVectorElementType();
33594 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
33595 return SDValue();
33597 if (!Subtarget.hasSSE2())
33598 return SDValue();
33599 if (Subtarget.hasBWI()) {
33600 if (VT.getSizeInBits() > 512)
33601 return SDValue();
33602 } else if (Subtarget.hasAVX2()) {
33603 if (VT.getSizeInBits() > 256)
33604 return SDValue();
33605 } else {
33606 if (VT.getSizeInBits() > 128)
33607 return SDValue();
33608 }
33610 // Detect the following pattern:
33611 //
33612 // %1 = zext <N x i8> %a to <N x i32>
33613 // %2 = zext <N x i8> %b to <N x i32>
33614 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
33615 // %4 = add nuw nsw <N x i32> %3, %2
33616 // %5 = lshr <N x i32> %4, <i32 1 x N>
33617 // %6 = trunc <N x i32> %5 to <N x i8>
33618 //
33619 // In AVX512, the last instruction can also be a trunc store.
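// Each lane thus computes (a + b + 1) >> 1, a rounding average, which is
// exactly the semantics of the pavgb/pavgw instructions behind X86ISD::AVG.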
33621 if (In.getOpcode() != ISD::SRL)
33622 return SDValue();
33624 // A lambda checking the given SDValue is a constant vector and each element
33625 // is in the range [Min, Max].
33626 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
33627 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
33628 if (!BV || !BV->isConstant())
33629 return false;
33630 for (SDValue Op : V->ops()) {
33631 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
33632 if (!C)
33633 return false;
33634 uint64_t Val = C->getZExtValue();
33635 if (Val < Min || Val > Max)
33636 return false;
33637 }
33638 return true;
33639 };
33641 // Check if each element of the vector is right-shifted by one.
33642 auto LHS = In.getOperand(0);
33643 auto RHS = In.getOperand(1);
33644 if (!IsConstVectorInRange(RHS, 1, 1))
33645 return SDValue();
33646 if (LHS.getOpcode() != ISD::ADD)
33647 return SDValue();
33649 // Detect a pattern of a + b + 1 where the order doesn't matter.
33650 SDValue Operands[3];
33651 Operands[0] = LHS.getOperand(0);
33652 Operands[1] = LHS.getOperand(1);
33654 // Take care of the case when one of the operands is a constant vector whose
33655 // element is in the range [1, 256].
33656 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
33657 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
33658 Operands[0].getOperand(0).getValueType() == VT) {
33659 // The pattern is detected. Subtract one from the constant vector, then
33660 // demote it and emit X86ISD::AVG instruction.
33661 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
33662 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
33663 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
33664 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33665 Operands[1]);
33666 }
33668 if (Operands[0].getOpcode() == ISD::ADD)
33669 std::swap(Operands[0], Operands[1]);
33670 else if (Operands[1].getOpcode() != ISD::ADD)
33671 return SDValue();
33672 Operands[2] = Operands[1].getOperand(0);
33673 Operands[1] = Operands[1].getOperand(1);
33675 // Now we have three operands of two additions. Check that one of them is a
33676 // constant vector with ones, and the other two are promoted from i8/i16.
33677 for (int i = 0; i < 3; ++i) {
33678 if (!IsConstVectorInRange(Operands[i], 1, 1))
33679 continue;
33680 std::swap(Operands[i], Operands[2]);
33682 // Check if Operands[0] and Operands[1] are results of type promotion.
33683 for (int j = 0; j < 2; ++j)
33684 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
33685 Operands[j].getOperand(0).getValueType() != VT)
33686 return SDValue();
33688 // The pattern is detected, emit X86ISD::AVG instruction.
33689 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
33690 Operands[1].getOperand(0));
33691 }
33693 return SDValue();
33694 }
33696 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
33697 TargetLowering::DAGCombinerInfo &DCI,
33698 const X86Subtarget &Subtarget) {
33699 LoadSDNode *Ld = cast<LoadSDNode>(N);
33700 EVT RegVT = Ld->getValueType(0);
33701 EVT MemVT = Ld->getMemoryVT();
33702 SDLoc dl(Ld);
33703 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33705 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
33706 // into two 16-byte operations. Also split non-temporal aligned loads on
33707 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
33708 ISD::LoadExtType Ext = Ld->getExtensionType();
33709 bool Fast;
33710 unsigned AddressSpace = Ld->getAddressSpace();
33711 unsigned Alignment = Ld->getAlignment();
33712 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
33713 Ext == ISD::NON_EXTLOAD &&
33714 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
33715 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
33716 AddressSpace, Alignment, &Fast) && !Fast))) {
33717 unsigned NumElems = RegVT.getVectorNumElements();
33718 if (NumElems < 2)
33719 return SDValue();
33721 SDValue Ptr = Ld->getBasePtr();
33723 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
33724 NumElems / 2);
33725 SDValue Load1 =
33726 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
33727 Alignment, Ld->getMemOperand()->getFlags());
33729 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
33730 SDValue Load2 =
33731 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
33732 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
33733 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
33734 Load1.getValue(1),
33735 Load2.getValue(1));
33737 SDValue NewVec = DAG.getUNDEF(RegVT);
33738 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
33739 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
33740 return DCI.CombineTo(N, NewVec, TF, true);
33741 }
33743 return SDValue();
33744 }
33746 /// If V is a build vector of boolean constants and exactly one of those
33747 /// constants is true, return the operand index of that true element.
33748 /// Otherwise, return -1.
33749 static int getOneTrueElt(SDValue V) {
33750 // This needs to be a build vector of booleans.
33751 // TODO: Checking for the i1 type matches the IR definition for the mask,
33752 // but the mask check could be loosened to i8 or other types. That might
33753 // also require checking more than 'allOnesValue'; eg, the x86 HW
33754 // instructions only require that the MSB is set for each mask element.
33755 // The ISD::MSTORE comments/definition do not specify how the mask operand
33756 // is formatted.
33757 auto *BV = dyn_cast<BuildVectorSDNode>(V);
33758 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
33759 return -1;
33761 int TrueIndex = -1;
33762 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
33763 for (unsigned i = 0; i < NumElts; ++i) {
33764 const SDValue &Op = BV->getOperand(i);
33765 if (Op.isUndef())
33766 continue;
33767 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
33768 if (!ConstNode)
33769 return -1;
33770 if (ConstNode->getAPIntValue().isAllOnesValue()) {
33771 // If we already found a one, this is too many.
33772 if (TrueIndex >= 0)
33773 return -1;
33774 TrueIndex = i;
33775 }
33776 }
33777 return TrueIndex;
33778 }
33780 /// Given a masked memory load/store operation, return true if it has one mask
33781 /// bit set. If it has one mask bit set, then also return the memory address of
33782 /// the scalar element to load/store, the vector index to insert/extract that
33783 /// scalar element, and the alignment for the scalar memory access.
33784 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
33785 SelectionDAG &DAG, SDValue &Addr,
33786 SDValue &Index, unsigned &Alignment) {
33787 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
33788 if (TrueMaskElt < 0)
33791 // Get the address of the one scalar element that is specified by the mask
33792 // using the appropriate offset from the base pointer.
33793 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
33794 Addr = MaskedOp->getBasePtr();
33795 if (TrueMaskElt != 0) {
33796 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
33797 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
33800 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
33801 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
33805 /// If exactly one element of the mask is set for a non-extending masked load,
33806 /// it can be reduced to a scalar load and a vector insert.
33807 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33808 /// mask have already been optimized in IR, so we don't bother with those here.
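/// For example, a v4f32 masked load whose mask is (v4i1 BUILD_VECTOR 0, 0, 1, 0)
/// becomes a scalar f32 load from ptr+8 followed by an insertion into the
/// pass-through vector at index 2.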
33810 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
33811 TargetLowering::DAGCombinerInfo &DCI) {
33812 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33813 // However, some target hooks may need to be added to know when the transform
33814 // is profitable. Endianness would also have to be considered.
33816 SDValue Addr, VecIndex;
33817 unsigned Alignment;
33818 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
33821 // Load the one scalar element that is specified by the mask using the
33822 // appropriate offset from the base pointer.
33824 EVT VT = ML->getValueType(0);
33825 EVT EltVT = VT.getVectorElementType();
33827 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
33828 Alignment, ML->getMemOperand()->getFlags());
33830 // Insert the loaded element into the appropriate place in the vector.
33831 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
33833 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
33837 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
33838 TargetLowering::DAGCombinerInfo &DCI) {
33839 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
33843 EVT VT = ML->getValueType(0);
33845 // If we are loading the first and last elements of a vector, it is safe and
33846 // always faster to load the whole vector. Replace the masked load with a
33847 // vector load and select.
33848 unsigned NumElts = VT.getVectorNumElements();
33849 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
33850 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
33851 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
33852 if (LoadFirstElt && LoadLastElt) {
33853 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33854 ML->getMemOperand());
33855 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
33856 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
33859 // Convert a masked load with a constant mask into a masked load and a select.
33860 // This allows the select operation to use a faster kind of select instruction
33861 // (for example, vblendvps -> vblendps).
33863 // Don't try this if the pass-through operand is already undefined. That would
33864 // cause an infinite loop because that's what we're about to create.
33865 if (ML->getSrc0().isUndef())
33868 // The new masked load has an undef pass-through operand. The select uses the
33869 // original pass-through operand.
33870 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33871 ML->getMask(), DAG.getUNDEF(VT),
33872 ML->getMemoryVT(), ML->getMemOperand(),
33873 ML->getExtensionType());
33874 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
33876 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
33879 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
33880 TargetLowering::DAGCombinerInfo &DCI,
33881 const X86Subtarget &Subtarget) {
33882 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
33884 // TODO: An expanding load with a constant mask may be optimized as well.
33885 if (Mld->isExpandingLoad())
33888 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
33889 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
33891 // TODO: Do some AVX512 subsets benefit from this transform?
33892 if (!Subtarget.hasAVX512())
33893 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
33897 if (Mld->getExtensionType() != ISD::SEXTLOAD)
33900 // Resolve extending loads.
33901 EVT VT = Mld->getValueType(0);
33902 unsigned NumElems = VT.getVectorNumElements();
33903 EVT LdVT = Mld->getMemoryVT();
33906 assert(LdVT != VT && "Cannot extend to the same type");
33907 unsigned ToSz = VT.getScalarSizeInBits();
33908 unsigned FromSz = LdVT.getScalarSizeInBits();
33909 // From/To sizes and ElemCount must be powers of two.
33910 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33911 "Unexpected size for extending masked load");
33913 unsigned SizeRatio = ToSz / FromSz;
33914 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
33916 // Create a type on which we perform the shuffle.
33917 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33918 LdVT.getScalarType(), NumElems*SizeRatio);
33919 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33921 // Convert Src0 value.
33922 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
33923 if (!Mld->getSrc0().isUndef()) {
33924 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33925 for (unsigned i = 0; i != NumElems; ++i)
33926 ShuffleVec[i] = i * SizeRatio;
33928 // Can't shuffle using an illegal type.
33929 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33930 "WideVecVT should be legal");
33931 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33932 DAG.getUNDEF(WideVecVT), ShuffleVec);
33935 // Prepare the new mask.
33937 SDValue Mask = Mld->getMask();
33938 if (Mask.getValueType() == VT) {
33939 // Mask and original value have the same type.
33940 NewMask = DAG.getBitcast(WideVecVT, Mask);
33941 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33942 for (unsigned i = 0; i != NumElems; ++i)
33943 ShuffleVec[i] = i * SizeRatio;
33944 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33945 ShuffleVec[i] = NumElems * SizeRatio;
33946 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33947 DAG.getConstant(0, dl, WideVecVT),
33950 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33951 unsigned WidenNumElts = NumElems*SizeRatio;
33952 unsigned MaskNumElts = VT.getVectorNumElements();
33953 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33956 unsigned NumConcat = WidenNumElts / MaskNumElts;
33957 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33958 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
33960 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33963 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33964 Mld->getBasePtr(), NewMask, WideSrc0,
33965 Mld->getMemoryVT(), Mld->getMemOperand(),
33967 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33968 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33971 /// If exactly one element of the mask is set for a non-truncating masked store,
33972 /// it can be reduced to a vector extract and a scalar store.
33973 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33974 /// mask have already been optimized in IR, so we don't bother with those here.
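/// For example, a v4f32 masked store whose mask is (v4i1 BUILD_VECTOR 0, 1, 0, 0)
/// becomes an extract of element 1 followed by a scalar f32 store to ptr+4.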
33975 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33976 SelectionDAG &DAG) {
33977 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33978 // However, some target hooks may need to be added to know when the transform
33979 // is profitable. Endianness would also have to be considered.
33981 SDValue Addr, VecIndex;
33982 unsigned Alignment;
33983 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33986 // Extract the one scalar element that is actually being stored.
33988 EVT VT = MS->getValue().getValueType();
33989 EVT EltVT = VT.getVectorElementType();
33990 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33991 MS->getValue(), VecIndex);
33993 // Store that element at the appropriate offset from the base pointer.
33994 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33995 Alignment, MS->getMemOperand()->getFlags());
33998 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33999 const X86Subtarget &Subtarget) {
34000 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
34002 if (Mst->isCompressingStore())
34005 if (!Mst->isTruncatingStore()) {
34006 if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
34007 return ScalarStore;
34009 // If the mask is checking (0 > X), we're creating a vector with all-zeros
34010 // or all-ones elements based on the sign bits of X. AVX1 masked store only
34011 // cares about the sign bit of each mask element, so eliminate the compare:
34012 // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
34013 // Note that by waiting to match an x86-specific PCMPGT node, we're
34014 // eliminating potentially more complex matching of a setcc node which has
34015 // a full range of predicates.
34016 SDValue Mask = Mst->getMask();
34017 if (Mask.getOpcode() == X86ISD::PCMPGT &&
34018 ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
34019 assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
34020 "Unexpected type for PCMPGT");
34021 return DAG.getMaskedStore(
34022 Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
34023 Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
34026 // TODO: AVX512 targets should also be able to simplify something like the
34027 // pattern above, but that pattern will be different. It will either need to
34028 // match setcc more generally or match PCMPGTM later (in tablegen?).
34033 // Resolve truncating stores.
34034 EVT VT = Mst->getValue().getValueType();
34035 unsigned NumElems = VT.getVectorNumElements();
34036 EVT StVT = Mst->getMemoryVT();
34039 assert(StVT != VT && "Cannot truncate to the same type");
34040 unsigned FromSz = VT.getScalarSizeInBits();
34041 unsigned ToSz = StVT.getScalarSizeInBits();
34043 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34045 // The truncating store is legal in some cases. For example,
34046 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
34047 // are dedicated truncating-store instructions on AVX512.
34048 // In those cases we don't need any further transformations.
34049 if (TLI.isTruncStoreLegal(VT, StVT))
34052 // From/To sizes and ElemCount must be powers of two.
34053 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
34054 "Unexpected size for truncating masked store");
34055 // We are going to use the original vector element type for storing, so the
34056 // accumulated width of the smaller vector elements must be a multiple of the store size.
34057 assert (((NumElems * FromSz) % ToSz) == 0 &&
34058 "Unexpected ratio for truncating masked store");
34060 unsigned SizeRatio = FromSz / ToSz;
34061 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34063 // Create a type on which we perform the shuffle.
34064 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34065 StVT.getScalarType(), NumElems*SizeRatio);
34067 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34069 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
34070 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
34071 for (unsigned i = 0; i != NumElems; ++i)
34072 ShuffleVec[i] = i * SizeRatio;
34074 // Can't shuffle using an illegal type.
34075 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
34076 "WideVecVT should be legal");
34078 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34079 DAG.getUNDEF(WideVecVT),
34083 SDValue Mask = Mst->getMask();
34084 if (Mask.getValueType() == VT) {
34085 // Mask and original value have the same type.
34086 NewMask = DAG.getBitcast(WideVecVT, Mask);
34087 for (unsigned i = 0; i != NumElems; ++i)
34088 ShuffleVec[i] = i * SizeRatio;
34089 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
34090 ShuffleVec[i] = NumElems*SizeRatio;
34091 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
34092 DAG.getConstant(0, dl, WideVecVT),
34095 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
34096 unsigned WidenNumElts = NumElems*SizeRatio;
34097 unsigned MaskNumElts = VT.getVectorNumElements();
34098 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
34101 unsigned NumConcat = WidenNumElts / MaskNumElts;
34102 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
34103 SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
34105 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
34108 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
34109 Mst->getBasePtr(), NewMask, StVT,
34110 Mst->getMemOperand(), false);
34113 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
34114 const X86Subtarget &Subtarget) {
34115 StoreSDNode *St = cast<StoreSDNode>(N);
34116 EVT VT = St->getValue().getValueType();
34117 EVT StVT = St->getMemoryVT();
34119 SDValue StoredVal = St->getOperand(1);
34120 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34122 // If we are saving a concatenation of two XMM registers and 32-byte stores
34123 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
34125 unsigned AddressSpace = St->getAddressSpace();
34126 unsigned Alignment = St->getAlignment();
34127 if (VT.is256BitVector() && StVT == VT &&
34128 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
34129 AddressSpace, Alignment, &Fast) &&
34131 unsigned NumElems = VT.getVectorNumElements();
34135 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
34136 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
34138 SDValue Ptr0 = St->getBasePtr();
34139 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
34142 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
34143 Alignment, St->getMemOperand()->getFlags());
34145 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
34146 std::min(16U, Alignment), St->getMemOperand()->getFlags());
34147 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
34150 // Optimize a truncating store (of multiple scalars) into a shuffle and a store.
34151 // First, pack all of the elements into the low part of one register. Next,
34152 // store to memory in fewer, wider chunks.
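// For example (illustrative), a v8i16 -> v8i8 truncating store is rewritten as
// a shuffle that packs the eight i8 values into the low 64 bits of a v16i8
// register, followed by a single 64-bit store (i64, or f64 on 32-bit targets)
// of that low half.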
34153 if (St->isTruncatingStore() && VT.isVector()) {
34154 // Check if we can detect an AVG pattern from the truncation. If yes,
34155 // replace the trunc store by a normal store of the X86ISD::AVG result.
34157 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
34159 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
34160 St->getPointerInfo(), St->getAlignment(),
34161 St->getMemOperand()->getFlags());
34164 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
34165 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
34166 dl, Val, St->getBasePtr(),
34167 St->getMemoryVT(), St->getMemOperand(), DAG);
34169 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34170 unsigned NumElems = VT.getVectorNumElements();
34171 assert(StVT != VT && "Cannot truncate to the same type");
34172 unsigned FromSz = VT.getScalarSizeInBits();
34173 unsigned ToSz = StVT.getScalarSizeInBits();
34175 // The truncating store is legal in some cases. For example,
34176 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
34177 // are dedicated truncating-store instructions on AVX512.
34178 // In those cases we don't need any further transformations.
34179 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
34182 // From/To sizes and ElemCount must be powers of two.
34183 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
34184 // We are going to use the original vector element type for storing, so the
34185 // accumulated width of the smaller vector elements must be a multiple of the store size.
34186 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
34188 unsigned SizeRatio = FromSz / ToSz;
34190 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
34192 // Create a type on which we perform the shuffle
34193 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
34194 StVT.getScalarType(), NumElems*SizeRatio);
34196 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
34198 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
34199 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
34200 for (unsigned i = 0; i != NumElems; ++i)
34201 ShuffleVec[i] = i * SizeRatio;
34203 // Can't shuffle using an illegal type.
34204 if (!TLI.isTypeLegal(WideVecVT))
34207 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
34208 DAG.getUNDEF(WideVecVT),
34210 // At this point all of the data is stored at the bottom of the
34211 // register. We now need to write it out to memory.
34213 // Find the largest legal store unit.
34214 MVT StoreType = MVT::i8;
34215 for (MVT Tp : MVT::integer_valuetypes()) {
34216 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
34220 // On 32-bit systems, we can't store 64-bit integers. Try bitcasting to f64.
34221 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
34222 (64 <= NumElems * ToSz))
34223 StoreType = MVT::f64;
34225 // Bitcast the original vector into a vector of store-size units
34226 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
34227 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
34228 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
34229 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
34230 SmallVector<SDValue, 8> Chains;
34231 SDValue Ptr = St->getBasePtr();
34233 // Perform one or more big stores into memory.
34234 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
34235 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
34236 StoreType, ShuffWide,
34237 DAG.getIntPtrConstant(i, dl));
34239 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
34240 St->getAlignment(), St->getMemOperand()->getFlags());
34241 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
34242 Chains.push_back(Ch);
34245 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
34248 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
34249 // the FP state in cases where an EMMS may be missing.
34250 // A preferable solution to the general problem is to figure out the right
34251 // places to insert EMMS. This qualifies as a quick hack.
34253 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
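// For example, in 32-bit mode with SSE2, (store (i64 (load addr)), addr2)
// becomes a single f64 load/store pair instead of two i32 load/store pairs.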
34254 if (VT.getSizeInBits() != 64)
34257 const Function *F = DAG.getMachineFunction().getFunction();
34258 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
34260 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
34261 if ((VT.isVector() ||
34262 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
34263 isa<LoadSDNode>(St->getValue()) &&
34264 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
34265 St->getChain().hasOneUse() && !St->isVolatile()) {
34266 SDNode* LdVal = St->getValue().getNode();
34267 LoadSDNode *Ld = nullptr;
34268 int TokenFactorIndex = -1;
34269 SmallVector<SDValue, 8> Ops;
34270 SDNode* ChainVal = St->getChain().getNode();
34271 // Must be a store of a load. We currently handle two cases: either the load
34272 // is a direct child, or it is behind an intervening TokenFactor. It is
34273 // possible to dig deeper under nested TokenFactors.
34274 if (ChainVal == LdVal)
34275 Ld = cast<LoadSDNode>(St->getChain());
34276 else if (St->getValue().hasOneUse() &&
34277 ChainVal->getOpcode() == ISD::TokenFactor) {
34278 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
34279 if (ChainVal->getOperand(i).getNode() == LdVal) {
34280 TokenFactorIndex = i;
34281 Ld = cast<LoadSDNode>(St->getValue());
34283 Ops.push_back(ChainVal->getOperand(i));
34287 if (!Ld || !ISD::isNormalLoad(Ld))
34290 // If this is not the MMX case, i.e. we are just turning i64 load/store
34291 // into f64 load/store, avoid the transformation if there are multiple
34292 // uses of the loaded value.
34293 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
34298 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
34299 // Otherwise, if it's legal to use f64 SSE instructions, use an f64 load/store pair.
34301 if (Subtarget.is64Bit() || F64IsLegal) {
34302 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
34303 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
34304 Ld->getPointerInfo(), Ld->getAlignment(),
34305 Ld->getMemOperand()->getFlags());
34306 // Make sure new load is placed in same chain order.
34307 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
34308 if (TokenFactorIndex >= 0) {
34309 Ops.push_back(NewChain);
34310 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
34312 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
34313 St->getPointerInfo(), St->getAlignment(),
34314 St->getMemOperand()->getFlags());
34317 // Otherwise, lower to two pairs of 32-bit loads / stores.
34318 SDValue LoAddr = Ld->getBasePtr();
34319 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
34321 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
34322 Ld->getPointerInfo(), Ld->getAlignment(),
34323 Ld->getMemOperand()->getFlags());
34324 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
34325 Ld->getPointerInfo().getWithOffset(4),
34326 MinAlign(Ld->getAlignment(), 4),
34327 Ld->getMemOperand()->getFlags());
34328 // Make sure new loads are placed in same chain order.
34329 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
34330 NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
34332 if (TokenFactorIndex >= 0) {
34333 Ops.push_back(NewChain);
34334 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
34337 LoAddr = St->getBasePtr();
34338 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
34341 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
34342 St->getAlignment(), St->getMemOperand()->getFlags());
34343 SDValue HiSt = DAG.getStore(
34344 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
34345 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
34346 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
34349 // This is similar to the above case, but here we handle a scalar 64-bit
34350 // integer store that is extracted from a vector on a 32-bit target.
34351 // If we have SSE2, then we can treat it like a floating-point double
34352 // to get past legalization. The execution dependencies fixup pass will
34353 // choose the optimal machine instruction for the store if this really is
34354 // an integer or v2f32 rather than an f64.
34355 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
34356 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
34357 SDValue OldExtract = St->getOperand(1);
34358 SDValue ExtOp0 = OldExtract.getOperand(0);
34359 unsigned VecSize = ExtOp0.getValueSizeInBits();
34360 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
34361 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
34362 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
34363 BitCast, OldExtract.getOperand(1));
34364 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
34365 St->getPointerInfo(), St->getAlignment(),
34366 St->getMemOperand()->getFlags());
34372 /// Return 'true' if this vector operation is "horizontal"
34373 /// and return the operands for the horizontal operation in LHS and RHS. A
34374 /// horizontal operation performs the binary operation on successive elements
34375 /// of its first operand, then on successive elements of its second operand,
34376 /// returning the resulting values in a vector. For example, if
34377 /// A = < float a0, float a1, float a2, float a3 >
34379 /// B = < float b0, float b1, float b2, float b3 >
34380 /// then the result of doing a horizontal operation on A and B is
34381 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
34382 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
34383 /// A horizontal-op B, for some already available A and B, and if so then LHS is
34384 /// set to A, RHS to B, and the routine returns 'true'.
34385 /// Note that the binary operation should have the property that if one of the
34386 /// operands is UNDEF then the result is UNDEF.
34387 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
34388 // Look for the following pattern: if
34389 // A = < float a0, float a1, float a2, float a3 >
34390 // B = < float b0, float b1, float b2, float b3 >
34392 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
34393 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
34394 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
34395 // which is A horizontal-op B.
34397 // At least one of the operands should be a vector shuffle.
34398 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
34399 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
34402 MVT VT = LHS.getSimpleValueType();
34404 assert((VT.is128BitVector() || VT.is256BitVector()) &&
34405 "Unsupported vector type for horizontal add/sub");
34407 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
34408 // operate independently on 128-bit lanes.
34409 unsigned NumElts = VT.getVectorNumElements();
34410 unsigned NumLanes = VT.getSizeInBits()/128;
34411 unsigned NumLaneElts = NumElts / NumLanes;
34412 assert((NumLaneElts % 2 == 0) &&
34413 "Vector type should have an even number of elements in each lane");
34414 unsigned HalfLaneElts = NumLaneElts/2;
34416 // View LHS in the form
34417 // LHS = VECTOR_SHUFFLE A, B, LMask
34418 // If LHS is not a shuffle then pretend it is the shuffle
34419 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
34420 // NOTE: in what follows a default initialized SDValue represents an UNDEF of type VT.
34423 SmallVector<int, 16> LMask(NumElts);
34424 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34425 if (!LHS.getOperand(0).isUndef())
34426 A = LHS.getOperand(0);
34427 if (!LHS.getOperand(1).isUndef())
34428 B = LHS.getOperand(1);
34429 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
34430 std::copy(Mask.begin(), Mask.end(), LMask.begin());
34432 if (!LHS.isUndef())
34434 for (unsigned i = 0; i != NumElts; ++i)
34438 // Likewise, view RHS in the form
34439 // RHS = VECTOR_SHUFFLE C, D, RMask
34441 SmallVector<int, 16> RMask(NumElts);
34442 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
34443 if (!RHS.getOperand(0).isUndef())
34444 C = RHS.getOperand(0);
34445 if (!RHS.getOperand(1).isUndef())
34446 D = RHS.getOperand(1);
34447 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
34448 std::copy(Mask.begin(), Mask.end(), RMask.begin());
34450 if (!RHS.isUndef())
34452 for (unsigned i = 0; i != NumElts; ++i)
34456 // Check that the shuffles are both shuffling the same vectors.
34457 if (!(A == C && B == D) && !(A == D && B == C))
34460 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
34461 if (!A.getNode() && !B.getNode())
34464 // If A and B occur in reverse order in RHS, then "swap" them (which means
34465 // rewriting the mask).
34467 ShuffleVectorSDNode::commuteMask(RMask);
34469 // At this point LHS and RHS are equivalent to
34470 // LHS = VECTOR_SHUFFLE A, B, LMask
34471 // RHS = VECTOR_SHUFFLE A, B, RMask
34472 // Check that the masks correspond to performing a horizontal operation.
34473 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
34474 for (unsigned i = 0; i != NumLaneElts; ++i) {
34475 int LIdx = LMask[i+l], RIdx = RMask[i+l];
34477 // Ignore any UNDEF components.
34478 if (LIdx < 0 || RIdx < 0 ||
34479 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
34480 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
34483 // Check that successive elements are being operated on. If not, this is
34484 // not a horizontal operation.
34485 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
34486 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
34487 if (!(LIdx == Index && RIdx == Index + 1) &&
34488 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
34493 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
34494 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
34498 /// Do target-specific dag combines on floating-point adds/subs.
34499 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
34500 const X86Subtarget &Subtarget) {
34501 EVT VT = N->getValueType(0);
34502 SDValue LHS = N->getOperand(0);
34503 SDValue RHS = N->getOperand(1);
34504 bool IsFadd = N->getOpcode() == ISD::FADD;
34505 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
34507 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
34508 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
34509 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
34510 isHorizontalBinOp(LHS, RHS, IsFadd)) {
34511 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
34512 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
34517 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
34519 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
34520 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
34521 const X86Subtarget &Subtarget,
34523 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
34524 SDValue Src = N->getOperand(0);
34525 unsigned Opcode = Src.getOpcode();
34526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34528 EVT VT = N->getValueType(0);
34529 EVT SrcVT = Src.getValueType();
34531 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
34532 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
34534 // Repeated operand, so we are only trading one output truncation for
34535 // one input truncation.
34539 // See if either operand has been extended from a smaller/equal size to
34540 // the truncation size, allowing a truncation to combine with the extend.
34541 unsigned Opcode0 = Op0.getOpcode();
34542 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
34543 Opcode0 == ISD::ZERO_EXTEND) &&
34544 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
34547 unsigned Opcode1 = Op1.getOpcode();
34548 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
34549 Opcode1 == ISD::ZERO_EXTEND) &&
34550 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
34553 // See if either operand is a single use constant which can be constant folded.
34555 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
34556 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
34557 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
34558 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
34561 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
34562 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
34563 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
34564 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
34567 // Don't combine if the operation has other uses.
34568 if (!N->isOnlyUserOf(Src.getNode()))
34571 // Only support vector truncation for now.
34572 // TODO: i64 scalar math would benefit as well.
34573 if (!VT.isVector())
34576 // In most cases it's only worth pre-truncating if we face just the cost
34577 // of one truncation,
34578 // i.e. if one of the inputs will constant fold or the input is repeated.
34583 SDValue Op0 = Src.getOperand(0);
34584 SDValue Op1 = Src.getOperand(1);
34585 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
34586 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34587 return TruncateArithmetic(Op0, Op1);
34592 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
34593 // better to truncate if we have the chance.
34594 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
34595 !TLI.isOperationLegal(Opcode, SrcVT))
34596 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
34599 // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
34600 SDValue Op0 = Src.getOperand(0);
34601 SDValue Op1 = Src.getOperand(1);
34602 if (TLI.isOperationLegal(Opcode, VT) &&
34603 IsRepeatedOpOrFreeTruncation(Op0, Op1))
34604 return TruncateArithmetic(Op0, Op1);
34612 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
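/// For example (illustrative), truncating 2 x v4i32 to v8i16 (given SSE4.1):
/// each input is first masked with 0xFFFF so that PACKUS's unsigned saturation
/// can never clip a value, and a single PACKUSDW then produces the packed
/// v8i16 result.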
34614 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
34615 SmallVector<SDValue, 8> &Regs) {
34616 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
34617 Regs[0].getValueType() == MVT::v2i64));
34618 EVT OutVT = N->getValueType(0);
34619 EVT OutSVT = OutVT.getVectorElementType();
34620 EVT InVT = Regs[0].getValueType();
34621 EVT InSVT = InVT.getVectorElementType();
34624 // First, use a mask to clear all bits that won't appear in the result.
34625 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
34626 "OutSVT can only be either i8 or i16.");
34628 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
34629 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
34630 for (auto &Reg : Regs)
34631 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
34633 MVT UnpackedVT, PackedVT;
34634 if (OutSVT == MVT::i8) {
34635 UnpackedVT = MVT::v8i16;
34636 PackedVT = MVT::v16i8;
34638 UnpackedVT = MVT::v4i32;
34639 PackedVT = MVT::v8i16;
34642 // In each iteration, halve the element size of the type.
34643 auto RegNum = Regs.size();
34644 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
34645 j < e; j *= 2, RegNum /= 2) {
34646 for (unsigned i = 0; i < RegNum; i++)
34647 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
34648 for (unsigned i = 0; i < RegNum / 2; i++)
34649 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
34653 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS and
34654 // then extract a subvector as the result, since v8i8 is not a legal type.
34655 if (OutVT == MVT::v8i8) {
34656 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
34657 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
34658 DAG.getIntPtrConstant(0, DL));
34660 } else if (RegNum > 1) {
34661 Regs.resize(RegNum);
34662 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34667 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
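/// PACKSS saturates on signed overflow, so the inputs are first sign-extended
/// in-register: shifting each i32 left and then arithmetically right by 16 bits
/// leaves every element an in-range i16 (e.g. 0x00012345 becomes 0x00002345).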
34669 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
34671 SmallVector<SDValue, 8> &Regs) {
34672 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
34673 EVT OutVT = N->getValueType(0);
34676 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
34677 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
34678 for (auto &Reg : Regs) {
34679 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
34681 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
34685 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
34686 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
34689 if (Regs.size() > 2) {
34690 Regs.resize(Regs.size() / 2);
34691 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
34696 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
34697 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
34698 /// legalization the truncation would be translated into a BUILD_VECTOR whose
34699 /// elements are each extracted from a vector and then truncated, and it is
34700 /// difficult to perform this optimization on that form.
34701 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
34702 const X86Subtarget &Subtarget) {
34703 EVT OutVT = N->getValueType(0);
34704 if (!OutVT.isVector())
34707 SDValue In = N->getOperand(0);
34708 if (!In.getValueType().isSimple())
34711 EVT InVT = In.getValueType();
34712 unsigned NumElems = OutVT.getVectorNumElements();
34714 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
34715 // SSE2, and we need to take care of it specially.
34716 // AVX512 provides vpmovdb.
34717 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
34720 EVT OutSVT = OutVT.getVectorElementType();
34721 EVT InSVT = InVT.getVectorElementType();
34722 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
34723 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
34727 // SSSE3's pshufb results in fewer instructions in the cases below.
34728 if (Subtarget.hasSSSE3() && NumElems == 8 &&
34729 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
34730 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
34735 // Split a long vector into vectors of legal type.
34736 unsigned RegNum = InVT.getSizeInBits() / 128;
34737 SmallVector<SDValue, 8> SubVec(RegNum);
34738 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
34739 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
34741 for (unsigned i = 0; i < RegNum; i++)
34742 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
34743 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
34745 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
34746 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
34747 // truncate 2 x v4i32 to v8i16.
34748 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
34749 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
34750 else if (InSVT == MVT::i32)
34751 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
34756 /// This function transforms vector truncation of 'extended sign-bits' or
34757 /// 'extended zero-bits' values, i.e. truncation from
34758 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32, using X86ISD::PACKSS/PACKUS operations.
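/// For example, truncating a v8i32 comparison result to v8i16 can be done with
/// PACKSS alone, because every element of the input is known to be 0 or -1 and
/// therefore survives the signed saturation unchanged.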
34759 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
34761 const X86Subtarget &Subtarget) {
34762 // Requires SSE2. Skip on AVX512, which already has fast truncate instructions.
34763 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
34766 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
34769 SDValue In = N->getOperand(0);
34770 if (!In.getValueType().isSimple())
34773 MVT VT = N->getValueType(0).getSimpleVT();
34774 MVT SVT = VT.getScalarType();
34776 MVT InVT = In.getValueType().getSimpleVT();
34777 MVT InSVT = InVT.getScalarType();
34779 // Check we have a truncation suited for PACKSS.
34780 if (!VT.is128BitVector() && !VT.is256BitVector())
34782 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
34784 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
34787 // Use PACKSS if the input has sign-bits that extend all the way to the
34788 // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
34789 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
34790 unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
34791 if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
34792 return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
34794 // Use PACKUS if the input has zero-bits that extend all the way to the
34795 // packed/truncated value. e.g. masks, zext_in_reg, etc.
34797 DAG.computeKnownBits(In, Known);
34798 unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
34799 NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
34800 if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
34801 return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
34806 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
34807 const X86Subtarget &Subtarget) {
34808 EVT VT = N->getValueType(0);
34809 SDValue Src = N->getOperand(0);
34812 // Attempt to pre-truncate inputs to arithmetic ops instead.
34813 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
34816 // Try to detect AVG pattern first.
34817 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
34820 // Try to combine truncation with unsigned saturation.
34821 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
34824 // The bitcast source is a direct mmx result.
34825 // Detect a truncation to i32 of a bitcast from x86mmx.
34826 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
34827 SDValue BCSrc = Src.getOperand(0);
34828 if (BCSrc.getValueType() == MVT::x86mmx)
34829 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
34832 // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
34833 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
34836 return combineVectorTruncation(N, DAG, Subtarget);
34839 /// Returns the negated value if the node \p N flips the sign of an FP value.
34841 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
34842 /// AVX512F does not have FXOR, so FNEG is lowered as
34843 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
34844 /// In this case we look through all the bitcasts.
34845 static SDValue isFNEG(SDNode *N) {
34846 if (N->getOpcode() == ISD::FNEG)
34847 return N->getOperand(0);
34849 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
34850 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
34853 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
34854 if (!Op1.getValueType().isFloatingPoint())
34857 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
34859 unsigned EltBits = Op1.getScalarValueSizeInBits();
34860 auto isSignMask = [&](const ConstantFP *C) {
34861 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
34864 // There is more than one way to represent the same constant on
34865 // different X86 targets. The type of the node may also depend on its size:
34866 //  - load scalar value and broadcast
34867 //  - BUILD_VECTOR node
34868 //  - load from a constant pool.
34869 // We check all variants here.
34870 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
34871 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
34872 if (isSignMask(cast<ConstantFP>(C)))
34875 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
34876 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
34877 if (isSignMask(CN->getConstantFPValue()))
34880 } else if (auto *C = getTargetConstantFromNode(Op1)) {
34881 if (C->getType()->isVectorTy()) {
34882 if (auto *SplatV = C->getSplatValue())
34883 if (isSignMask(cast<ConstantFP>(SplatV)))
34885 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
34886 if (isSignMask(FPConst))
34892 /// Do target-specific dag combines on floating point negations.
34893 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
34894 const X86Subtarget &Subtarget) {
34895 EVT OrigVT = N->getValueType(0);
34896 SDValue Arg = isFNEG(N);
34897 assert(Arg.getNode() && "N is expected to be an FNEG node");
34899 EVT VT = Arg.getValueType();
34900 EVT SVT = VT.getScalarType();
34903 // Let legalize expand this if it isn't a legal type yet.
34904 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34907 // If we're negating an FMUL node on a target with FMA, then we can avoid the
34908 // use of a constant by performing (-0 - A*B) instead.
34909 // FIXME: Check rounding control flags as well once they become available.
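// E.g. (fneg (fmul x, y)) --> (X86ISD::FNMSUB x, y, 0), which computes
// (-0 - x*y) without materializing a sign-mask constant.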
34910 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
34911 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
34912 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
34913 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
34914 Arg.getOperand(1), Zero);
34915 return DAG.getBitcast(OrigVT, NewNode);
34918 // If we're negating an FMA node, then we can adjust the
34919 // instruction to include the extra negation.
34920 unsigned NewOpcode = 0;
34921 if (Arg.hasOneUse()) {
34922 switch (Arg.getOpcode()) {
34923 case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
34924 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
34925 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
34926 case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
34927 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
34928 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
34929 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
34930 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
34931 // We can't handle scalar intrinsic nodes here because they would only
34932 // invert one element and not the whole vector. But we could try to handle
34933 // a negation of the lower element only.
34937 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
34938 Arg.getNode()->ops()));
34943 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
34944 const X86Subtarget &Subtarget) {
34945 MVT VT = N->getSimpleValueType(0);
34946 // If we have integer vector types available, use the integer opcodes.
34947 if (VT.isVector() && Subtarget.hasSSE2()) {
34950 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
34952 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
34953 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
34954 unsigned IntOpcode;
34955 switch (N->getOpcode()) {
34956 default: llvm_unreachable("Unexpected FP logic op");
34957 case X86ISD::FOR: IntOpcode = ISD::OR; break;
34958 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34959 case X86ISD::FAND: IntOpcode = ISD::AND; break;
34960 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34962 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34963 return DAG.getBitcast(VT, IntOp);
34969 /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
34970 static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
34971 if (N->getOpcode() != ISD::XOR)
34974 SDValue LHS = N->getOperand(0);
34975 auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
34976 if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
34979 X86::CondCode NewCC = X86::GetOppositeBranchCondition(
34980 X86::CondCode(LHS->getConstantOperandVal(0)));
34982 return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
34985 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34986 TargetLowering::DAGCombinerInfo &DCI,
34987 const X86Subtarget &Subtarget) {
34988 // If this is SSE1-only, convert to FXOR to avoid scalarization.
34989 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
34990 N->getValueType(0) == MVT::v4i32) {
34991 return DAG.getBitcast(
34992 MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
34993 DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
34994 DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
34997 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
35000 if (DCI.isBeforeLegalizeOps())
35003 if (SDValue SetCC = foldXor1SetCC(N, DAG))
35006 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
35009 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
35013 return combineFneg(N, DAG, Subtarget);
35018 static bool isNullFPScalarOrVectorConst(SDValue V) {
35019 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
35022 /// If a value is a scalar FP zero or a vector FP zero (potentially including
35023 /// undefined elements), return a zero constant that may be used to fold away
35024 /// that value. In the case of a vector, the returned constant will not contain
35025 /// undefined elements even if the input parameter does. This makes it suitable
35026 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
35027 /// an undef should not propagate.
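/// For example, (v4f32 BUILD_VECTOR 0.0, undef, 0.0, 0.0) is replaced by a
/// fully-defined v4f32 zero vector.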
35028 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
35029 const X86Subtarget &Subtarget) {
35030 if (!isNullFPScalarOrVectorConst(V))
35033 if (V.getValueType().isVector())
35034 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
35039 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
35040 const X86Subtarget &Subtarget) {
35041 SDValue N0 = N->getOperand(0);
35042 SDValue N1 = N->getOperand(1);
35043 EVT VT = N->getValueType(0);
35046 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
35047 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
35048 (VT == MVT::f64 && Subtarget.hasSSE2()) ||
35049 (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
35052 auto isAllOnesConstantFP = [](SDValue V) {
35053 if (V.getSimpleValueType().isVector())
35054 return ISD::isBuildVectorAllOnes(V.getNode());
35055 auto *C = dyn_cast<ConstantFPSDNode>(V);
35056 return C && C->getConstantFPValue()->isAllOnesValue();
35059 // fand (fxor X, -1), Y --> fandn X, Y
35060 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
35061 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
35063 // fand X, (fxor Y, -1) --> fandn Y, X
35064 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
35065 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
35070 /// Do target-specific dag combines on X86ISD::FAND nodes.
35071 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
35072 const X86Subtarget &Subtarget) {
35073 // FAND(0.0, x) -> 0.0
35074 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
35077 // FAND(x, 0.0) -> 0.0
35078 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35081 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
35084 return lowerX86FPLogicOp(N, DAG, Subtarget);
35087 /// Do target-specific dag combines on X86ISD::FANDN nodes.
35088 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
35089 const X86Subtarget &Subtarget) {
35090 // FANDN(0.0, x) -> x
35091 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35092 return N->getOperand(1);
35094 // FANDN(x, 0.0) -> 0.0
35095 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
35098 return lowerX86FPLogicOp(N, DAG, Subtarget);
35101 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
35102 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
35103 const X86Subtarget &Subtarget) {
35104 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
35106 // F[X]OR(0.0, x) -> x
35107 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
35108 return N->getOperand(1);
35110 // F[X]OR(x, 0.0) -> x
35111 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
35112 return N->getOperand(0);
35115 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
35118 return lowerX86FPLogicOp(N, DAG, Subtarget);
35121 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
35122 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
35123 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
35125 // Only perform optimizations if UnsafeMath is used.
35126 if (!DAG.getTarget().Options.UnsafeFPMath)
35129 // If we run in unsafe-math mode, then convert the FMIN and FMAX nodes
35130 // into FMINC and FMAXC, which are commutative operations.
35131 unsigned NewOp = 0;
35132 switch (N->getOpcode()) {
35133 default: llvm_unreachable("unknown opcode");
35134 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
35135 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
35138 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
35139 N->getOperand(0), N->getOperand(1));
35142 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
35143 const X86Subtarget &Subtarget) {
35144 if (Subtarget.useSoftFloat())
35147 // TODO: Check for global or instruction-level "nnan". In that case, we
35148 // should be able to lower to FMAX/FMIN alone.
35149 // TODO: If an operand is already known to be a NaN or not a NaN, this
35150 // should be an optional swap and FMAX/FMIN.
35152 EVT VT = N->getValueType(0);
35153 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
35154 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
35155 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
35158 // This takes at least 3 instructions, so favor a library call when operating
35159 // on a scalar and minimizing code size.
35160 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
35163 SDValue Op0 = N->getOperand(0);
35164 SDValue Op1 = N->getOperand(1);
35166 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
35167 DAG.getDataLayout(), *DAG.getContext(), VT);
35169 // There are 4 possibilities involving NaN inputs, and these are the required
35170 // outputs:
35171 //                    Op1
35172 //              Num     NaN
35173 //           ----------------
35174 //    Num    |  Max  |  Op0 |
35175 // Op0       ----------------
35176 //    NaN    |  Op1  |  NaN |
35177 //           ----------------
35179 // The SSE FP max/min instructions were not designed for this case, but rather to implement:
35181 // Min = Op1 < Op0 ? Op1 : Op0
35182 // Max = Op1 > Op0 ? Op1 : Op0
35184 // So they always return Op0 if either input is a NaN. However, we can still
35185 // use those instructions for fmaxnum by selecting away a NaN input.
35187 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
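// For example (illustrative), for (fmaxnum NaN, 5.0): FMAX(Op1=5.0, Op0=NaN)
// returns its 2nd source operand (NaN), but Op0 compares unordered with
// itself, so the select below picks Op1 and the final result is 5.0, as
// fmaxnum requires.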
35188 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
35189 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
35190 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
35192 // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both operands
35193 // are NaN, the NaN value of Op1 is the result.
35194 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
35197 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
35198 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
35199 TargetLowering::DAGCombinerInfo &DCI,
35200 const X86Subtarget &Subtarget) {
35201 // ANDNP(0, x) -> x
35202 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
35203 return N->getOperand(1);
35205 // ANDNP(x, 0) -> 0
35206 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
35207 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
35209 EVT VT = N->getValueType(0);
35211 // Attempt to recursively combine a bitmask ANDNP with shuffles.
35212 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
35214 if (SDValue Res = combineX86ShufflesRecursively(
35215 {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
35216 /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
35217 DCI.CombineTo(N, Res);
35225 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
35226 TargetLowering::DAGCombinerInfo &DCI) {
35227 SDValue N0 = N->getOperand(0);
35228 SDValue N1 = N->getOperand(1);
35230 // BT ignores high bits in the bit index operand.
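// E.g. for a 64-bit index operand only the low 6 bits matter, so a pattern
// like (bt x, (and i64 idx, 63)) can drop the mask; GetDemandedBits below
// strips operations that only affect the ignored high bits.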
35231 unsigned BitWidth = N1.getValueSizeInBits();
35232 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
35233 if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
35234 return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
35239 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
35240 const X86Subtarget &Subtarget) {
35241 EVT VT = N->getValueType(0);
35242 if (!VT.isVector())
35245 SDValue N0 = N->getOperand(0);
35246 SDValue N1 = N->getOperand(1);
35247 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
35250 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
35251 // both SSE and AVX2, since there is no sign-extended shift right
35252 // operation on a vector with 64-bit elements.
35253 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
35254 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
35255 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
35256 N0.getOpcode() == ISD::SIGN_EXTEND)) {
35257 SDValue N00 = N0.getOperand(0);
35259 // EXTLOAD has a better solution on AVX2:
35260 // it may be replaced with an X86ISD::VSEXT node.
35261 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
35262 if (!ISD::isNormalLoad(N00.getNode()))
35265 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
35266 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
35268 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
35274 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
35275 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
35276 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
35277 /// opportunities to combine math ops, use an LEA, or use a complex addressing
35278 /// mode. This can eliminate extend, add, and shift instructions.
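/// For example (illustrative), (shl (sext (add_nsw x, 4)), 2) becomes
/// (shl (add (sext x), 4), 2), which can match an LEA such as
/// lea 16(,%reg,4), %dst on x86-64.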
35279 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
35280 const X86Subtarget &Subtarget) {
35281 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
35282 Ext->getOpcode() != ISD::ZERO_EXTEND)
35285 // TODO: This should be valid for other integer types.
35286 EVT VT = Ext->getValueType(0);
35287 if (VT != MVT::i64)
35290 SDValue Add = Ext->getOperand(0);
35291 if (Add.getOpcode() != ISD::ADD)
35294 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
35295 bool NSW = Add->getFlags().hasNoSignedWrap();
35296 bool NUW = Add->getFlags().hasNoUnsignedWrap();
35298 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
35300 if ((Sext && !NSW) || (!Sext && !NUW))
35303 // Having a constant operand to the 'add' ensures that we are not increasing
35304 // the instruction count because the constant is extended for free below.
35305 // A constant operand can also become the displacement field of an LEA.
35306 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
35310 // Don't make the 'add' bigger if there's no hope of combining it with some
35311 // other 'add' or 'shl' instruction.
35312 // TODO: It may be profitable to generate simpler LEA instructions in place
35313 // of single 'add' instructions, but the cost model for selecting an LEA
35314 // currently has a high threshold.
35315 bool HasLEAPotential = false;
35316 for (auto *User : Ext->uses()) {
35317 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
35318 HasLEAPotential = true;
35322 if (!HasLEAPotential)
35325 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
35326 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
35327 SDValue AddOp0 = Add.getOperand(0);
35328 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
35329 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
35331 // The wider add is guaranteed to not wrap because both operands are
35334 Flags.setNoSignedWrap(NSW);
35335 Flags.setNoUnsignedWrap(NUW);
35336 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
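// Example for promoteExtBeforeAdd: with an i64 user that can fold into an
// LEA,
//   (i64 sext (i32 add nsw %x, 5))
// becomes
//   (i64 add nsw (i64 sext %x), 5)
// so a consuming add/shl can be selected as a single
// 'lea reg, [base + index*scale + 5]' addressing-mode computation.
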
/// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
///   (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
/// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
/// extends from AH (which we otherwise need to do contortions to access).
static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
  SDValue N0 = N->getOperand(0);
  auto OpcodeN = N->getOpcode();
  auto OpcodeN0 = N0.getOpcode();
  if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
        (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
    return SDValue();

  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  if (N0.getResNo() != 1 || InVT != MVT::i8 ||
      !(VT == MVT::i32 || VT == MVT::i64))
    return SDValue();

  SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
  auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
                                               : X86ISD::UDIVREM8_ZEXT_HREG;
  SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
                          N0.getOperand(1));
  DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
  // If this was a 64-bit extend, complete it.
  if (VT == MVT::i64)
    return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
  return R.getValue(1);
}

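// Example for getDivRem8: for
//   (i32 sext (i8 (sdivrem %x, %y):1))
// the remainder of an 8-bit IDIV already lives in AH, so emitting
// X86ISD::SDIVREM8_SEXT_HREG lets isel produce 'movsx eax, ah'-style code
// instead of extracting AH separately and extending it afterwards.
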
// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
// operands and the result of CMOV is not used anywhere else - promote CMOV
// itself instead of promoting its result. This could be beneficial, because:
// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
//    (or more) pseudo-CMOVs only when they go one-after-another and
//    getting rid of result extension code after CMOV will help that.
// 2) Promotion of constant CMOV arguments is free, hence the
//    {ANY,SIGN,ZERO}_EXTEND will just be deleted.
// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this
//    promotion is also good in terms of code-size.
//    (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit
//    promotion.)
static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
  SDValue CMovN = Extend->getOperand(0);
  if (CMovN.getOpcode() != X86ISD::CMOV)
    return SDValue();

  EVT TargetVT = Extend->getValueType(0);
  unsigned ExtendOpcode = Extend->getOpcode();
  SDLoc DL(Extend);

  EVT VT = CMovN.getValueType();
  SDValue CMovOp0 = CMovN.getOperand(0);
  SDValue CMovOp1 = CMovN.getOperand(1);

  bool DoPromoteCMOV =
      (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
      CMovN.hasOneUse() &&
      (isa<ConstantSDNode>(CMovOp0.getNode()) &&
       isa<ConstantSDNode>(CMovOp1.getNode()));

  if (!DoPromoteCMOV)
    return SDValue();

  CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
  CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);

  return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
                     CMovN.getOperand(2), CMovN.getOperand(3));
}

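// Example for combineToExtendCMOV:
//   (i32 zext (i16 X86ISD::CMOV 7, 42, cond, eflags))
// is rewritten to (i32 X86ISD::CMOV 7, 42, cond, eflags): the constant
// operands are extended for free, the separate zext disappears, and the
// 3-byte 32-bit CMOV replaces the 4-byte 16-bit encoding.
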
// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
// This is more or less the reverse of combineBitcastvxi1.
static SDValue
combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
                               TargetLowering::DAGCombinerInfo &DCI,
                               const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
      Opcode != ISD::ANY_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InSVT = N0.getValueType().getScalarType();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  // Input type must be extending a bool vector (bit-casted from a scalar
  // integer) to legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
    return SDValue();
  if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
    return SDValue();

  SDValue N00 = N0.getOperand(0);
  EVT SclVT = N0.getOperand(0).getValueType();
  if (!SclVT.isScalarInteger())
    return SDValue();

  SDLoc DL(N);
  SDValue Vec;
  SmallVector<int, 32> ShuffleMask;
  unsigned NumElts = VT.getVectorNumElements();
  assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");

  // Broadcast the scalar integer to the vector elements.
  if (NumElts > EltSizeInBits) {
    // If the scalar integer is greater than the vector element size, then we
    // must split it down into sub-sections for broadcasting. For example:
    //   i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
    //   i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
    assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
    unsigned Scale = NumElts / EltSizeInBits;
    EVT BroadcastVT =
        EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
    Vec = DAG.getBitcast(VT, Vec);

    for (unsigned i = 0; i != Scale; ++i)
      ShuffleMask.append(EltSizeInBits, i);
  } else {
    // For smaller scalar integers, we can simply any-extend it to the vector
    // element size (we don't care about the upper bits) and broadcast it to
    // all elements.
    SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
    Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
    ShuffleMask.append(NumElts, 0);
  }
  Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);

  // Now, mask the relevant bit in each element.
  SmallVector<SDValue, 32> Bits;
  for (unsigned i = 0; i != NumElts; ++i) {
    int BitIdx = (i % EltSizeInBits);
    APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
    Bits.push_back(DAG.getConstant(Bit, DL, SVT));
  }
  SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
  Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);

  // Compare against the bitmask and extend the result.
  EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
  Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
  Vec = DAG.getSExtOrTrunc(Vec, DL, VT);

  // For SEXT, this is now done, otherwise shift the result down for
  // zero-extension.
  if (Opcode == ISD::SIGN_EXTEND)
    return Vec;
  return DAG.getNode(ISD::SRL, DL, VT, Vec,
                     DAG.getConstant(EltSizeInBits - 1, DL, VT));
}

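// Worked example for combineToExtendBoolVectorInReg, extending
// (v8i16 zext (v8i1 bitcast (i8 %m))):
//   1. Broadcast %m (any-extended to i16) into all 8 lanes.
//   2. AND lane i with the single-bit mask (1 << i).
//   3. SETEQ against the same mask gives all-ones in lanes whose bit is set.
//   4. Because this is ZERO_EXTEND, shift right by 15 to leave 0/1 per lane.
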
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating with
/// UNDEFs) the input into vectors of the same size as the target type, which
/// then extend the lowest elements.
static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const X86Subtarget &Subtarget) {
  unsigned Opcode = N->getOpcode();
  if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
    return SDValue();
  if (!DCI.isBeforeLegalizeOps())
    return SDValue();
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT SVT = VT.getScalarType();
  EVT InVT = N0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // Input type must be a vector and we must be extending legal integer types.
  if (!VT.isVector())
    return SDValue();
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();

  // On AVX2+ targets, if the input/output types are both legal, then we will
  // be able to use SIGN_EXTEND/ZERO_EXTEND directly.
  if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
      DAG.getTargetLoweringInfo().isTypeLegal(InVT))
    return SDValue();

  SDLoc DL(N);

  auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
    EVT InVT = N.getValueType();
    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
                                 Size / InVT.getScalarSizeInBits());
    SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
                                  DAG.getUNDEF(InVT));
    Opnds[0] = N;
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
  };

  // If the target size is less than 128 bits, extend to a type that would
  // extend to 128 bits, extend that and extract the original target vector.
  if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
    unsigned Scale = 128 / VT.getSizeInBits();
    EVT ExVT =
        EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
    SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
    SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
                       DAG.getIntPtrConstant(0, DL));
  }

  // If the target size is 128 bits (or 256 bits on an AVX2 target), then
  // convert to ISD::*_EXTEND_VECTOR_INREG, which ensures lowering to
  // X86ISD::V*EXT.
  // Also use this if we don't have SSE41 to allow the legalizer do its job.
  if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
      (VT.is256BitVector() && Subtarget.hasInt256()) ||
      (VT.is512BitVector() && Subtarget.hasAVX512())) {
    SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
    return Opcode == ISD::SIGN_EXTEND
               ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
               : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
  }

  auto SplitAndExtendInReg = [&](unsigned SplitSize) {
    unsigned NumVecs = VT.getSizeInBits() / SplitSize;
    unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
    EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
    EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);

    SmallVector<SDValue, 8> Opnds;
    for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
      SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
                                   DAG.getIntPtrConstant(Offset, DL));
      SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
      SrcVec = Opcode == ISD::SIGN_EXTEND
                   ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
                   : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
      Opnds.push_back(SrcVec);
    }
    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
  };

  // On pre-AVX2 targets, split into 128-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
    return SplitAndExtendInReg(128);

  // On pre-AVX512 targets, split into 256-bit nodes of
  // ISD::*_EXTEND_VECTOR_INREG.
  if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
    return SplitAndExtendInReg(256);

  return SDValue();
}

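// Example for combineToExtendVectorInReg: on an SSE4.1 (pre-AVX2) target,
//   (v8i32 sext (v8i16 %x))
// is split by SplitAndExtendInReg(128) into two v4i32
// SIGN_EXTEND_VECTOR_INREG nodes (lowered to PMOVSXWD) over the low and high
// halves of %x, then reassembled with CONCAT_VECTORS.
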
static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = N0.getValueType();
  SDLoc DL(N);

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (!DCI.isBeforeLegalizeOps()) {
    if (InVT == MVT::i1) {
      SDValue Zero = DAG.getConstant(0, DL, VT);
      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
      return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
    }
    return SDValue();
  }

  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
    // Inverting and sign-extending a boolean is the same as zero-extending
    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract
    // is efficiently lowered with an LEA or a DEC. This is the same as:
    // select Bool, 0, -1.
    // sext (xor Bool, -1) --> sub (zext Bool), 1
    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
  }

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (Subtarget.hasAVX() && VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  return SDValue();
}

static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  EVT ScalarVT = VT.getScalarType();
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(0);
  SDValue B = N->getOperand(1);
  SDValue C = N->getOperand(2);

  auto invertIfNegative = [](SDValue &V) {
    if (SDValue NegVal = isFNEG(V.getNode())) {
      V = NegVal;
      return true;
    }
    return false;
  };

  // Do not convert the passthru input of scalar intrinsics.
  // FIXME: We could allow negations of the lower element only.
  bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
              N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
  bool NegB = invertIfNegative(B);
  bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
              N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);

  // The multiplication is negated when NegA xor NegB.
  bool NegMul = (NegA != NegB);
  bool HasNeg = NegA || NegB || NegC;

  unsigned NewOpcode;
  if (!NegMul)
    NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
  else
    NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;

  // For FMA, we risk reconstructing the node we started with.
  // In order to avoid this, we check for negation or opcode change. If
  // one of the two happened, then it is a new node and we return it.
  if (N->getOpcode() == ISD::FMA) {
    if (HasNeg || NewOpcode != N->getOpcode())
      return DAG.getNode(NewOpcode, dl, VT, A, B, C);
    return SDValue();
  }

  if (N->getOpcode() == X86ISD::FMADD_RND) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
    }
  } else if (N->getOpcode() == X86ISD::FMADD4S) {
    switch (NewOpcode) {
    case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
    case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
    case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
    case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
    }
  } else {
    llvm_unreachable("Unexpected opcode!");
  }

  // Only return the node if the opcode was changed or one of the
  // operands was negated. If not, we'll just recreate the same node.
  if (HasNeg || NewOpcode != N->getOpcode()) {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }

  return SDValue();
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);

  SDValue NegVal = isFNEG(N->getOperand(2).getNode());
  if (!NegVal)
    return SDValue();

  unsigned NewOpcode;
  switch (N->getOpcode()) {
  default: llvm_unreachable("Unexpected opcode!");
  case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
  case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
  case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
  case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
  }

  if (N->getNumOperands() == 4)
    return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                       NegVal, N->getOperand(3));
  return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
                     NegVal);
}

static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
  //   (and (i32 x86isd::setcc_carry), 1)
  // This eliminates the zext. This transformation is necessary because
  // ISD::SETCC is always legalized to i8.
  SDLoc dl(N);
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  if (N0.getOpcode() == ISD::AND &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      if (!isOneConstant(N0.getOperand(1)))
        return SDValue();
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (N0.getOpcode() == ISD::TRUNCATE &&
      N0.hasOneUse() &&
      N0.getOperand(0).hasOneUse()) {
    SDValue N00 = N0.getOperand(0);
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      return DAG.getNode(ISD::AND, dl, VT,
                         DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
                                     N00.getOperand(0), N00.getOperand(1)),
                         DAG.getConstant(1, dl, VT));
    }
  }

  if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
    return NewCMov;

  if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
    return V;

  if (VT.is256BitVector())
    if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
      return R;

  if (SDValue DivRem8 = getDivRem8(N, DAG))
    return DivRem8;

  if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
    return NewAdd;

  if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
    return R;

  return SDValue();
}

/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");

  // We're looking for an oversized integer equality comparison, but ignore a
  // comparison with zero because that gets special treatment in EmitTest().
  SDValue X = SetCC->getOperand(0);
  SDValue Y = SetCC->getOperand(1);
  EVT OpVT = X.getValueType();
  unsigned OpSize = OpVT.getSizeInBits();
  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
    return SDValue();

  // Bail out if we know that this is not really just an oversized integer.
  if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
      peekThroughBitcasts(Y).getValueType() == MVT::f128)
    return SDValue();

  // TODO: Use PXOR + PTEST for SSE4.1 or later?
  // TODO: Add support for AVX-512.
  EVT VT = SetCC->getValueType(0);
  SDLoc DL(SetCC);
  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
      (OpSize == 256 && Subtarget.hasAVX2())) {
    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
    SDValue VecX = DAG.getBitcast(VecVT, X);
    SDValue VecY = DAG.getBitcast(VecVT, Y);

    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
                                    MVT::i32);
    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
  }

  return SDValue();
}

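// For instance, an i128 equality compare becomes (roughly):
//   pcmpeqb  %xmm1, %xmm0    ; bytewise compare of the two 128-bit values
//   pmovmskb %xmm0, %eax     ; collect one bit per byte
//   cmpl     $0xFFFF, %eax   ; all 16 bytes equal?
//   sete     %al
// rather than two 64-bit compares plus extra logic to merge the results.
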
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
    EVT OpVT = LHS.getValueType();
    // 0-x == y --> x+y == 0
    // 0-x != y --> x+y != 0
    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
        LHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }
    // x == 0-y --> x+y == 0
    // x != 0-y --> x+y != 0
    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
        RHS.hasOneUse()) {
      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
    }

    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
      return V;
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
      (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
    // Put build_vectors on the right.
    if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
      std::swap(LHS, RHS);
      CC = ISD::getSetCCSwappedOperands(CC);
    }

    bool IsSEXT0 =
        (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
        (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
    bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());

    if (IsSEXT0 && IsVZero1) {
      assert(VT == LHS.getOperand(0).getValueType() &&
             "Unexpected operand type");
      if (CC == ISD::SETGT)
        return DAG.getConstant(0, DL, VT);
      if (CC == ISD::SETLE)
        return DAG.getConstant(1, DL, VT);
      if (CC == ISD::SETEQ || CC == ISD::SETGE)
        return DAG.getNOT(DL, LHS.getOperand(0), VT);

      assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
             "Unexpected condition code!");
      return LHS.getOperand(0);
    }
  }

  // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
  // to avoid scalarization via legalization because v4i32 is not a legal type.
  if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
      LHS.getValueType() == MVT::v4f32)
    return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);

  return SDValue();
}

static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(N);

  // Pre-shrink oversized index elements to avoid triggering scalarization.
  if (DCI.isBeforeLegalize()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() > 64) {
      EVT IndexVT =
          EVT::getVectorVT(*DAG.getContext(), MVT::i64,
                           Index.getValueType().getVectorNumElements());
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Trunc;
      DAG.UpdateNodeOperands(N, NewOps);
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Try to remove sign extends from i32 to i64 on the index.
  // Only do this before legalize in case we are relying on it for
  // legalization.
  // TODO: We should maybe remove any sign extend once we learn how to sign
  // extend narrow index during lowering.
  if (DCI.isBeforeLegalizeOps()) {
    SDValue Index = N->getOperand(4);
    if (Index.getScalarValueSizeInBits() == 64 &&
        Index.getOpcode() == ISD::SIGN_EXTEND &&
        Index.getOperand(0).getScalarValueSizeInBits() == 32) {
      SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
      NewOps[4] = Index.getOperand(0);
      DAG.UpdateNodeOperands(N, NewOps);
      // The original sign extend now has fewer users; add it back to the
      // worklist in case it needs to be removed.
      DCI.AddToWorklist(Index.getNode());
      DCI.AddToWorklist(N);
      return SDValue(N, 0);
    }
  }

  // Gather and Scatter instructions use k-registers for masks. The type of
  // the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
  SDValue Mask = N->getOperand(2);
  if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
    SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
    NewOps[2] = Mask.getOperand(0);
    DAG.UpdateNodeOperands(N, NewOps);
  }

  // With AVX2 we only demand the upper bit of the mask.
  if (!Subtarget.hasAVX512()) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
                                          !DCI.isBeforeLegalizeOps());
    KnownBits Known;
    APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
    if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
      DCI.AddToWorklist(Mask.getNode());
      DCI.CommitTargetLoweringOpt(TLO);
      return SDValue(N, 0);
    }
  }

  return SDValue();
}

// Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
  SDValue EFLAGS = N->getOperand(1);

  // Try to simplify the EFLAGS and condition code operands.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
    return getSETCC(CC, Flags, DL, DAG);

  return SDValue();
}

/// Optimize branch condition evaluation.
static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

  // Try to simplify the EFLAGS and condition code operands.
  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
  // RAUW them under us.
  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
}

static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
                                                  SelectionDAG &DAG) {
  // Take advantage of vector comparisons producing 0 or -1 in each lane to
  // optimize away the operation when it's from a constant.
  //
  // The general transformation is:
  //    UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
  //    AND(VECTOR_CMP(x,y), constant2)
  //    constant2 = UNARYOP(constant)
  //
  // Early exit if this isn't a vector operation, the operand of the
  // unary operation isn't a bitwise AND, or if the sizes of the operations
  // aren't the same.
  EVT VT = N->getValueType(0);
  if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
      N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
      VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
    return SDValue();

  // Now check that the other operand of the AND is a constant. We could
  // make the transformation for non-constant splats as well, but it's unclear
  // that would be a benefit as it would not eliminate any operations, just
  // perform one more step in scalar code before moving to the vector unit.
  if (BuildVectorSDNode *BV =
          dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
    // Bail out if the vector isn't a constant.
    if (!BV->isConstant())
      return SDValue();

    // Everything checks out. Build up the new and improved node.
    SDLoc DL(N);
    EVT IntVT = BV->getValueType(0);
    // Create a new constant of the appropriate type for the transformed
    // DAG.
    SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
    // The AND node needs bitcasts to/from an integer vector type around it.
    SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
    SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
                                 N->getOperand(0)->getOperand(0), MaskConst);
    SDValue Res = DAG.getBitcast(VT, NewAnd);
    return Res;
  }

  return SDValue();
}

static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
  // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
  if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);

    // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(Op0))
    return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);

  return SDValue();
}

static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
  // First try to optimize away the conversion entirely when it's
  // conditionally from a constant. Vectors only.
  if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
    return Res;

  // Now move on to more general possibilities.
  SDValue Op0 = N->getOperand(0);
  EVT VT = N->getValueType(0);
  EVT InVT = Op0.getValueType();
  EVT InSVT = InVT.getScalarType();

  // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
  // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
  // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
  if (InVT.isVector() &&
      (InSVT == MVT::i8 || InSVT == MVT::i16 ||
       (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
    SDLoc dl(N);
    EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                 InVT.getVectorNumElements());
    SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
    return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
  }

  // Without AVX512DQ we only support i64 to float scalar conversion. For both
  // vectors and scalars, see if we know that the upper bits are all the sign
  // bit, in which case we can truncate the input to i32 and convert from that.
  if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
    unsigned BitWidth = InVT.getScalarSizeInBits();
    unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
    if (NumSignBits >= (BitWidth - 31)) {
      EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
      if (InVT.isVector())
        TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
                                   InVT.getVectorNumElements());
      SDLoc dl(N);
      SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
      return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
    }
  }

  // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
  // a 32-bit target where SSE doesn't support i64->FP operations.
  if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
    LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
    EVT LdVT = Ld->getValueType(0);

    // This transformation is not supported if the result type is f16 or f128.
    if (VT == MVT::f16 || VT == MVT::f128)
      return SDValue();

    if (!Ld->isVolatile() && !VT.isVector() &&
        ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
        !Subtarget.is64Bit() && LdVT == MVT::i64) {
      SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
          SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
      DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
      return FILDChain;
    }
  }
  return SDValue();
}

static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}

// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
                          TargetLowering::DAGCombinerInfo &DCI) {
  // If the LHS and RHS of the ADC node are zero, then it can't overflow and
  // the result is either zero or one (depending on the input carry bit).
  // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
  if (X86::isZeroNode(N->getOperand(0)) &&
      X86::isZeroNode(N->getOperand(1)) &&
      // We don't have a good way to replace an EFLAGS use, so only do this
      // when it is unused.
      SDValue(N, 1).use_empty()) {
    SDLoc DL(N);
    EVT VT = N->getValueType(0);
    SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
    SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
                               DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                                           DAG.getConstant(X86::COND_B, DL,
                                                           MVT::i8),
                                           N->getOperand(2)),
                               DAG.getConstant(1, DL, VT));
    return DCI.CombineTo(N, Res1, CarryOut);
  }

  if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
    MVT VT = N->getSimpleValueType(0);
    SDVTList VTs = DAG.getVTList(VT, MVT::i32);
    return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
                       N->getOperand(0), N->getOperand(1),
                       Flags);
  }

  return SDValue();
}

/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
/// which is more useful than 0/1 in some cases.
static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
  SDLoc DL(N);
  // "Condition code B" is also known as "the carry flag" (CF).
  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
  MVT VT = N->getSimpleValueType(0);
  if (VT == MVT::i8)
    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));

  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
}

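// Note: 'sbb %eax, %eax' computes EAX - EAX - CF, i.e. 0 or -1 depending
// solely on the carry flag. That is why SETCC_CARRY can materialize an
// all-ones/zero value without a separate SETB plus sign-extend pair.
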
/// If this is an add or subtract where one operand is produced by a cmp+setcc,
/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
/// with CMP+{ADC, SBB}.
static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
  bool IsSub = N->getOpcode() == ISD::SUB;
  SDValue X = N->getOperand(0);
  SDValue Y = N->getOperand(1);

  // If this is an add, canonicalize a zext operand to the RHS.
  // TODO: Incomplete? What if both sides are zexts?
  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
      Y.getOpcode() != ISD::ZERO_EXTEND)
    std::swap(X, Y);

  // Look through a one-use zext.
  bool PeekedThroughZext = false;
  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
    Y = Y.getOperand(0);
    PeekedThroughZext = true;
  }

  // If this is an add, canonicalize a setcc operand to the RHS.
  // TODO: Incomplete? What if both sides are setcc?
  // TODO: Should we allow peeking through a zext of the other operand?
  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
      Y.getOpcode() != X86ISD::SETCC)
    std::swap(X, Y);

  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
  auto *ConstantX = dyn_cast<ConstantSDNode>(X);
  if (ConstantX) {
    if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
      // This is a complicated way to get -1 or 0 from the carry flag:
      // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      //  0 - SETB  -->  0 -  (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         Y.getOperand(1));
    }

    if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
        (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
      SDValue EFLAGS = Y->getOperand(1);
      if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
          EFLAGS.getValueType().isInteger() &&
          !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
        // Swap the operands of a SUB, and we have the same pattern as above.
        // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
        //  0 - SETA  (SUB A, B) -->  0 - SETB  (SUB B, A) --> SUB + SBB
        SDValue NewSub = DAG.getNode(
            X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
            EFLAGS.getOperand(1), EFLAGS.getOperand(0));
        SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
        return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                           DAG.getConstant(X86::COND_B, DL, MVT::i8),
                           NewEFLAGS);
      }
    }
  }

  if (CC == X86::COND_B) {
    // X + SETB Z --> X + (mask SBB Z, Z)
    // X - SETB Z --> X - (mask SBB Z, Z)
    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
  }

  if (CC == X86::COND_A) {
    SDValue EFLAGS = Y->getOperand(1);
    // Try to convert COND_A into COND_B in an attempt to facilitate
    // materializing "setb reg".
    //
    // Do not flip "e > c", where "c" is a constant, because the Cmp
    // instruction cannot take an immediate as its first operand.
    //
    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
        EFLAGS.getValueType().isInteger() &&
        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
                                   EFLAGS.getNode()->getVTList(),
                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
    }
  }

  if (CC != X86::COND_E && CC != X86::COND_NE)
    return SDValue();

  SDValue Cmp = Y.getOperand(1);
  if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
      !X86::isZeroNode(Cmp.getOperand(1)) ||
      !Cmp.getOperand(0).getValueType().isInteger())
    return SDValue();

  SDValue Z = Cmp.getOperand(0);
  EVT ZVT = Z.getValueType();

  // If X is -1 or 0, then we have an opportunity to avoid constants required
  // in the general case below.
  if (ConstantX) {
    // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb'
    // with fake operands:
    //  0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
    // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
    if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
      SDValue Zero = DAG.getConstant(0, DL, ZVT);
      SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
      SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8),
                         SDValue(Neg.getNode(), 1));
    }

    // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using
    // 'sbb' with fake operands:
    //  0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
    // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
    if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
        (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
      SDValue One = DAG.getConstant(1, DL, ZVT);
      SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
      return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
                         DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
    }
  }

  // (cmp Z, 1) sets the carry flag if Z is 0.
  SDValue One = DAG.getConstant(1, DL, ZVT);
  SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);

  // Add the flags type for ADC/SBB nodes.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);

  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
  if (CC == X86::COND_NE)
    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
                       DAG.getConstant(-1ULL, DL, VT), Cmp1);

  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
                     DAG.getConstant(0, DL, VT), Cmp1);
}

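// Example for combineAddOrSubToADCOrSBB: for (add %x, (setne %z, 0)) the
// combine emits (X86ISD::SBB %x, -1, (cmp %z, 1)). 'cmp %z, 1' sets CF only
// when %z == 0, and %x - (-1) - CF == %x + 1 - (%z == 0), which equals
// %x + (%z != 0) without ever materializing the setcc result in a register.
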
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDValue MulOp = N->getOperand(0);
  SDValue Phi = N->getOperand(1);

  if (MulOp.getOpcode() != ISD::MUL)
    std::swap(MulOp, Phi);
  if (MulOp.getOpcode() != ISD::MUL)
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
    return SDValue();

  EVT VT = N->getValueType(0);

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;
  unsigned VectorSize = VT.getVectorNumElements() * 16;
  // If the vector size is less than 128, or greater than the supported
  // RegSize, do not use PMADD.
  if (VectorSize < 128 || VectorSize > RegSize)
    return SDValue();

  SDLoc DL(N);
  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
                                   VT.getVectorNumElements());
  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                                VT.getVectorNumElements() / 2);

  // Shrink the operands of the mul.
  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));

  // The madd vector size is half of the original vector size.
  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
  // Fill the rest of the output with 0.
  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
}

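// Example for combineLoopMAddPattern: when both mul operands are known to fit
// in i16, a v8i32 multiply feeding a reduction add is truncated to v8i16 and
// handed to one VPMADDWD, which multiplies i16 pairs and sums adjacent
// products into v4i32; the result is zero-padded back to v8i32 and added to
// the reduction phi.
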
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  if (!Subtarget.hasSSE2())
    return SDValue();

  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // TODO: There's nothing special about i32, any integer type above i16 should
  // work just as well.
  if (!VT.isVector() || !VT.isSimple() ||
      !(VT.getVectorElementType() == MVT::i32))
    return SDValue();

  unsigned RegSize = 128;
  if (Subtarget.hasBWI())
    RegSize = 512;
  else if (Subtarget.hasAVX2())
    RegSize = 256;

  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
  // TODO: We should be able to handle larger vectors by splitting them before
  // feeding them into several SADs, and then reducing over those.
  if (VT.getSizeInBits() / 4 > RegSize)
    return SDValue();

  // We know N is a reduction add, which means one of its operands is a phi.
  // To match SAD, we need the other operand to be a vector select.
  SDValue SelectOp, Phi;
  if (Op0.getOpcode() == ISD::VSELECT) {
    SelectOp = Op0;
    Phi = Op1;
  } else if (Op1.getOpcode() == ISD::VSELECT) {
    SelectOp = Op1;
    Phi = Op0;
  } else
    return SDValue();

  // Check whether we have an abs-diff pattern feeding into the select.
  if (!detectZextAbsDiff(SelectOp, Op0, Op1))
    return SDValue();

  // SAD pattern detected. Now build a SAD instruction and an addition for
  // reduction. Note that the number of elements of the result of SAD is less
  // than the number of elements of its input. Therefore, we can only update
  // part of the elements in the reduction vector.
  SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);

  // The output of PSADBW is a vector of i64.
  // We need to turn the vector of i64 into a vector of i32.
  // If the reduction vector is at least as wide as the psadbw result, just
  // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
  // anyway.
  MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
  if (VT.getSizeInBits() >= ResVT.getSizeInBits())
    Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
  else
    Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);

  if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
    // Fill the upper elements with zero to match the add width.
    SDValue Zero = DAG.getConstant(0, DL, VT);
    Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
                      DAG.getIntPtrConstant(0, DL));
  }

  return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}

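// Example for combineLoopSADPattern: PSADBW computes, within each 64-bit
// half, the sum of absolute differences of eight byte pairs into one i64. So
// for a reduction of zext(abs(sub(a, b))) over v16i8 inputs, a single psadbw
// replaces the whole widen/sub/abs/add chain, and only the low lanes of the
// reduction vector need updating.
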
/// Convert vector increment or decrement to sub/add with an all-ones constant:
/// add X, <1, 1...> --> sub X, <-1, -1...>
/// sub X, <1, 1...> --> add X, <-1, -1...>
/// The all-ones vector constant can be materialized using a pcmpeq instruction
/// that is commonly recognized as an idiom (has no register dependency), so
/// that's better/smaller than loading a splat 1 constant.
static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
  assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
         "Unexpected opcode for increment/decrement transform");

  // Pseudo-legality check: getOnesVector() expects one of these types, so bail
  // out and wait for legalization if we have an unsupported vector length.
  EVT VT = N->getValueType(0);
  if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
    return SDValue();

  SDNode *N1 = N->getOperand(1).getNode();
  APInt SplatVal;
  if (!ISD::isConstantSplatVector(N1, SplatVal) ||
      !SplatVal.isOneValue())
    return SDValue();

  SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
  unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
  return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
}

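// For example, (add v4i32 %x, <1,1,1,1>) becomes (sub %x, <-1,-1,-1,-1>),
// which typically selects to:
//   pcmpeqd %xmm1, %xmm1   ; materialize all-ones, no constant-pool load
//   psubd   %xmm1, %xmm0
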
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
  EVT VT = N->getValueType(0);

  // PSUBUS is supported, starting from SSE2, but special preprocessing
  // for v8i32 requires umin, which appears in SSE41.
  if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
      !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
      !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
      !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
        (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
         VT == MVT::v8i64)))
    return SDValue();

  SDValue SubusLHS, SubusRHS;
  // Try to find umax(a,b) - b or a - umin(a,b) patterns;
  // they may be converted to subus(a,b).
  // TODO: Need to add IR canonicalization for this code.
  if (Op0.getOpcode() == ISD::UMAX) {
    SubusRHS = Op1;
    SDValue MaxLHS = Op0.getOperand(0);
    SDValue MaxRHS = Op0.getOperand(1);
    if (MaxLHS == Op1)
      SubusLHS = MaxRHS;
    else if (MaxRHS == Op1)
      SubusLHS = MaxLHS;
    else
      return SDValue();
  } else if (Op1.getOpcode() == ISD::UMIN) {
    SubusLHS = Op0;
    SDValue MinLHS = Op1.getOperand(0);
    SDValue MinRHS = Op1.getOperand(1);
    if (MinLHS == Op0)
      SubusRHS = MinRHS;
    else if (MinRHS == Op0)
      SubusRHS = MinLHS;
    else
      return SDValue();
  } else
    return SDValue();

  // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
  // special preprocessing in some cases.
  if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
    return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);

  // The special preprocessing case can only be applied if the value was
  // zero extended from 16 bit, so we require the first 16 bits to be zeros
  // for 32 bit values, or the first 48 bits for 64 bit values.
  KnownBits Known;
  DAG.computeKnownBits(SubusLHS, Known);
  unsigned NumZeros = Known.countMinLeadingZeros();
  if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
    return SDValue();

  EVT ExtType = SubusLHS.getValueType();
  EVT ShrinkedType;
  if (VT == MVT::v8i32 || VT == MVT::v8i64)
    ShrinkedType = MVT::v8i16;
  else
    ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;

  // If SubusLHS is zero extended, truncate SubusRHS to its size:
  // SubusRHS = umin(0xFFF.., SubusRHS).
  SDValue SaturationConst =
      DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
                                           ShrinkedType.getScalarSizeInBits()),
                      SDLoc(SubusLHS), ExtType);
  SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
                             SaturationConst);
  SDValue NewSubusLHS =
      DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
  SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
  SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
                               NewSubusLHS, NewSubusRHS);
  // Zero extend the result; it may be used somewhere as 32 bit. If not, the
  // zext and a following trunc will shrink it.
  return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
}

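// The matched identities: umax(a, b) - b == a -usat b, and likewise
// a - umin(a, b) == a -usat b. So e.g. (sub (umax %a, %b), %b) on v8i16
// selects to a single PSUBUSW instead of a compare/select plus subtract.
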
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  // Try to create PSUBUS if SUB's argument is max/min.
  if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext x))) -> (vzext x)
  // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext. This
    // is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of the
    // inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}
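
/// Simplify AVX-512 TESTM nodes. TESTM sets mask bit i when (Op0 & Op1) is
/// nonzero in element i, so e.g.
///   (X86ISD::TESTM (and %a, %b), (and %a, %b))
/// tests exactly the same bits as the simpler
///   (X86ISD::TESTM %a, %b),
/// and a TESTM with a known all-zeros operand always produces a zero mask.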
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
                            const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // TEST (AND a, b), (AND a, b) -> TEST a, b
  if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
    return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
                       Op0->getOperand(1));

  // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
  // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
  if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
      ISD::isBuildVectorAllZeros(Op1.getNode()))
    return getZeroVector(VT, Subtarget, DAG, DL);

  return SDValue();
}
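
/// Fold vector integer compares whose operands are identical: equality of a
/// value with itself is trivially true in every lane, and signed greater-than
/// of a value with itself is trivially false, e.g.
///   (X86ISD::PCMPEQ %x, %x) --> all-ones vector
///   (X86ISD::PCMPGT %x, %x) --> all-zeros vector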
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}
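
/// Simplify INSERT_SUBVECTOR nodes, e.g. by folding an insert of zeros into
/// zeros away entirely, turning an insert of an extract into a shuffle, or
/// merging two consecutive half-width subvector loads into one full-width
/// load or subvector broadcast.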
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);

  // Early out for mask vectors.
  if (OpVT.getVectorElementType() == MVT::i1)
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);

  unsigned IdxVal = N->getConstantOperandVal(2);
  MVT SubVecVT = SubVec.getSimpleValueType();

  if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
    // Inserting zeros into zeros is a nop.
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      return Vec;

    // If we're inserting into a zero vector and then into a larger zero vector,
    // just insert into the larger zero vector directly.
    if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
        ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
      unsigned Idx2Val = SubVec.getConstantOperandVal(2);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
                         SubVec.getOperand(1),
                         DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
    }

    // If we're inserting a bitcast into zeros, rewrite the insert and move the
    // bitcast to the other side. This helps with detecting zero extending
    // during isel.
    // TODO: Is this useful for other indices than 0?
    if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
      MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
      unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
      MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
      SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
                                   DAG.getBitcast(NewVT, Vec),
                                   SubVec.getOperand(0), N->getOperand(2));
      return DAG.getBitcast(OpVT, Insert);
    }
  }

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subregister operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = SubVec.getConstantOperandVal(1);
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
                                                    false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);

      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));

      // If we're inserting all zeros into the upper half, change this to
      // an insert into an all zeros vector. We will match this to a move
      // with implicit upper bit zeroing during isel.
      if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
                           getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
                           Vec.getOperand(2));

      // If we are inserting into both halves of the vector, the starting
      // vector should be undef. If it isn't, make it so. Only do this if the
      // early insert has no other uses.
      // TODO: Should this be a generic DAG combine?
      if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
        Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
                          SubVec2, Vec.getOperand(2));
        DCI.AddToWorklist(Vec.getNode());
        return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
                           N->getOperand(2));
      }
    }
  }

  return SDValue();
}
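
/// Simplify EXTRACT_SUBVECTOR: extracting from a constant all-zeros or
/// all-ones vector just produces a narrower constant of the same kind, and
/// extracting from a BUILD_VECTOR keeps only the referenced operands.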
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  MVT OpVT = N->getSimpleValueType(0);
  SDValue InVec = N->getOperand(0);
  unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();

  if (ISD::isBuildVectorAllZeros(InVec.getNode()))
    return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));

  if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
    if (OpVT.getScalarType() == MVT::i1)
      return DAG.getConstant(1, SDLoc(N), OpVT);
    return getOnesVector(OpVT, DAG, SDLoc(N));
  }

  if (InVec.getOpcode() == ISD::BUILD_VECTOR)
    return DAG.getBuildVector(
        OpVT, SDLoc(N),
        InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));

  return SDValue();
}
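
/// Top-level dispatcher for the X86 DAG combines: route the node to the
/// combine routine matching its opcode. Returning an empty SDValue leaves
/// the node unchanged.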
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::EXTRACT_SUBVECTOR:
    return combineExtractSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD: return combineAdd(N, DAG, Subtarget);
  case ISD::SUB: return combineSub(N, DAG, Subtarget);
  case X86ISD::SBB: return combineSBB(N, DAG);
  case X86ISD::ADC: return combineADC(N, DAG, DCI);
  case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE: return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX: return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT: return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP: // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VBROADCAST:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case X86ISD::FMADDS1:
  case X86ISD::FMADDS3:
  case X86ISD::FMADD4S:
  case ISD::FMA: return combineFMA(N, DAG, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:
  case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
  case X86ISD::MGATHER:
  case X86ISD::MSCATTER:
  case ISD::MGATHER:
  case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
  case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}
/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
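/// For example, a 16-bit arithmetic instruction needs the 0x66 operand-size
/// prefix in 32/64-bit mode, and 16-bit partial-register writes can stall on
/// some microarchitectures, so these operations are usually better done at
/// i32.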
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}
/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}
/// This method queries the target whether it is beneficial for the DAG
/// combiner to promote the specified node. If true, it should return the
/// desired promotion type by reference.
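/// For example, (i16 (add %x, %y)) is reported as desirable to promote, with
/// PVT set to MVT::i32, so the combiner can rewrite it as a 32-bit add and
/// truncate the result.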
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}
bool X86TargetLowering::
    isDesirableToCombineBuildVectorToShuffleTruncate(
        ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {

  assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
         "Element count mismatch");
  assert(
      Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
      "Shuffle Mask expected to be legal");

  // For 32-bit elements VPERMD is better than shuffle + truncate.
  // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
  if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
    return false;

  if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
    return false;

  return true;
}
//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return true;
}
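
// For example, matchAsm("bswap $0", {"bswap", "$0"}) succeeds, while
// matchAsm("bswapl $0", {"bswap", "$0"}) fails because "bswap" only matches
// a prefix of the first token.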
static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {

  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {

      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}
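
/// For example, inline assembly such as
///   %r = call i32 asm "bswap $0", "=r,0"(i32 %x)
/// is replaced with a call to the llvm.bswap.i32 intrinsic, which gives the
/// optimizer and instruction selector full freedom over the byte swap.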
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
    case 'k': // AVX512 masking registers.
      return C_RegisterClass;
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  }
  else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'z':
      case '0':
        return C_Register;
      case 'i':
      case 'm':
      case 'k':
      case 't':
      case '2':
        return C_RegisterClass;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
  X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y': {
    unsigned Size = StringRef(constraint).size();
    // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
    char NextChar = Size == 2 ? constraint[1] : 'i';
    if (Size > 2)
      break;
    switch (NextChar) {
      default:
        weight = CW_Invalid;
        break;
      case 'z':
      case '0':
        if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
          return CW_SpecificReg;
        break;
      // Conditional OpMask regs (AVX512)
      case 'k':
        if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
          return CW_Register;
        break;
      // Any MMX reg
      case 'm':
        if (type->isX86_MMXTy() && Subtarget.hasMMX())
          return weight;
        break;
      // Any SSE reg when ISA >= SSE2, same as 'Y'
      case 'i':
      case 't':
      case '2':
        if (!Subtarget.hasSSE2())
          return CW_Invalid;
        break;
    }
    // Fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  }
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
      weight = CW_Register;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}
/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}
/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
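/// For example, the 'I' constraint only accepts constants in [0, 31], the
/// legal range for a 32-bit shift amount, so any other operand is rejected
/// and nothing is appended to Ops.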
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global (with
    // an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}
std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q': // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r': // GENERAL_REGS
    case 'l': // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R': // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f': // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y': // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y': // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'i':
    case 't':
    case '2':
      return getRegForInlineAsmConstraint(TRI, "Y", VT);
    case 'm':
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'z':
    case '0':
      if (!Subtarget.hasSSE1()) break;
      return std::make_pair(X86::XMM0, &X86::VR128RegClass);
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res; // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      bool is64Bit = Subtarget.is64Bit();
      const TargetRegisterClass *RC =
          Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
        : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
        : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
        : &X86::GR64RegClass;
      if (RC->contains(DestReg))
        Res = std::make_pair(DestReg, RC);
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error;
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi, %rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having less micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}
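
/// For example, at minimum size a scalar 'udiv i32 %x, 10' is kept as a
/// single divl instruction, which is smaller than the usual expansion into a
/// multiply by a magic constant followed by shifts.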
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively optimizing
  // for code size, we prefer to use a div instruction, as it is usually smaller
  // than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}
void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}
bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
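/// For example, 64-bit MSVC-style targets probe with __chkstk, while 64-bit
/// MinGW targets use ___chkstk_ms; non-Windows targets generally return the
/// empty string and emit no probes.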
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction()->hasFnAttribute("probe-stack"))
    return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}