let hasSideEffects = 0 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"), [], itins.rr, d>,
- Sched<[WriteFShuffle]>;
+ Sched<[WriteFMove]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (ld_frag addr:$src))], itins.rm, d>,
- Sched<[WriteLoad]>;
+ Sched<[WriteFLoad]>;
}
let Predicates = [HasAVX, NoVLX] in {
PD;
}
-let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [WriteFStore], Predicates = [HasAVX, NoVLX] in {
def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
def : InstAlias<"vmovupd\t{$src, $dst|$dst, $src}",
(VMOVUPDYrr_REV VR256L:$dst, VR256H:$src), 0>;
-let SchedRW = [WriteStore] in {
+let SchedRW = [WriteFStore] in {
def MOVAPSmr : PSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore (v4f32 VR128:$src), addr:$dst)],
//===----------------------------------------------------------------------===//
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteStore] in {
let Predicates = [HasAVX, NoVLX] in {
+let SchedRW = [WriteFStore] in {
def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
addr:$dst)],
IIC_SSE_MOVNT>, VEX, VEX_WIG;
-let ExeDomain = SSEPackedInt in
-def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
- (ins i128mem:$dst, VR128:$src),
- "movntdq\t{$src, $dst|$dst, $src}",
- [(alignednontemporalstore (v2i64 VR128:$src),
- addr:$dst)],
- IIC_SSE_MOVNT>, VEX, VEX_WIG;
-
def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
addr:$dst)],
IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-let ExeDomain = SSEPackedInt in
+} // SchedRW
+
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
+def VMOVNTDQmr : VPDI<0xE7, MRMDestMem, (outs),
+ (ins i128mem:$dst, VR128:$src),
+ "movntdq\t{$src, $dst|$dst, $src}",
+ [(alignednontemporalstore (v2i64 VR128:$src),
+ addr:$dst)],
+ IIC_SSE_MOVNT>, VEX, VEX_WIG;
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
-}
+} // ExeDomain, SchedRW
+} // Predicates
+let SchedRW = [WriteFStore] in { // packed-FP non-temporal stores: same class as VMOVNTPSmr
def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f32 VR128:$src), addr:$dst)],
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore(v2f64 VR128:$src), addr:$dst)],
IIC_SSE_MOVNT>;
+} // SchedRW
-let ExeDomain = SSEPackedInt in
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in
def MOVNTDQmr : PDI<0xE7, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v2i64 VR128:$src), addr:$dst)],
IIC_SSE_MOVNT>;
+let SchedRW = [WriteStore] in {
// There is no AVX form for instructions below this point
def MOVNTImr : I<0xC3, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movnti{l}\t{$src, $dst|$dst, $src}",
let ExeDomain = SSEPackedInt in { // SSE integer instructions
-let hasSideEffects = 0, SchedRW = [WriteMove] in {
+let hasSideEffects = 0, SchedRW = [WriteVecMove] in {
def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
VEX, VEX_WIG;
// For Disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
- SchedRW = [WriteMove] in {
+ SchedRW = [WriteVecMove] in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
}
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad], Predicates = [HasAVX,NoVLX] in {
+ hasSideEffects = 0, SchedRW = [WriteVecLoad], Predicates = [HasAVX,NoVLX] in {
def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (alignedloadv2i64 addr:$src))],
XS, VEX, VEX_L, VEX_WIG;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore],
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore],
Predicates = [HasAVX,NoVLX] in {
def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src),
XS, VEX, VEX_L, VEX_WIG;
}
-let SchedRW = [WriteMove] in {
+let SchedRW = [WriteVecMove] in {
let hasSideEffects = 0 in {
def MOVDQArr : PDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>;
} // SchedRW
let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
- hasSideEffects = 0, SchedRW = [WriteLoad] in {
+ hasSideEffects = 0, SchedRW = [WriteVecLoad] in {
def MOVDQArm : PDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[/*(set VR128:$dst, (alignedloadv2i64 addr:$src))*/],
XS, Requires<[UseSSE2]>;
}
-let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteVecStore] in {
def MOVDQAmr : PDI<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}",
[/*(alignedstore (v2i64 VR128:$src), addr:$dst)*/],
// SSE2 - Conditional Store
//===---------------------------------------------------------------------===//
-let ExeDomain = SSEPackedInt, SchedRW = [WriteStore] in {
+let ExeDomain = SSEPackedInt, SchedRW = [WriteVecStore] in {
let Uses = [EDI], Predicates = [HasAVX,Not64BitMode] in
def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
// SSE3 - Move Unaligned Integer
//===---------------------------------------------------------------------===//
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteVecLoad] in {
let Predicates = [HasAVX] in {
def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
"vlddqu\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
VEX, VEX_L, VEX_WIG;
-}
+} // Predicates
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))],
IIC_SSE_LDDQU>;
-}
+} // SchedRW
//===---------------------------------------------------------------------===//
// SSE3 - Arithmetic
}
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let SchedRW = [WriteLoad] in {
+let SchedRW = [WriteVecLoad] in {
let Predicates = [HasAVX, NoVLX] in
def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}", []>,
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, u8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, Sched<[WriteStore]>, VEX, VEX_L;
+ []>, Sched<[WriteFStore]>, VEX, VEX_L;
}
multiclass vextract_lowering<string InstrStr, ValueType From, ValueType To> {
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteFLoad]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteFLoad]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ VEX_4V, Sched<[WriteFStore]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ VEX_4V, VEX_L, Sched<[WriteFStore]>;
}
let ExeDomain = SSEPackedSingle in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, u8imm:$src2),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- Sched<[WriteStore]>, VEX, VEX_L;
+ Sched<[WriteVecStore]>, VEX, VEX_L;
let Predicates = [HasAVX2, NoVLX] in {
defm : vextract_lowering<"VEXTRACTI128", v4i64, v2i64>;
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
+ IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteVecLoad]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
- IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
+ IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteVecLoad]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, Sched<[WriteStore]>;
+ VEX_4V, Sched<[WriteVecStore]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
- VEX_4V, VEX_L, Sched<[WriteStore]>;
+ VEX_4V, VEX_L, Sched<[WriteVecStore]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
defm : BWWriteResPair<WriteJump, BWPort06, 1>;
// Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [BWPort23]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [BWPort237, BWPort4]>;
+def : WriteRes<WriteFMove, [BWPort5]>;
+
defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
+def : WriteRes<WriteVecLoad, [BWPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [BWPort237, BWPort4]>;
+def : WriteRes<WriteVecMove, [BWPort015]>;
+
defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
}
// Scalar and vector floating point.
+def : WriteRes<WriteFStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteFLoad, [HWPort23]> { let Latency = 5; }
+def : WriteRes<WriteFMove, [HWPort5]>;
+
defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
}
// Vector integer operations.
+def : WriteRes<WriteVecStore, [HWPort237, HWPort4]>;
+def : WriteRes<WriteVecLoad, [HWPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecMove, [HWPort015]>;
+
defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
}
// Scalar and vector floating point.
+def : WriteRes<WriteFStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteFLoad, [SBPort23]> { let Latency = 6; }
+def : WriteRes<WriteFMove, [SBPort5]>;
+
defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
}
// Vector integer operations.
+def : WriteRes<WriteVecStore, [SBPort23, SBPort4]>;
+def : WriteRes<WriteVecLoad, [SBPort23]> { let Latency = 6; }
+def : WriteRes<WriteVecMove, [SBPort05]>;
+
defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;
// Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [SKLPort23]> { let Latency = 6; }
+def : WriteRes<WriteFStore, [SKLPort237, SKLPort4]>;
+def : WriteRes<WriteFMove, [SKLPort015]>;
+
defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
+def : WriteRes<WriteVecLoad, [SKLPort23]> { let Latency = 6; }
+def : WriteRes<WriteVecStore, [SKLPort237, SKLPort4]>;
+def : WriteRes<WriteVecMove, [SKLPort015]>;
+
defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;
// Floating point. This covers both scalar and vector operations.
+def : WriteRes<WriteFLoad, [SKXPort23]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [SKXPort237, SKXPort4]>;
+def : WriteRes<WriteFMove, [SKXPort015]>;
+
defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
// Vector integer operations.
+def : WriteRes<WriteVecLoad, [SKXPort23]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [SKXPort237, SKXPort4]>;
+def : WriteRes<WriteVecMove, [SKXPort015]>;
+
defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
defm WriteJump : X86SchedWritePair;
// Floating point. This covers both scalar and vector operations.
+def WriteFLoad : SchedWrite;
+def WriteFStore : SchedWrite;
+def WriteFMove : SchedWrite;
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
defm WriteFDiv : X86SchedWritePair; // Floating point division.
defm WritePHAdd : X86SchedWritePair;
// Vector integer operations.
+def WriteVecLoad : SchedWrite;
+def WriteVecStore : SchedWrite;
+def WriteVecMove : SchedWrite;
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
-// FIXME: Split x86 and SSE load/store/moves
////////////////////////////////////////////////////////////////////////////////
def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
// FIXME: SS vs PS latencies
////////////////////////////////////////////////////////////////////////////////
+def : WriteRes<WriteFLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteFStore, [JSAGU]>;
+def : WriteRes<WriteFMove, [JFPU01]>;
+
defm : JWriteResFpuPair<WriteFAdd, [JFPU0], 3>;
defm : JWriteResFpuPair<WriteFMul, [JFPU1], 2>;
defm : JWriteResFpuPair<WriteFMA, [JFPU1], 2>; // NOTE: Doesn't exist on Jaguar.
// Vector integer operations.
////////////////////////////////////////////////////////////////////////////////
+def : WriteRes<WriteVecLoad, [JLAGU]> { let Latency = 5; }
+def : WriteRes<WriteVecStore, [JSAGU]>;
+def : WriteRes<WriteVecMove, [JFPU01]>;
+
defm : JWriteResFpuPair<WriteVecALU, [JFPU01], 1>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01], 1>;
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0], 2>;
}
// Scalar and vector floating point.
+def : WriteRes<WriteFStore, [FPC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteFLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteFMove, [FPC_RSV01]>;
+
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
}
// Vector integer operations.
+def : WriteRes<WriteVecStore, [FPC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteVecLoad, [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteVecMove, [FPC_RSV01]>;
+
defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
}
// Floating point operations
+def : WriteRes<WriteFStore, [ZnAGU]>;
+def : WriteRes<WriteFMove, [ZnFPU]>;
+def : WriteRes<WriteFLoad, [ZnAGU]> { let Latency = 8; }
+
defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
// Vector integer operations which uses FPU units
+def : WriteRes<WriteVecStore, [ZnAGU]>;
+def : WriteRes<WriteVecMove, [ZnFPU]>;
+def : WriteRes<WriteVecLoad, [ZnAGU]> { let Latency = 8; }
+
defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaskmovpd (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
; ZNVER1-NEXT: vmaskmovpd %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
-; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovapd %xmm2, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <2 x double> @llvm.x86.avx.maskload.pd(i8* %a0, <2 x i64> %a1)
call void @llvm.x86.avx.maskstore.pd(i8* %a0, <2 x i64> %a1, <2 x double> %a2)
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaskmovpd (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
; ZNVER1-NEXT: vmaskmovpd %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovapd %ymm2, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x double> @llvm.x86.avx.maskload.pd.256(i8* %a0, <4 x i64> %a1)
call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %a1, <4 x double> %a2)
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaskmovps (%rdi), %xmm0, %xmm2 # sched: [8:0.50]
; ZNVER1-NEXT: vmaskmovps %xmm1, %xmm0, (%rdi) # sched: [4:0.50]
-; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <4 x float> @llvm.x86.avx.maskload.ps(i8* %a0, <4 x i32> %a1)
call void @llvm.x86.avx.maskstore.ps(i8* %a0, <4 x i32> %a1, <4 x float> %a2)
; ZNVER1: # %bb.0:
; ZNVER1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2 # sched: [8:1.00]
; ZNVER1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi) # sched: [5:1.00]
-; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovaps %ymm2, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = call <8 x float> @llvm.x86.avx.maskload.ps.256(i8* %a0, <8 x i32> %a1)
call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %a1, <8 x float> %a2)
define <4 x i64> @test_movntdqa(i8* %a0) {
; GENERIC-LABEL: test_movntdqa:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [4:0.50]
+; GENERIC-NEXT: vmovntdqa (%rdi), %ymm0 # sched: [6:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_movntdqa:
define <4 x i32> @test_pmaskmovd(i8* %a0, <4 x i32> %a1, <4 x i32> %a2) {
; GENERIC-LABEL: test_pmaskmovd:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovd (%rdi), %xmm0, %xmm2 # sched: [6:0.50]
; GENERIC-NEXT: vpmaskmovd %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
define <8 x i32> @test_pmaskmovd_ymm(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
; GENERIC-LABEL: test_pmaskmovd_ymm:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovd (%rdi), %ymm0, %ymm2 # sched: [6:0.50]
; GENERIC-NEXT: vpmaskmovd %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
define <2 x i64> @test_pmaskmovq(i8* %a0, <2 x i64> %a1, <2 x i64> %a2) {
; GENERIC-LABEL: test_pmaskmovq:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovq (%rdi), %xmm0, %xmm2 # sched: [6:0.50]
; GENERIC-NEXT: vpmaskmovq %xmm1, %xmm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %xmm2, %xmm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
define <4 x i64> @test_pmaskmovq_ymm(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
; GENERIC-LABEL: test_pmaskmovq_ymm:
; GENERIC: # %bb.0:
-; GENERIC-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [4:0.50]
+; GENERIC-NEXT: vpmaskmovq (%rdi), %ymm0, %ymm2 # sched: [6:0.50]
; GENERIC-NEXT: vpmaskmovq %ymm1, %ymm0, (%rdi) # sched: [1:1.00]
; GENERIC-NEXT: vmovdqa %ymm2, %ymm0 # sched: [1:0.50]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; GOLDMONT-LABEL: test_sha256rnds2:
; GOLDMONT: # %bb.0:
-; GOLDMONT-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
-; GOLDMONT-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT: movaps %xmm0, %xmm3 # sched: [1:0.50]
+; GOLDMONT-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50]
; GOLDMONT-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
; GOLDMONT-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [7:1.00]
-; GOLDMONT-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; GOLDMONT-NEXT: movaps %xmm3, %xmm0 # sched: [1:0.50]
; GOLDMONT-NEXT: retq # sched: [4:1.00]
;
; CANNONLAKE-LABEL: test_sha256rnds2:
;
; ZNVER1-LABEL: test_sha256rnds2:
; ZNVER1: # %bb.0:
-; ZNVER1-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.50]
-; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovaps %xmm0, %xmm3 # sched: [1:0.25]
+; ZNVER1-NEXT: vmovaps %xmm2, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: sha256rnds2 %xmm0, %xmm1, %xmm3 # sched: [4:1.00]
; ZNVER1-NEXT: sha256rnds2 %xmm0, (%rdi), %xmm3 # sched: [11:1.00]
-; ZNVER1-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.50]
+; ZNVER1-NEXT: vmovaps %xmm3, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [1:0.50]
%1 = load <4 x i32>, <4 x i32>* %a3
%2 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
; SLM-NEXT: rcpps (%rdi), %xmm1 # sched: [8:1.00]
; SLM-NEXT: rcpps %xmm0, %xmm0 # sched: [5:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rcpps:
; SLM-NEXT: rsqrtps (%rdi), %xmm1 # sched: [8:1.00]
; SLM-NEXT: rsqrtps %xmm0, %xmm0 # sched: [5:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_rsqrtps:
; SLM-NEXT: sqrtps (%rdi), %xmm1 # sched: [18:1.00]
; SLM-NEXT: sqrtps %xmm0, %xmm0 # sched: [15:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtps:
; SLM-LABEL: test_movsd_reg:
; SLM: # %bb.0:
; SLM-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] sched: [1:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movsd_reg:
; SLM-NEXT: sqrtpd (%rdi), %xmm1 # sched: [18:1.00]
; SLM-NEXT: sqrtpd %xmm0, %xmm0 # sched: [15:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_sqrtpd:
; SLM-LABEL: test_unpcklpd:
; SLM: # %bb.0:
; SLM-NEXT: unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:1.00]
-; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm0, %xmm1 # sched: [1:0.50]
; SLM-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [4:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_unpcklpd:
; SLM-NEXT: movddup {{.*#+}} xmm1 = mem[0,0] sched: [4:1.00]
; SLM-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:1.00]
; SLM-NEXT: subpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movddup:
; SLM-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [4:1.00]
; SLM-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movshdup:
; SLM-NEXT: movsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [4:1.00]
; SLM-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_movsldup:
;
; SLM-LABEL: test_blendvpd:
; SLM: # %bb.0:
-; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:1.00]
-; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm0, %xmm3 # sched: [1:0.50]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50]
; SLM-NEXT: blendvpd %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
; SLM-NEXT: blendvpd %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
-; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm3, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendvpd:
;
; SLM-LABEL: test_blendvps:
; SLM: # %bb.0:
-; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:1.00]
-; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm0, %xmm3 # sched: [1:0.50]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50]
; SLM-NEXT: blendvps %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
; SLM-NEXT: blendvps %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
-; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm3, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_blendvps:
; SLM-LABEL: test_pblendvb:
; SLM: # %bb.0:
; SLM-NEXT: movdqa %xmm0, %xmm3 # sched: [1:0.50]
-; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm2, %xmm0 # sched: [1:0.50]
; SLM-NEXT: pblendvb %xmm0, %xmm1, %xmm3 # sched: [1:1.00]
; SLM-NEXT: pblendvb %xmm0, (%rdi), %xmm3 # sched: [4:1.00]
; SLM-NEXT: movdqa %xmm3, %xmm0 # sched: [1:0.50]
; SLM-NEXT: roundpd $7, (%rdi), %xmm1 # sched: [6:1.00]
; SLM-NEXT: roundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundpd:
; SLM-NEXT: roundps $7, (%rdi), %xmm1 # sched: [6:1.00]
; SLM-NEXT: roundps $7, %xmm0, %xmm0 # sched: [3:1.00]
; SLM-NEXT: addps %xmm0, %xmm1 # sched: [3:1.00]
-; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm1, %xmm0 # sched: [1:0.50]
; SLM-NEXT: retq # sched: [4:1.00]
;
; SANDY-LABEL: test_roundps:
;
; SLM-LABEL: test_roundsd:
; SLM: # %bb.0:
-; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT: movapd %xmm0, %xmm2 # sched: [1:0.50]
; SLM-NEXT: roundsd $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: roundsd $7, %xmm1, %xmm2 # sched: [3:1.00]
; SLM-NEXT: addpd %xmm2, %xmm0 # sched: [3:1.00]
;
; SLM-LABEL: test_roundss:
; SLM: # %bb.0:
-; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:1.00]
+; SLM-NEXT: movaps %xmm0, %xmm2 # sched: [1:0.50]
; SLM-NEXT: roundss $7, (%rdi), %xmm0 # sched: [6:1.00]
; SLM-NEXT: roundss $7, %xmm1, %xmm2 # sched: [3:1.00]
; SLM-NEXT: addps %xmm2, %xmm0 # sched: [3:1.00]