let BufferSize=60;
}
+// Integer divider unit, consumed together with port 0 by WriteIDiv.
+def BWDivider : ProcResource<1>; // Integer division issued on port 0.
+
// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
def : ReadAdvance<ReadAfterLd, 5>;
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
+ // SchedRW  - foldable write; SchedRW.Folded names its memory variant.
+ // ExePorts - resources used by the register variant.
+ // Lat      - latency of the register variant.
+ // Res      - ResourceCycles for ExePorts, in the same order.
+ //            NOTE(review): the default [1] has a single entry; pass Res
+ //            explicitly when ExePorts lists more than one resource.
+ // UOps     - NumMicroOps, applied unchanged to both variants.
-// Register variant is using a single cycle on ExePort.
+ // Register variant spends Res cycles on ExePorts.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([BWPort23], ExePorts)> {
+ // BWPort23 and its single cycle are both prepended, so the resource and
+ // cycle lists stay aligned. UOps is reused as-is: the folded load adds no
+ // extra micro-op here.
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteRMW, [BWPort4]>;
// Arithmetic.
-defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op.
-defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteALU, [BWPort0156], 1>; // Simple integer ALU op.
+defm : BWWriteResPair<WriteIMul, [BWPort1], 3>; // Integer multiplication.
+defm : BWWriteResPair<WriteIDiv, [BWPort0, BWDivider], 25, [1, 10]>; // Integer division.
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
-def BWDivider : ProcResource<1>; // Integer division issued on port 0.
-def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division.
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
// Integer shifts and rotates.
-defm : BWWriteResPair<WriteShift, BWPort06, 1>;
+defm : BWWriteResPair<WriteShift, [BWPort06], 1>;
// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; }
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : BWWriteResPair<WriteJump, BWPort06, 1>;
+defm : BWWriteResPair<WriteJump, [BWPort06], 1>;
// Floating point. This covers both scalar and vector operations.
def : WriteRes<WriteFLoad, [BWPort23]> { let Latency = 5; }
def : WriteRes<WriteFStore, [BWPort237, BWPort4]>;
def : WriteRes<WriteFMove, [BWPort5]>;
-defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
-defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
-defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root.
-defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate.
-defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate.
-defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add.
-defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles.
-defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : BWWriteResPair<WriteFAdd, [BWPort1], 3>; // Floating point add/sub/compare.
+defm : BWWriteResPair<WriteFMul, [BWPort0], 5>; // Floating point multiplication.
+defm : BWWriteResPair<WriteFDiv, [BWPort0], 12>; // 10-14 cycles. // Floating point division.
+defm : BWWriteResPair<WriteFSqrt, [BWPort0], 15>; // Floating point square root.
+defm : BWWriteResPair<WriteFRcp, [BWPort0], 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRsqrt, [BWPort0], 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFMA, [BWPort01], 5>; // Fused Multiply Add.
+defm : BWWriteResPair<WriteFShuffle, [BWPort5], 1>; // Floating point vector shuffles.
+defm : BWWriteResPair<WriteFBlend, [BWPort015], 1>; // Floating point vector blends.
+defm : BWWriteResPair<WriteFVarBlend, [BWPort5], 2, [2]>; // Fp vector variable blends.
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
def : WriteRes<WriteVecStore, [BWPort237, BWPort4]>;
def : WriteRes<WriteVecMove, [BWPort015]>;
-defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
-defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
-defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
-defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles.
-defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
-}
+defm : BWWriteResPair<WriteVecALU, [BWPort15], 1>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecShift, [BWPort0], 1>; // Vector integer shifts.
+defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteShuffle, [BWPort5], 1>; // Vector shuffles.
+defm : BWWriteResPair<WriteBlend, [BWPort15], 1>; // Vector blends.
+defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2]>; // Vector variable blends.
+defm : BWWriteResPair<WriteMPSAD, [BWPort0, BWPort5], 6, [1, 2]>; // Vector MPSAD.
// Vector bitwise operations.
// These are often used on both floating point and integer vectors.
-defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor.
+defm : BWWriteResPair<WriteVecLogic, [BWPort015], 1>; // Vector and/or/xor.
// Conversion between integer and float.
-defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer.
-defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float.
-defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion.
+defm : BWWriteResPair<WriteCvtF2I, [BWPort1], 3>; // Float -> Integer.
+defm : BWWriteResPair<WriteCvtI2F, [BWPort1], 4>; // Integer -> Float.
+defm : BWWriteResPair<WriteCvtF2F, [BWPort1], 3>; // Float -> Float size conversion.
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
}
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> {
- let Latency = 7;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteCLMul, [BWPort0, BWPort5], 7, [2, 1]>;
// Catch-all for expensive system instructions.
def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : BWWriteResPair<WriteFShuffle256, [BWPort5], 3>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteShuffle256, [BWPort5], 3>; // 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteVarVecShift, [BWPort0, BWPort5], 2, [2, 1]>; // Variable vector shifts.
// Old microcoded instructions that nobody use.
def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [BWPort1]> {
- let Latency = 3;
-}
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [BWPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : BWWriteResPair<WriteFHAdd, [BWPort1], 3>; // HADD, HSUB PS/PD.
+defm : BWWriteResPair<WritePHAdd, [BWPort15], 1>; // PHADD|PHSUB (S) W/D.
// Remaining instrs.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
+ // SchedRW  - foldable write; SchedRW.Folded names its memory variant.
+ // ExePorts - resources used by the register variant.
+ // Lat      - latency of the register variant.
+ // Res      - ResourceCycles for ExePorts, in the same order.
+ //            NOTE(review): the default [1] has a single entry; pass Res
+ //            explicitly when ExePorts lists more than one resource.
+ // UOps     - NumMicroOps, applied unchanged to both variants.
-// Register variant is using a single cycle on ExePort.
+ // Register variant spends Res cycles on ExePorts.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([HWPort23], ExePorts)> {
+ // HWPort23 and its single cycle are both prepended, so the resource and
+ // cycle lists stay aligned. UOps is reused as-is: the folded load adds no
+ // extra micro-op here.
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteMove, [HWPort0156]>;
def : WriteRes<WriteZero, []>;
-defm : HWWriteResPair<WriteALU, HWPort0156, 1>;
-defm : HWWriteResPair<WriteIMul, HWPort1, 3>;
+defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
+defm : HWWriteResPair<WriteIMul, [HWPort1], 3>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : HWWriteResPair<WriteShift, HWPort06, 1>;
-defm : HWWriteResPair<WriteJump, HWPort06, 1>;
+defm : HWWriteResPair<WriteShift, [HWPort06], 1>;
+defm : HWWriteResPair<WriteJump, [HWPort06], 1>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
def : WriteRes<WriteFLoad, [HWPort23]> { let Latency = 5; }
def : WriteRes<WriteFMove, [HWPort5]>;
-defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
-defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
-defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
-defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
-defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
-defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
-defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
-defm : HWWriteResPair<WriteFMA, HWPort01, 5>;
-defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
-defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteFVarBlend, [HWPort5]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : HWWriteResPair<WriteFAdd, [HWPort1], 3>;
+defm : HWWriteResPair<WriteFMul, [HWPort0], 5>;
+defm : HWWriteResPair<WriteFDiv, [HWPort0], 12>; // 10-14 cycles.
+defm : HWWriteResPair<WriteFRcp, [HWPort0], 5>;
+defm : HWWriteResPair<WriteFRsqrt, [HWPort0], 5>;
+defm : HWWriteResPair<WriteFSqrt, [HWPort0], 15>;
+defm : HWWriteResPair<WriteCvtF2I, [HWPort1], 3>;
+defm : HWWriteResPair<WriteCvtI2F, [HWPort1], 4>;
+defm : HWWriteResPair<WriteCvtF2F, [HWPort1], 3>;
+defm : HWWriteResPair<WriteFMA, [HWPort01], 5>;
+defm : HWWriteResPair<WriteFShuffle, [HWPort5], 1>;
+defm : HWWriteResPair<WriteFBlend, [HWPort015], 1>;
+defm : HWWriteResPair<WriteFShuffle256, [HWPort5], 3>;
+defm : HWWriteResPair<WriteFVarBlend, [HWPort5], 2, [2]>;
// Vector integer operations.
def : WriteRes<WriteVecStore, [HWPort237, HWPort4]>;
def : WriteRes<WriteVecLoad, [HWPort23]> { let Latency = 5; }
def : WriteRes<WriteVecMove, [HWPort015]>;
-defm : HWWriteResPair<WriteVecShift, HWPort0, 1>;
-defm : HWWriteResPair<WriteVecLogic, HWPort015, 1>;
-defm : HWWriteResPair<WriteVecALU, HWPort15, 1>;
-defm : HWWriteResPair<WriteVecIMul, HWPort0, 5>;
-defm : HWWriteResPair<WriteShuffle, HWPort5, 1>;
-defm : HWWriteResPair<WriteBlend, HWPort15, 1>;
-defm : HWWriteResPair<WriteShuffle256, HWPort5, 3>;
-
-def : WriteRes<WriteVarBlend, [HWPort5]> {
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteVarBlendLd, [HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def : WriteRes<WriteVarVecShift, [HWPort0, HWPort5]> {
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [HWPort0, HWPort5, HWPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
-
-def : WriteRes<WriteMPSAD, [HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteMPSADLd, [HWPort23, HWPort0, HWPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
-}
+defm : HWWriteResPair<WriteVecShift, [HWPort0], 1>;
+defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1>;
+defm : HWWriteResPair<WriteVecALU, [HWPort15], 1>;
+defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5>;
+defm : HWWriteResPair<WriteShuffle, [HWPort5], 1>;
+defm : HWWriteResPair<WriteBlend, [HWPort15], 1>;
+defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3>;
+defm : HWWriteResPair<WriteVarBlend, [HWPort5], 2, [2]>;
+defm : HWWriteResPair<WriteVarVecShift, [HWPort0, HWPort5], 2, [2, 1]>;
+defm : HWWriteResPair<WriteMPSAD, [HWPort0, HWPort5], 6, [1, 2]>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
-}
+defm : HWWriteResPair<WriteFHAdd, [HWPort1, HWPort5], 5, [1, 2], 3>; // HADD, HSUB PS/PD.
+defm : HWWriteResPair<WritePHAdd, [HWPort1, HWPort5], 3, [1, 2], 3>; // PHADD|PHSUB (S) W/D.
//=== Floating Point XMM and YMM Instructions ===//
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SBWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
+ // SchedRW  - foldable write; SchedRW.Folded names its memory variant.
+ // ExePorts - resources used by the register variant.
+ // Lat      - latency of the register variant.
+ // Res      - ResourceCycles for ExePorts, in the same order.
+ //            NOTE(review): the default [1] has a single entry; pass Res
+ //            explicitly when ExePorts lists more than one resource.
+ // UOps     - NumMicroOps, applied unchanged to both variants.
-// Register variant is using a single cycle on ExePort.
+ // Register variant spends Res cycles on ExePorts.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [SBPort23, ExePort]> {
- let Latency = !add(Lat, 4);
+ def : WriteRes<SchedRW.Folded, !listconcat([SBPort23], ExePorts)> {
+ // SBPort23 and its single cycle are both prepended, so the resource and
+ // cycle lists stay aligned. UOps is reused as-is: the folded load adds no
+ // extra micro-op here.
+ let Latency = !add(Lat, 4);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteMove, [SBPort015]>;
def : WriteRes<WriteZero, []>;
-defm : SBWriteResPair<WriteALU, SBPort015, 1>;
-defm : SBWriteResPair<WriteIMul, SBPort1, 3>;
+defm : SBWriteResPair<WriteALU, [SBPort015], 1>;
+defm : SBWriteResPair<WriteIMul, [SBPort1], 3>;
+// This is quite rough, latency depends on the dividend.
+defm : SBWriteResPair<WriteIDiv, [SBPort0, SBDivider], 25, [1, 10]>;
def : WriteRes<WriteIMulH, []> { let Latency = 3; }
-defm : SBWriteResPair<WriteShift, SBPort05, 1>;
-defm : SBWriteResPair<WriteJump, SBPort5, 1>;
+
+defm : SBWriteResPair<WriteShift, [SBPort05], 1>;
+defm : SBWriteResPair<WriteJump, [SBPort5], 1>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
// the port to read all inputs. We don't model that.
def : WriteRes<WriteLEA, [SBPort15]>;
-// This is quite rough, latency depends on the dividend.
-def : WriteRes<WriteIDiv, [SBPort0, SBDivider]> {
- let Latency = 25;
- let ResourceCycles = [1, 10];
-}
-def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
- let Latency = 29;
- let ResourceCycles = [1, 1, 10];
-}
-
// Scalar and vector floating point.
def : WriteRes<WriteFStore, [SBPort23, SBPort4]>;
def : WriteRes<WriteFLoad, [SBPort23]> { let Latency = 6; }
def : WriteRes<WriteFMove, [SBPort5]>;
-defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
-defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
-defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
-defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
-defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>;
-defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
-defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
-defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
-defm : SBWriteResPair<WriteFShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteFBlend, SBPort05, 1>;
-def : WriteRes<WriteFVarBlend, [SBPort0, SBPort5]> {
- let Latency = 2;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
-}
+defm : SBWriteResPair<WriteFAdd, [SBPort1], 3>;
+defm : SBWriteResPair<WriteFMul, [SBPort0], 5>;
+defm : SBWriteResPair<WriteFDiv, [SBPort0], 24>;
+defm : SBWriteResPair<WriteFRcp, [SBPort0], 5>;
+defm : SBWriteResPair<WriteFRsqrt, [SBPort0], 5>;
+defm : SBWriteResPair<WriteFSqrt, [SBPort0], 14>;
+defm : SBWriteResPair<WriteCvtF2I, [SBPort1], 3>;
+defm : SBWriteResPair<WriteCvtI2F, [SBPort1], 4>;
+defm : SBWriteResPair<WriteCvtF2F, [SBPort1], 3>;
+defm : SBWriteResPair<WriteFShuffle, [SBPort5], 1>;
+defm : SBWriteResPair<WriteFBlend, [SBPort05], 1>;
+// Fp vector variable blends: one cycle on each port. Res is spelled out so it
+// has one entry per listed port instead of relying on the one-entry default.
+defm : SBWriteResPair<WriteFVarBlend, [SBPort0, SBPort5], 2, [1, 1]>;
// Vector integer operations.
def : WriteRes<WriteVecStore, [SBPort23, SBPort4]>;
def : WriteRes<WriteVecLoad, [SBPort23]> { let Latency = 6; }
def : WriteRes<WriteVecMove, [SBPort05]>;
-defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
-defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
-defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteShuffle, SBPort5, 1>;
-defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
-def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
- let Latency = 2;
- let ResourceCycles = [1, 1];
-}
-def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
-}
-def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> {
- let Latency = 5;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [1,1,2];
-}
+defm : SBWriteResPair<WriteVecShift, [SBPort5], 1>;
+defm : SBWriteResPair<WriteVecLogic, [SBPort5], 1>;
+defm : SBWriteResPair<WriteVecALU, [SBPort1], 3>;
+defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5>;
+defm : SBWriteResPair<WriteShuffle, [SBPort5], 1>;
+defm : SBWriteResPair<WriteBlend, [SBPort15], 1>;
+// Vector variable blends: one cycle on each port. Res is spelled out so it
+// has one entry per listed port instead of relying on the one-entry default.
+defm : SBWriteResPair<WriteVarBlend, [SBPort1, SBPort5], 2, [1, 1]>;
+defm : SBWriteResPair<WriteMPSAD, [SBPort0, SBPort15], 5, [1,2], 3>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SBPort1]> {
- let Latency = 3;
-}
-
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SBPort1, SBPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SBPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SBPort15, SBPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SBWriteResPair<WriteFHAdd, [SBPort1], 3>; // HADD, HSUB PS/PD.
+defm : SBWriteResPair<WritePHAdd, [SBPort15], 1>; // PHADD|PHSUB (S) W/D.
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
// AVX2/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
-defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
-defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
-defm : SBWriteResPair<WriteFMA, SBPort01, 5>;
+defm : SBWriteResPair<WriteFShuffle256, [SBPort0], 1>;
+defm : SBWriteResPair<WriteShuffle256, [SBPort0], 1>;
+defm : SBWriteResPair<WriteVarVecShift, [SBPort0], 1>;
+defm : SBWriteResPair<WriteFMA, [SBPort01], 5>;
// Remaining SNB instrs.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
+ // SchedRW  - foldable write; SchedRW.Folded names its memory variant.
+ // ExePorts - resources used by the register variant.
+ // Lat      - latency of the register variant.
+ // Res      - ResourceCycles for ExePorts, in the same order.
+ //            NOTE(review): the default [1] has a single entry; pass Res
+ //            explicitly when ExePorts lists more than one resource.
+ // UOps     - NumMicroOps, applied unchanged to both variants.
-// Register variant is using a single cycle on ExePort.
+ // Register variant spends Res cycles on ExePorts.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([SKLPort23], ExePorts)> {
+ // SKLPort23 and its single cycle are both prepended, so the resource and
+ // cycle lists stay aligned. UOps is reused as-is: the folded load adds no
+ // extra micro-op here.
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteRMW, [SKLPort4]>;
// Arithmetic.
-defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op.
-defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication.
+defm : SKLWriteResPair<WriteALU, [SKLPort0156], 1>; // Simple integer ALU op.
+defm : SKLWriteResPair<WriteIMul, [SKLPort1], 3>; // Integer multiplication.
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division.
def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
// Integer shifts and rotates.
-defm : SKLWriteResPair<WriteShift, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteShift, [SKLPort06], 1>;
// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; }
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;
+defm : SKLWriteResPair<WriteJump, [SKLPort06], 1>;
// Floating point. This covers both scalar and vector operations.
def : WriteRes<WriteFLoad, [SKLPort23]> { let Latency = 6; }
def : WriteRes<WriteFStore, [SKLPort237, SKLPort4]>;
def : WriteRes<WriteFMove, [SKLPort015]>;
-defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
-defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
-defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root.
-defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate.
-defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add.
-defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles.
-defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : SKLWriteResPair<WriteFAdd, [SKLPort1], 3>; // Floating point add/sub/compare.
+defm : SKLWriteResPair<WriteFMul, [SKLPort0], 5>; // Floating point multiplication.
+defm : SKLWriteResPair<WriteFDiv, [SKLPort0], 12>; // 10-14 cycles. // Floating point division.
+defm : SKLWriteResPair<WriteFSqrt, [SKLPort0], 15>; // Floating point square root.
+defm : SKLWriteResPair<WriteFRcp, [SKLPort0], 5>; // Floating point reciprocal estimate.
+defm : SKLWriteResPair<WriteFRsqrt, [SKLPort0], 5>; // Floating point reciprocal square root estimate.
+defm : SKLWriteResPair<WriteFMA, [SKLPort01], 4>; // Fused Multiply Add.
+defm : SKLWriteResPair<WriteFShuffle, [SKLPort5], 1>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFBlend, [SKLPort015], 1>; // Floating point vector blends.
+defm : SKLWriteResPair<WriteFVarBlend, [SKLPort5], 2, [2]>; // Fp vector variable blends.
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
def : WriteRes<WriteVecStore, [SKLPort237, SKLPort4]>;
def : WriteRes<WriteVecMove, [SKLPort015]>;
-defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
-defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
-defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles.
-defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
-}
+defm : SKLWriteResPair<WriteVecALU, [SKLPort15], 1>; // Vector integer ALU op, no logicals.
+defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1>; // Vector integer shifts.
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0], 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1>; // Vector shuffles.
+defm : SKLWriteResPair<WriteBlend, [SKLPort15], 1>; // Vector blends.
+defm : SKLWriteResPair<WriteVarBlend, [SKLPort5], 2, [2]>; // Vector variable blends.
+defm : SKLWriteResPair<WriteMPSAD, [SKLPort0, SKLPort5], 6, [1, 2]>; // Vector MPSAD.
// Vector bitwise operations.
// These are often used on both floating point and integer vectors.
-defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor.
+defm : SKLWriteResPair<WriteVecLogic, [SKLPort015], 1>; // Vector and/or/xor.
// Conversion between integer and float.
-defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer.
-defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float.
-defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion.
+defm : SKLWriteResPair<WriteCvtF2I, [SKLPort1], 3>; // Float -> Integer.
+defm : SKLWriteResPair<WriteCvtI2F, [SKLPort1], 4>; // Integer -> Float.
+defm : SKLWriteResPair<WriteCvtF2F, [SKLPort1], 3>; // Float -> Float size conversion.
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKLWriteResPair<WriteFShuffle256, [SKLPort5], 3>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteShuffle256, [SKLPort5], 3>; // 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteVarVecShift, [SKLPort0, SKLPort5], 2, [2, 1]>; // Variable vector shifts.
// Old microcoded instructions that nobody use.
def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKLPort1]> {
- let Latency = 3;
-}
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKLPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKLWriteResPair<WriteFHAdd, [SKLPort1], 3>; // HADD, HSUB PS/PD.
+defm : SKLWriteResPair<WritePHAdd, [SKLPort15], 1>; // PHADD|PHSUB (S) W/D.
// Remaining instrs.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
+ // SchedRW  - foldable write; SchedRW.Folded names its memory variant.
+ // ExePorts - resources used by the register variant.
+ // Lat      - latency of the register variant.
+ // Res      - ResourceCycles for ExePorts, in the same order.
+ //            NOTE(review): the default [1] has a single entry; pass Res
+ //            explicitly when ExePorts lists more than one resource.
+ // UOps     - NumMicroOps, applied unchanged to both variants.
-// Register variant is using a single cycle on ExePort.
+ // Register variant spends Res cycles on ExePorts.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> {
- let Latency = !add(Lat, 5);
+ def : WriteRes<SchedRW.Folded, !listconcat([SKXPort23], ExePorts)> {
+ // SKXPort23 and its single cycle are both prepended, so the resource and
+ // cycle lists stay aligned. UOps is reused as-is: the folded load adds no
+ // extra micro-op here.
+ let Latency = !add(Lat, 5);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteRMW, [SKXPort4]>;
// Arithmetic.
-defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op.
-defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication.
+defm : SKXWriteResPair<WriteALU, [SKXPort0156], 1>; // Simple integer ALU op.
+defm : SKXWriteResPair<WriteIMul, [SKXPort1], 3>; // Integer multiplication.
def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division.
def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
// Integer shifts and rotates.
-defm : SKXWriteResPair<WriteShift, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteShift, [SKXPort06], 1>;
// Loads, stores, and moves, not folded with other operations.
def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; }
// Branches don't produce values, so they have no latency, but they still
// consume resources. Indirect branches can fold loads.
-defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;
+defm : SKXWriteResPair<WriteJump, [SKXPort06], 1>;
// Floating point. This covers both scalar and vector operations.
def : WriteRes<WriteFLoad, [SKXPort23]> { let Latency = 5; }
def : WriteRes<WriteFStore, [SKXPort237, SKXPort4]>;
def : WriteRes<WriteFMove, [SKXPort015]>;
-defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
-defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
-defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division.
-defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root.
-defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate.
-defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate.
-defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add.
-defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles.
-defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends.
-def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
+defm : SKXWriteResPair<WriteFAdd, [SKXPort1], 3>; // Floating point add/sub/compare.
+defm : SKXWriteResPair<WriteFMul, [SKXPort0], 5>; // Floating point multiplication.
+defm : SKXWriteResPair<WriteFDiv, [SKXPort0], 12>; // Floating point division (10-14 cycles).
+defm : SKXWriteResPair<WriteFSqrt, [SKXPort0], 15>; // Floating point square root.
+defm : SKXWriteResPair<WriteFRcp, [SKXPort0], 5>; // Floating point reciprocal estimate.
+defm : SKXWriteResPair<WriteFRsqrt, [SKXPort0], 5>; // Floating point reciprocal square root estimate.
+defm : SKXWriteResPair<WriteFMA, [SKXPort015], 4>; // Fused Multiply Add.
+defm : SKXWriteResPair<WriteFShuffle, [SKXPort5], 1>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFBlend, [SKXPort015], 1>; // Floating point vector blends.
+defm : SKXWriteResPair<WriteFVarBlend, [SKXPort5], 2, [2]>; // Fp vector variable blends.
// FMA Scheduling helper class.
// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
def : WriteRes<WriteVecStore, [SKXPort237, SKXPort4]>;
def : WriteRes<WriteVecMove, [SKXPort015]>;
-defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
-defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
-defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles.
-defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends.
-
-def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends.
- let Latency = 2;
- let ResourceCycles = [2];
-}
-def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD.
- let Latency = 6;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 2];
-}
+defm : SKXWriteResPair<WriteVecALU, [SKXPort15], 1>; // Vector integer ALU op, no logicals.
+defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1>; // Vector integer shifts.
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1>; // Vector shuffles.
+defm : SKXWriteResPair<WriteBlend, [SKXPort15], 1>; // Vector blends.
+defm : SKXWriteResPair<WriteVarBlend, [SKXPort5], 2, [2]>; // Vector variable blends.
+defm : SKXWriteResPair<WriteMPSAD, [SKXPort0, SKXPort5], 6, [1, 2]>; // Vector MPSAD.
// Vector bitwise operations.
// These are often used on both floating point and integer vectors.
-defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor.
+defm : SKXWriteResPair<WriteVecLogic, [SKXPort015], 1>; // Vector and/or/xor.
// Conversion between integer and float.
-defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer.
-defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float.
-defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion.
+defm : SKXWriteResPair<WriteCvtF2I, [SKXPort1], 3>; // Float -> Integer.
+defm : SKXWriteResPair<WriteCvtI2F, [SKXPort1], 4>; // Integer -> Float.
+defm : SKXWriteResPair<WriteCvtF2F, [SKXPort1], 3>; // Float -> Float size conversion.
// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
// AVX2.
-defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles.
-defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles.
-def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts.
- let Latency = 2;
- let ResourceCycles = [2, 1];
-}
-def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> {
- let Latency = 6;
- let ResourceCycles = [2, 1, 1];
-}
+defm : SKXWriteResPair<WriteFShuffle256, [SKXPort5], 3>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteShuffle256, [SKXPort5], 3>; // 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteVarVecShift, [SKXPort0, SKXPort5], 2, [2, 1]>; // Variable vector shifts.
-// Old microcoded instructions that nobody use.
+// Old microcoded instructions that nobody uses.
def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [SKXPort1]> {
- let Latency = 3;
-}
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> {
- let Latency = 7;
- let ResourceCycles = [1, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [SKXPort15]>;
-
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> {
- let Latency = 5;
- let ResourceCycles = [1, 1];
-}
+defm : SKXWriteResPair<WriteFHAdd, [SKXPort1], 3>;
+defm : SKXWriteResPair<WritePHAdd, [SKXPort15], 1>;
// Remaining instrs.
// This multiclass defines the resource usage for variants with and without
// folded loads.
multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
-// Register variant is using a single cycle on ExePort.
+// Register variant uses the ports in ExePorts for the cycle counts in Res.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
// Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
// latency.
- def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
- let Latency = !add(Lat, 3);
+ def : WriteRes<SchedRW.Folded, !listconcat([MEC_RSV], ExePorts)> {
+ let Latency = !add(Lat, 3);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
-defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
-defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
-defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
-defm : SMWriteResPair<WriteJump, IEC_RSV1, 1>;
+defm : SMWriteResPair<WriteALU, [IEC_RSV01], 1>;
+defm : SMWriteResPair<WriteIMul, [IEC_RSV1], 3>;
+defm : SMWriteResPair<WriteShift, [IEC_RSV0], 1>;
+defm : SMWriteResPair<WriteJump, [IEC_RSV1], 1>;
// This is for simple LEAs with one or two input operands.
// The complex ones can only execute on port 1, and they require two cycles on
def : WriteRes<WriteFLoad, [MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteFMove, [FPC_RSV01]>;
-defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
-defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
-defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
-defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
-defm : SMWriteResPair<WriteFShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFBlend, FPC_RSV0, 1>;
-
-// This is quite rough, latency depends on precision
-def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
- let Latency = 5;
- let ResourceCycles = [1, 2];
-}
-def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
- let Latency = 8;
- let ResourceCycles = [1, 1, 2];
-}
-
-def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
- let Latency = 34;
- let ResourceCycles = [1, 34];
-}
-def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
- let Latency = 37;
- let ResourceCycles = [1, 1, 34];
-}
+defm : SMWriteResPair<WriteFAdd, [FPC_RSV1], 3>;
+defm : SMWriteResPair<WriteFMul, [FPC_RSV0, SMFPMultiplier], 5, [1,2]>;
+defm : SMWriteResPair<WriteFDiv, [FPC_RSV0, SMFPDivider], 34, [1,34]>;
+defm : SMWriteResPair<WriteFRcp, [FPC_RSV0], 5>;
+defm : SMWriteResPair<WriteFRsqrt, [FPC_RSV0], 5>;
+defm : SMWriteResPair<WriteFSqrt, [FPC_RSV0], 15>;
+defm : SMWriteResPair<WriteCvtF2I, [FPC_RSV01], 4>;
+defm : SMWriteResPair<WriteCvtI2F, [FPC_RSV01], 4>;
+defm : SMWriteResPair<WriteCvtF2F, [FPC_RSV01], 4>;
+defm : SMWriteResPair<WriteFShuffle, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteFBlend, [FPC_RSV0], 1>;
// Vector integer operations.
def : WriteRes<WriteVecStore, [FPC_RSV01, MEC_RSV]>;
def : WriteRes<WriteVecLoad, [MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteVecMove, [FPC_RSV01]>;
-defm : SMWriteResPair<WriteVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecALU, FPC_RSV01, 1>;
-defm : SMWriteResPair<WriteVecIMul, FPC_RSV0, 4>;
-defm : SMWriteResPair<WriteShuffle, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteMPSAD, FPC_RSV0, 7>;
+defm : SMWriteResPair<WriteVecShift, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteVecLogic, [FPC_RSV01], 1>;
+defm : SMWriteResPair<WriteVecALU, [FPC_RSV01], 1>;
+defm : SMWriteResPair<WriteVecIMul, [FPC_RSV0], 4>;
+defm : SMWriteResPair<WriteShuffle, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteBlend, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteMPSAD, [FPC_RSV0], 7>;
////////////////////////////////////////////////////////////////////////////////
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-// HADD, HSUB PS/PD
-
-def : WriteRes<WriteFHAdd, [FPC_RSV01]> {
- let Latency = 3;
- let ResourceCycles = [2];
-}
-
-def : WriteRes<WriteFHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 6;
- let ResourceCycles = [2, 1];
-}
-
-// PHADD|PHSUB (S) W/D.
-def : WriteRes<WritePHAdd, [FPC_RSV01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
-
-def : WriteRes<WritePHAddLd, [FPC_RSV01, MEC_RSV]> {
- let Latency = 4;
- let ResourceCycles = [1, 1];
-}
+defm : SMWriteResPair<WriteFHAdd, [FPC_RSV01], 3, [2]>;
+defm : SMWriteResPair<WritePHAdd, [FPC_RSV01], 1>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
// AVX/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
def : WriteRes<WriteIMulH, [FPC_RSV0]>;
-defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
-defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteVarBlend, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteFVarBlend, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteFShuffle256, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteShuffle256, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteVarVecShift, [FPC_RSV0], 1>;
+defm : SMWriteResPair<WriteFMA, [FPC_RSV0], 1>;
} // SchedModel
// b. addpd
// This multiclass is for folded loads for integer units.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
-// Register variant takes 1-cycle on Execution Port.
+// Register variant uses the ports in ExePorts for the cycle counts in Res.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
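-// Memory variant also uses a cycle on ZnAGU
-// adds 4 cycles to the latency.
+// Memory variant also uses a cycle on ZnAGU, adds 4 cycles to the latency,
+// and counts one extra micro-op.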
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let NumMicroOps = 2;
- let Latency = !add(Lat, 4);
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, 4);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = !add(UOps, 1);
}
}
// This multiclass is for folded loads for floating point units.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
- ProcResourceKind ExePort,
- int Lat> {
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [1], int UOps = 1> {
-// Register variant takes 1-cycle on Execution Port.
+// Register variant uses the ports in ExePorts for the cycle counts in Res.
- def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
-// Memory variant also uses a cycle on ZnAGU
-// adds 7 cycles to the latency.
+// Memory variant also uses a cycle on ZnAGU and adds 7 cycles to the
+// latency.
- def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
- let Latency = !add(Lat, 7);
+ def : WriteRes<SchedRW.Folded, !listconcat([ZnAGU], ExePorts)> {
+ let Latency = !add(Lat, 7);
+ let ResourceCycles = !listconcat([1], Res);
+ let NumMicroOps = UOps;
}
}
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
-defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
-defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
-defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;
+// Integer multiplication. Res is spelled out so that ResourceCycles carries
+// one entry for each of the two listed resources (ZnALU1 and ZnMultiplier).
+defm : ZnWriteResPair<WriteIMul, [ZnALU1, ZnMultiplier], 4, [1, 1]>;
+defm : ZnWriteResPair<WriteShift, [ZnALU], 1>;
+defm : ZnWriteResPair<WriteJump, [ZnALU], 1>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;
let ResourceCycles = [1, 4, 41];
}
-// IMUL
+// IMULH
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{
let Latency = 4;
}
-def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
- let Latency = 4;
-}
-
-def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> {
- let Latency = 8;
-}
// Floating point operations
def : WriteRes<WriteFStore, [ZnAGU]>;
def : WriteRes<WriteFMove, [ZnFPU]>;
def : WriteRes<WriteFLoad, [ZnAGU]> { let Latency = 8; }
-defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
-defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
-defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
-defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
-defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
-defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
-defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>;
-defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
-defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
+defm : ZnWriteResFpuPair<WriteFHAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFAdd, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteFVarBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteVarBlend, [ZnFPU0], 1>;
+defm : ZnWriteResFpuPair<WriteCvtI2F, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2F, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteCvtF2I, [ZnFPU3], 5>;
+defm : ZnWriteResFpuPair<WriteFDiv, [ZnFPU3], 15>;
+defm : ZnWriteResFpuPair<WriteFShuffle, [ZnFPU12], 1>;
+defm : ZnWriteResFpuPair<WriteFMul, [ZnFPU0], 5>;
+defm : ZnWriteResFpuPair<WriteFMA, [ZnFPU03], 5>;
+defm : ZnWriteResFpuPair<WriteFRcp, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFRsqrt, [ZnFPU01], 5>;
+defm : ZnWriteResFpuPair<WriteFSqrt, [ZnFPU3], 20>;
-// Vector integer operations which uses FPU units
+// Vector integer operations that use FPU units
def : WriteRes<WriteVecStore, [ZnAGU]>;
def : WriteRes<WriteVecMove, [ZnFPU]>;
def : WriteRes<WriteVecLoad, [ZnAGU]> { let Latency = 8; }
-defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
-defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
-defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
-defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
+defm : ZnWriteResFpuPair<WriteVecShift, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecLogic, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WritePHAdd, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
+defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
+defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;
// Vector Shift Operations
-defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
+defm : ZnWriteResFpuPair<WriteVarVecShift, [ZnFPU12], 1>;
// AES Instructions.
-defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
-defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
+defm : ZnWriteResFpuPair<WriteAESDecEnc, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESIMC, [ZnFPU01], 4>;
+defm : ZnWriteResFpuPair<WriteAESKeyGen, [ZnFPU01], 4>;
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// Following instructions with latency=100 are microcoded.
// We set long latency so as to block the entire pipeline.
-defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;
+defm : ZnWriteResFpuPair<WriteFShuffle256, [ZnFPU], 100>;
//Microcoded Instructions
let Latency = 100 in {
; GENERIC-LABEL: test_mpsadbw:
; GENERIC: # %bb.0:
; GENERIC-NEXT: vmpsadbw $7, %ymm1, %ymm0, %ymm0 # sched: [5:1.00]
-; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
+; GENERIC-NEXT: vmpsadbw $7, (%rdi), %ymm0, %ymm0 # sched: [9:1.00]
; GENERIC-NEXT: retq # sched: [1:1.00]
;
; HASWELL-LABEL: test_mpsadbw: