const MCInst &Inst,
APInt &Writes) const;
+ /// Returns true if \param Inst is a dependency breaking instruction for the
+ /// given subtarget.
+ ///
+ /// The value computed by a dependency breaking instruction is not dependent
+ /// on the inputs. An example of dependency breaking instruction on X86 is
+ /// `XOR %eax, %eax`.
+ /// TODO: In future, we could implement an alternative approach where this
+ /// method returns `true` if the input instruction is not dependent on
+ /// some/all of its input operands. An APInt mask could then be used to
+ /// identify independent operands.
+ virtual bool isDependencyBreaking(const MCSubtargetInfo &STI,
+ const MCInst &Inst) const;
+
/// Given a branch instruction try to get the address the branch
/// targets. Return true on success, and the address in Target.
virtual bool
return false;
}
+bool MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
+ const MCInst &Inst) const {
+ return false;
+}
+
bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const {
if (Inst.getNumOperands() == 0 ||
public:
X86MCInstrAnalysis(const MCInstrInfo *MCII) : MCInstrAnalysis(MCII) {}
+ bool isDependencyBreaking(const MCSubtargetInfo &STI,
+ const MCInst &Inst) const override;
bool clearsSuperRegisters(const MCRegisterInfo &MRI, const MCInst &Inst,
APInt &Mask) const override;
};
+bool X86MCInstrAnalysis::isDependencyBreaking(const MCSubtargetInfo &STI,
+ const MCInst &Inst) const {
+ if (STI.getCPU() == "btver2") {
+ // Reference: Agner Fog's microarchitecture.pdf - Section 20 "AMD Bobcat and
+ // Jaguar pipeline", subsection 8 "Dependency-breaking instructions".
+ switch (Inst.getOpcode()) {
+ default:
+ return false;
+ case X86::SUB32rr:
+ case X86::SUB64rr:
+ case X86::SBB32rr:
+ case X86::SBB64rr:
+ case X86::XOR32rr:
+ case X86::XOR64rr:
+ case X86::XORPSrr:
+ case X86::XORPDrr:
+ case X86::VXORPSrr:
+ case X86::VXORPDrr:
+ case X86::ANDNPSrr:
+ case X86::VANDNPSrr:
+ case X86::ANDNPDrr:
+ case X86::VANDNPDrr:
+ case X86::PXORrr:
+ case X86::VPXORrr:
+ case X86::PANDNrr:
+ case X86::VPANDNrr:
+ case X86::PSUBBrr:
+ case X86::PSUBWrr:
+ case X86::PSUBDrr:
+ case X86::PSUBQrr:
+ case X86::VPSUBBrr:
+ case X86::VPSUBWrr:
+ case X86::VPSUBDrr:
+ case X86::VPSUBQrr:
+ case X86::PCMPEQBrr:
+ case X86::PCMPEQWrr:
+ case X86::PCMPEQDrr:
+ case X86::PCMPEQQrr:
+ case X86::VPCMPEQBrr:
+ case X86::VPCMPEQWrr:
+ case X86::VPCMPEQDrr:
+ case X86::VPCMPEQQrr:
+ case X86::PCMPGTBrr:
+ case X86::PCMPGTWrr:
+ case X86::PCMPGTDrr:
+ case X86::PCMPGTQrr:
+ case X86::VPCMPGTBrr:
+ case X86::VPCMPGTWrr:
+ case X86::VPCMPGTDrr:
+ case X86::VPCMPGTQrr:
+ case X86::MMX_PXORirr:
+ case X86::MMX_PANDNirr:
+ case X86::MMX_PSUBBirr:
+ case X86::MMX_PSUBDirr:
+ case X86::MMX_PSUBQirr:
+ case X86::MMX_PSUBWirr:
+ case X86::MMX_PCMPGTBirr:
+ case X86::MMX_PCMPGTDirr:
+ case X86::MMX_PCMPGTWirr:
+ case X86::MMX_PCMPEQBirr:
+ case X86::MMX_PCMPEQDirr:
+ case X86::MMX_PCMPEQWirr:
+ return Inst.getOperand(1).getReg() == Inst.getOperand(2).getReg();
+ case X86::CMP32rr:
+ case X86::CMP64rr:
+ return Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg();
+ }
+ }
+
+ return false;
+}
+
bool X86MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
const MCInst &Inst,
APInt &Mask) const {
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 3000
-# CHECK-NEXT: Total Cycles: 3003
+# CHECK-NEXT: Total Cycles: 1504
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
# CHECK-NEXT: 1.00 - - - - - - - - - - - - - cmovael %ebx, %eax
# CHECK: Timeline view:
-# CHECK-NEXT: Index 012345678
+# CHECK-NEXT: Index 0123456
-# CHECK: [0,0] DeER . . cmpl %eax, %eax
-# CHECK-NEXT: [0,1] D=eER. . cmovael %ebx, %eax
-# CHECK-NEXT: [1,0] .D=eER . cmpl %eax, %eax
-# CHECK-NEXT: [1,1] .D==eER . cmovael %ebx, %eax
-# CHECK-NEXT: [2,0] . D==eER. cmpl %eax, %eax
-# CHECK-NEXT: [2,1] . D===eER cmovael %ebx, %eax
+# CHECK: [0,0] DeER .. cmpl %eax, %eax
+# CHECK-NEXT: [0,1] D=eER.. cmovael %ebx, %eax
+# CHECK-NEXT: [1,0] .DeER.. cmpl %eax, %eax
+# CHECK-NEXT: [1,1] .D=eER. cmovael %ebx, %eax
+# CHECK-NEXT: [2,0] . DeER. cmpl %eax, %eax
+# CHECK-NEXT: [2,1] . D=eER cmovael %ebx, %eax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 2.0 0.3 0.0 cmpl %eax, %eax
-# CHECK-NEXT: 1. 3 3.0 0.0 0.0 cmovael %ebx, %eax
+# CHECK-NEXT: 0. 3 1.0 1.0 0.0 cmpl %eax, %eax
+# CHECK-NEXT: 1. 3 2.0 0.0 0.0 cmovael %ebx, %eax
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 6000
-# CHECK-NEXT: Total Cycles: 6003
+# CHECK-NEXT: Total Cycles: 3003
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.00
+# CHECK-NEXT: IPC: 2.00
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK: Timeline view:
-# CHECK-NEXT: 01234
-# CHECK-NEXT: Index 0123456789
+# CHECK-NEXT: Index 012345678
-# CHECK: [0,0] DeER . . . vpcmpeqb %xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [0,1] D=eER. . . vpcmpeqw %xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [0,2] .D=eER . . vpcmpeqd %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,3] .D==eER . . vpcmpeqq %xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [1,0] . D==eER . . vpcmpeqb %xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [1,1] . D===eER . . vpcmpeqw %xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [1,2] . D===eER. . vpcmpeqd %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,3] . D====eER . vpcmpeqq %xmm3, %xmm3, %xmm0
-# CHECK-NEXT: [2,0] . D====eER . vpcmpeqb %xmm0, %xmm0, %xmm1
-# CHECK-NEXT: [2,1] . D=====eER . vpcmpeqw %xmm1, %xmm1, %xmm2
-# CHECK-NEXT: [2,2] . D=====eER. vpcmpeqd %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,3] . D======eER vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK: [0,0] DeER . . vpcmpeqb %xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [0,1] DeER . . vpcmpeqw %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [0,2] .DeER. . vpcmpeqd %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,3] .DeER. . vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [1,0] . DeER . vpcmpeqb %xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [1,1] . DeER . vpcmpeqw %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,2] . DeER . vpcmpeqd %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,3] . DeER . vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: [2,0] . DeER. vpcmpeqb %xmm0, %xmm0, %xmm1
+# CHECK-NEXT: [2,1] . DeER. vpcmpeqw %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,2] . DeER vpcmpeqd %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,3] . DeER vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 3.0 0.3 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1
-# CHECK-NEXT: 1. 3 4.0 0.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
-# CHECK-NEXT: 2. 3 4.0 0.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 3. 3 5.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
+# CHECK-NEXT: 0. 3 1.0 1.0 0.0 vpcmpeqb %xmm0, %xmm0, %xmm1
+# CHECK-NEXT: 1. 3 1.0 1.0 0.0 vpcmpeqw %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: 2. 3 1.0 1.0 0.0 vpcmpeqd %xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 3. 3 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm0
# CHECK: Iterations: 1500
# CHECK-NEXT: Instructions: 4500
-# CHECK-NEXT: Total Cycles: 6745
+# CHECK-NEXT: Total Cycles: 3007
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 0.67
+# CHECK-NEXT: IPC: 1.50
# CHECK-NEXT: Block RThroughput: 2.0
# CHECK: Instruction Info:
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: 2.01 1.99 - - - - - - 1.00 - - - - -
+# CHECK-NEXT: 2.00 2.00 - - - - - - 1.00 - - - - -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - 1.00 - - - - - imull %edx, %eax
-# CHECK-NEXT: 0.99 0.01 - - - - - - - - - - - - addl %edx, %edx
-# CHECK-NEXT: 1.01 0.99 - - - - - - - - - - - - sbbl %eax, %eax
+# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %edx, %edx
+# CHECK-NEXT: 2.00 - - - - - - - - - - - - - sbbl %eax, %eax
# CHECK: Timeline view:
-# CHECK-NEXT: 012345
+# CHECK-NEXT: 01
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeeeER . . imull %edx, %eax
-# CHECK-NEXT: [0,1] .DeE-R . . addl %edx, %edx
-# CHECK-NEXT: [0,2] .D==eER . . sbbl %eax, %eax
-# CHECK-NEXT: [1,0] . D===eeeER . imull %edx, %eax
-# CHECK-NEXT: [1,1] . DeE----R . addl %edx, %edx
-# CHECK-NEXT: [1,2] . D=====eER . sbbl %eax, %eax
-# CHECK-NEXT: [2,0] . D=====eeeER. imull %edx, %eax
-# CHECK-NEXT: [2,1] . DeE------R. addl %edx, %edx
-# CHECK-NEXT: [2,2] . D=======eER sbbl %eax, %eax
+# CHECK: [0,0] DeeeER .. imull %edx, %eax
+# CHECK-NEXT: [0,1] .DeE-R .. addl %edx, %edx
+# CHECK-NEXT: [0,2] .D=eE-R .. sbbl %eax, %eax
+# CHECK-NEXT: [1,0] . D==eeeER.. imull %edx, %eax
+# CHECK-NEXT: [1,1] . DeE---R.. addl %edx, %edx
+# CHECK-NEXT: [1,2] . D=eE---R. sbbl %eax, %eax
+# CHECK-NEXT: [2,0] . D=eeeER. imull %edx, %eax
+# CHECK-NEXT: [2,1] . D=eE--R addl %edx, %edx
+# CHECK-NEXT: [2,2] . D==eE-R sbbl %eax, %eax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
-# CHECK-NEXT: 0. 3 3.7 0.7 0.0 imull %edx, %eax
-# CHECK-NEXT: 1. 3 1.0 1.0 3.7 addl %edx, %edx
-# CHECK-NEXT: 2. 3 5.7 0.0 0.0 sbbl %eax, %eax
+# CHECK-NEXT: 0. 3 2.0 0.7 0.0 imull %edx, %eax
+# CHECK-NEXT: 1. 3 1.3 1.3 2.0 addl %edx, %edx
+# CHECK-NEXT: 2. 3 2.3 0.0 1.7 sbbl %eax, %eax
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
-# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -register-file-stats -iterations=1 < %s | FileCheck %s
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -timeline -timeline-max-iterations=1 -register-file-stats < %s | FileCheck %s
# These are dependency-breaking one-idioms.
# Much like zero-idioms, but they produce ones, and do consume resources.
+# perf stats reports a throughput of 2.00 IPC.
+
pcmpeqb %mm2, %mm2
pcmpeqd %mm2, %mm2
pcmpeqw %mm2, %mm2
# FIXME: their handling is broken in llvm-mca.
-# CHECK: Iterations: 1
-# CHECK-NEXT: Instructions: 15
-# CHECK-NEXT: Total Cycles: 12
+# CHECK: Iterations: 100
+# CHECK-NEXT: Instructions: 1500
+# CHECK-NEXT: Total Cycles: 753
# CHECK-NEXT: Dispatch Width: 2
-# CHECK-NEXT: IPC: 1.25
+# CHECK-NEXT: IPC: 1.99
# CHECK-NEXT: Block RThroughput: 7.5
# CHECK: Instruction Info:
# CHECK-NEXT: 1 1 0.50 vpcmpeqw %xmm3, %xmm3, %xmm5
# CHECK: Register File statistics:
-# CHECK-NEXT: Total number of mappings created: 15
-# CHECK-NEXT: Max number of mappings used: 8
+# CHECK-NEXT: Total number of mappings created: 1500
+# CHECK-NEXT: Max number of mappings used: 6
# CHECK: * Register File #1 -- JFpuPRF:
# CHECK-NEXT: Number of physical registers: 72
-# CHECK-NEXT: Total number of mappings created: 15
-# CHECK-NEXT: Max number of mappings used: 8
+# CHECK-NEXT: Total number of mappings created: 1500
+# CHECK-NEXT: Max number of mappings used: 6
# CHECK: * Register File #2 -- JIntegerPRF:
# CHECK-NEXT: Number of physical registers: 64
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
-# CHECK-NEXT: - - - - - 7.00 8.00 - - - - 7.00 8.00 -
+# CHECK-NEXT: - - - - - 7.50 7.50 - - - - 7.50 7.50 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqb %mm2, %mm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqd %mm2, %mm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqw %mm2, %mm2
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqb %xmm2, %xmm2
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - pcmpeqd %xmm2, %xmm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqq %xmm2, %xmm2
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - pcmpeqw %xmm2, %xmm2
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpcmpeqq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpcmpeqw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqb %mm2, %mm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqd %mm2, %mm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqw %mm2, %mm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqb %xmm2, %xmm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqd %xmm2, %xmm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqq %xmm2, %xmm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - pcmpeqw %xmm2, %xmm2
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: - - - - - 0.50 0.50 - - - - 0.50 0.50 - vpcmpeqw %xmm3, %xmm3, %xmm5
# CHECK: Timeline view:
-# CHECK-NEXT: 01
+# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
-# CHECK: [0,0] DeER . .. pcmpeqb %mm2, %mm2
-# CHECK-NEXT: [0,1] D=eER. .. pcmpeqd %mm2, %mm2
-# CHECK-NEXT: [0,2] .D=eER .. pcmpeqw %mm2, %mm2
-# CHECK-NEXT: [0,3] .DeE-R .. pcmpeqb %xmm2, %xmm2
-# CHECK-NEXT: [0,4] . DeE-R .. pcmpeqd %xmm2, %xmm2
-# CHECK-NEXT: [0,5] . D=eER .. pcmpeqq %xmm2, %xmm2
-# CHECK-NEXT: [0,6] . D=eER .. pcmpeqw %xmm2, %xmm2
-# CHECK-NEXT: [0,7] . DeE-R .. vpcmpeqb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,8] . DeE-R .. vpcmpeqd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,9] . D=eER .. vpcmpeqq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,10] . D=eER.. vpcmpeqw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: [0,11] . D==eER. vpcmpeqb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,12] . .D=eER. vpcmpeqd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,13] . .D==eER vpcmpeqq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: [0,14] . . D=eER vpcmpeqw %xmm3, %xmm3, %xmm5
+# CHECK: [0,0] DeER . . pcmpeqb %mm2, %mm2
+# CHECK-NEXT: [0,1] DeER . . pcmpeqd %mm2, %mm2
+# CHECK-NEXT: [0,2] .DeER. . pcmpeqw %mm2, %mm2
+# CHECK-NEXT: [0,3] .DeER. . pcmpeqb %xmm2, %xmm2
+# CHECK-NEXT: [0,4] . DeER . pcmpeqd %xmm2, %xmm2
+# CHECK-NEXT: [0,5] . DeER . pcmpeqq %xmm2, %xmm2
+# CHECK-NEXT: [0,6] . DeER . pcmpeqw %xmm2, %xmm2
+# CHECK-NEXT: [0,7] . DeER . vpcmpeqb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,8] . DeER . vpcmpeqd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,9] . DeER . vpcmpeqq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,10] . DeER . vpcmpeqw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: [0,11] . DeER . vpcmpeqb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,12] . .DeER. vpcmpeqd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,13] . .DeER. vpcmpeqq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: [0,14] . . DeER vpcmpeqw %xmm3, %xmm3, %xmm5
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 pcmpeqb %mm2, %mm2
-# CHECK-NEXT: 1. 1 2.0 0.0 0.0 pcmpeqd %mm2, %mm2
-# CHECK-NEXT: 2. 1 2.0 0.0 0.0 pcmpeqw %mm2, %mm2
-# CHECK-NEXT: 3. 1 1.0 1.0 1.0 pcmpeqb %xmm2, %xmm2
-# CHECK-NEXT: 4. 1 1.0 0.0 1.0 pcmpeqd %xmm2, %xmm2
-# CHECK-NEXT: 5. 1 2.0 0.0 0.0 pcmpeqq %xmm2, %xmm2
-# CHECK-NEXT: 6. 1 2.0 0.0 0.0 pcmpeqw %xmm2, %xmm2
-# CHECK-NEXT: 7. 1 1.0 1.0 1.0 vpcmpeqb %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 8. 1 1.0 0.0 1.0 vpcmpeqd %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 9. 1 2.0 0.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 10. 1 2.0 0.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm3
-# CHECK-NEXT: 11. 1 3.0 0.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 12. 1 2.0 0.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 13. 1 3.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5
-# CHECK-NEXT: 14. 1 2.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 1. 1 1.0 1.0 0.0 pcmpeqd %mm2, %mm2
+# CHECK-NEXT: 2. 1 1.0 1.0 0.0 pcmpeqw %mm2, %mm2
+# CHECK-NEXT: 3. 1 1.0 1.0 0.0 pcmpeqb %xmm2, %xmm2
+# CHECK-NEXT: 4. 1 1.0 1.0 0.0 pcmpeqd %xmm2, %xmm2
+# CHECK-NEXT: 5. 1 1.0 1.0 0.0 pcmpeqq %xmm2, %xmm2
+# CHECK-NEXT: 6. 1 1.0 1.0 0.0 pcmpeqw %xmm2, %xmm2
+# CHECK-NEXT: 7. 1 1.0 1.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 8. 1 1.0 1.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 9. 1 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 10. 1 1.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm3
+# CHECK-NEXT: 11. 1 1.0 1.0 0.0 vpcmpeqb %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 12. 1 1.0 1.0 0.0 vpcmpeqd %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 13. 1 1.0 1.0 0.0 vpcmpeqq %xmm3, %xmm3, %xmm5
+# CHECK-NEXT: 14. 1 1.0 1.0 0.0 vpcmpeqw %xmm3, %xmm3, %xmm5
// instruction. A dependency-breaking instruction is a zero-latency
// instruction that doesn't consume hardware resources.
// An example of dependency-breaking instruction on X86 is a zero-idiom XOR.
- if (!Desc.isZeroLatency())
- for (std::unique_ptr<ReadState> &RS : IS.getUses())
+ bool IsDependencyBreaking = IS.isDependencyBreaking();
+ for (std::unique_ptr<ReadState> &RS : IS.getUses())
+ if (RS->isImplicitRead() || !IsDependencyBreaking)
updateRAWDependencies(*RS, STI);
// By default, a dependency-breaking zero-latency instruction is expected to
// be optimized at register renaming stage. That means, no physical register
// is allocated to the instruction.
+ bool ShouldAllocateRegisters =
+ !(Desc.isZeroLatency() && IsDependencyBreaking);
SmallVector<unsigned, 4> RegisterFiles(PRF.getNumRegisterFiles());
- for (std::unique_ptr<WriteState> &WS : IS.getDefs())
+ for (std::unique_ptr<WriteState> &WS : IS.getDefs()) {
PRF.addRegisterWrite(WriteRef(IR.first, WS.get()), RegisterFiles,
- !Desc.isZeroLatency());
+ ShouldAllocateRegisters);
+ }
// Reserve slots in the RCU, and notify the instruction that it has been
// dispatched to the schedulers for execution.
// register writes implicitly clear the upper portion of a super-register.
MCIA.clearsSuperRegisters(MRI, MCI, WriteMask);
+ // Check if this is a dependency breaking instruction.
+ if (MCIA.isDependencyBreaking(STI, MCI))
+ NewIS->setDependencyBreaking();
+
// Initialize writes.
unsigned WriteIndex = 0;
for (const WriteDescriptor &WD : D.Writes) {
bool IsReady;
public:
- bool isReady() const { return IsReady; }
-
ReadState(const ReadDescriptor &Desc, unsigned RegID)
: RD(Desc), RegisterID(RegID), DependentWrites(0),
CyclesLeft(UNKNOWN_CYCLES), TotalCycles(0), IsReady(true) {}
unsigned getSchedClass() const { return RD.SchedClassID; }
unsigned getRegisterID() const { return RegisterID; }
+ bool isReady() const { return IsReady; }
+ bool isImplicitRead() const { return RD.isImplicitRead(); }
+
void cycleEvent();
void writeStartEvent(unsigned Cycles);
void setDependentWrites(unsigned Writes) {
// Retire Unit token ID for this instruction.
unsigned RCUTokenID;
+ bool IsDepBreaking;
+
using UniqueDef = std::unique_ptr<WriteState>;
using UniqueUse = std::unique_ptr<ReadState>;
using VecDefs = std::vector<UniqueDef>;
public:
Instruction(const InstrDesc &D)
- : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES) {}
+ : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
+ IsDepBreaking(false) {}
Instruction(const Instruction &Other) = delete;
Instruction &operator=(const Instruction &Other) = delete;
unsigned getRCUTokenID() const { return RCUTokenID; }
int getCyclesLeft() const { return CyclesLeft; }
+ bool isDependencyBreaking() const { return IsDepBreaking; }
+ void setDependencyBreaking() { IsDepBreaking = true; }
+
unsigned getNumUsers() const {
unsigned NumUsers = 0;
for (const UniqueDef &Def : Defs)
void RetireStage::notifyInstructionRetired(const InstRef &IR) {
LLVM_DEBUG(dbgs() << "[E] Instruction Retired: #" << IR << '\n');
SmallVector<unsigned, 4> FreedRegs(PRF.getNumRegisterFiles());
- const InstrDesc &Desc = IR.getInstruction()->getDesc();
+ const Instruction &Inst = *IR.getInstruction();
+ const InstrDesc &Desc = Inst.getDesc();
- for (const std::unique_ptr<WriteState> &WS : IR.getInstruction()->getDefs())
- PRF.removeRegisterWrite(*WS.get(), FreedRegs, !Desc.isZeroLatency());
+ bool ShouldFreeRegs = !(Desc.isZeroLatency() && Inst.isDependencyBreaking());
+ for (const std::unique_ptr<WriteState> &WS : Inst.getDefs())
+ PRF.removeRegisterWrite(*WS.get(), FreedRegs, ShouldFreeRegs);
notifyEvent<HWInstructionEvent>(HWInstructionRetiredEvent(IR, FreedRegs));
}