--- /dev/null
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=3 -timeline -register-file-stats < %s | FileCheck %s
+
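+# The register move from XMM0 to XMM1 can be eliminated at register renaming
+# stage. So, it should not consume pipeline resources.
+#
+# NOTE: the autogenerated expectations below still show vmovaps with a latency
+# of 1 and using JFPM/JFPU1, i.e. the move is not eliminated yet.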
+
+vxorps %xmm0, %xmm0, %xmm0
+vmovaps %xmm0, %xmm1
+vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK: Iterations: 3
+# CHECK-NEXT: Instructions: 9
+# CHECK-NEXT: Total Cycles: 10
+# CHECK-NEXT: Total uOps: 9
+
+# CHECK: Dispatch Width: 2
+# CHECK-NEXT: uOps Per Cycle: 0.90
+# CHECK-NEXT: IPC: 0.90
+# CHECK-NEXT: Block RThroughput: 1.5
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
+# CHECK-NEXT: 1 0 0.50 vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1 1 0.50 vmovaps %xmm0, %xmm1
+# CHECK-NEXT: 1 3 1.00 vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK: Register File statistics:
+# CHECK-NEXT: Total number of mappings created: 6
+# CHECK-NEXT: Max number of mappings used: 5
+
+# CHECK: * Register File #1 -- JFpuPRF:
+# CHECK-NEXT: Number of physical registers: 72
+# CHECK-NEXT: Total number of mappings created: 6
+# CHECK-NEXT: Max number of mappings used: 5
+
+# CHECK: * Register File #2 -- JIntegerPRF:
+# CHECK-NEXT: Number of physical registers: 64
+# CHECK-NEXT: Total number of mappings created: 0
+# CHECK-NEXT: Max number of mappings used: 0
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - JALU0
+# CHECK-NEXT: [1] - JALU1
+# CHECK-NEXT: [2] - JDiv
+# CHECK-NEXT: [3] - JFPA
+# CHECK-NEXT: [4] - JFPM
+# CHECK-NEXT: [5] - JFPU0
+# CHECK-NEXT: [6] - JFPU1
+# CHECK-NEXT: [7] - JLAGU
+# CHECK-NEXT: [8] - JMul
+# CHECK-NEXT: [9] - JSAGU
+# CHECK-NEXT: [10] - JSTC
+# CHECK-NEXT: [11] - JVALU0
+# CHECK-NEXT: [12] - JVALU1
+# CHECK-NEXT: [13] - JVIMUL
+
+# CHECK: Resource pressure per iteration:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
+# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 - - - - - - -
+
+# CHECK: Resource pressure by instruction:
+# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
+# CHECK-NEXT: - - - - - - - - - - - - - - vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vmovaps %xmm0, %xmm1
+# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK: Timeline view:
+# CHECK-NEXT: Index 0123456789
+
+# CHECK: [0,0] DR . . vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [0,1] DeER . . vmovaps %xmm0, %xmm1
+# CHECK-NEXT: [0,2] .DeeeER . vaddps %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [1,0] .D----R . vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [1,1] . DeE--R . vmovaps %xmm0, %xmm1
+# CHECK-NEXT: [1,2] . D=eeeER. vaddps %xmm1, %xmm1, %xmm2
+# CHECK-NEXT: [2,0] . D----R. vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: [2,1] . DeE---R vmovaps %xmm0, %xmm1
+# CHECK-NEXT: [2,2] . DeeeER vaddps %xmm1, %xmm1, %xmm2
+
+# CHECK: Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK: [0] [1] [2] [3]
+# CHECK-NEXT: 0. 3 0.0 0.0 2.7 vxorps %xmm0, %xmm0, %xmm0
+# CHECK-NEXT: 1. 3 1.0 1.0 1.7 vmovaps %xmm0, %xmm1
+# CHECK-NEXT: 2. 3 1.3 0.0 0.0 vaddps %xmm1, %xmm1, %xmm2
#define LLVM_TOOLS_LLVM_MCA_REGISTER_FILE_H
#include "HardwareUnits/HardwareUnit.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Support/Error.h"
namespace mca {
+class ReadState;
class WriteState;
class WriteRef;
class RegisterFile : public HardwareUnit {
const llvm::MCRegisterInfo &MRI;
- // Each register file is associated with an instance of
- // RegisterMappingTracker.
- // A RegisterMappingTracker keeps track of the number of physical registers
- // which have been dynamically allocated by the simulator.
+ // class RegisterMappingTracker is a physical register file (PRF) descriptor.
+ // There is one RegisterMappingTracker for every PRF definition in the
+ // scheduling model.
+ //
+ // An instance of RegisterMappingTracker tracks the number of physical
+ // registers available for renaming. It also tracks the number of register
+ // moves eliminated per cycle.
struct RegisterMappingTracker {
// The total number of physical registers that are available in this
// register file for register renaming purpouses. A value of zero for this
// Number of physical registers that are currently in use.
unsigned NumUsedPhysRegs;
- RegisterMappingTracker(unsigned NumPhysRegisters)
- : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0) {}
+ // Maximum number of register moves that can be eliminated by this PRF every
+ // cycle. A value of zero means that there is no limit in the number of
+ // moves which can be eliminated every cycle.
+ const unsigned MaxMoveEliminatedPerCycle;
+
+ // Number of register moves eliminated during this cycle.
+ //
+ // This value is increased by one every time a register move is eliminated.
+ // Every new cycle, this value is reset to zero.
+ // A move can be eliminated only if MaxMoveEliminatedPerCycle is zero, or if
+ // NumMoveEliminated is less than MaxMoveEliminatedPerCycle.
+ unsigned NumMoveEliminated;
+
+ RegisterMappingTracker(unsigned NumPhysRegisters,
+ unsigned MaxMoveEliminated = 0U)
+ : NumPhysRegs(NumPhysRegisters), NumUsedPhysRegs(0),
+ MaxMoveEliminatedPerCycle(MaxMoveEliminated), NumMoveEliminated(0U) {}
};
// A vector of register file descriptors. This set always contains at least
// Each RegisterRenamingInfo is owned by a PRF, and field `IndexPlusCost`
// specifies both the owning PRF, as well as the number of physical registers
// consumed at register renaming stage.
+ //
+ // Field `AllowMoveElimination` is set for registers that are used as
+ // destination by optimizable register moves.
+ // Field `AllowZeroMoveEliminationOnly` further restricts move elimination
+ // only to zero-register moves.
struct RegisterRenamingInfo {
IndexPlusCostPairTy IndexPlusCost;
llvm::MCPhysReg RenameAs;
+ bool AllowMoveElimination;
+ bool AllowZeroMoveEliminationOnly;
RegisterRenamingInfo()
- : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U) {}
+ : IndexPlusCost(std::make_pair(0U, 1U)), RenameAs(0U),
+ AllowMoveElimination(false), AllowZeroMoveEliminationOnly(false) {}
};
// RegisterMapping objects are mainly used to track physical register
void removeRegisterWrite(const WriteState &WS,
llvm::MutableArrayRef<unsigned> FreedPhysRegs);
+ // Returns true if a move from RS to WS can be eliminated.
+ // On success, it updates WriteState by setting flag `WS.isEliminated`.
+ // If RS is a read from a zero register, and WS is eliminated, then
+ // `WS.WritesZero` is also set, so that method addRegisterWrite() would not
+ // reserve a physical register for it.
+ bool tryEliminateMove(WriteState &WS, const ReadState &RS);
+
// Checks if there are enough physical registers in the register files.
// Returns a "response mask" where each bit represents the response from a
// different register file. A mask of all zeroes means that all register
unsigned RegID) const;
unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
+ // Notify each PRF that a new cycle just started.
+ void cycleStart();
+
#ifndef NDEBUG
void dump() const;
#endif
// True if this write is from a dependency breaking zero-idiom instruction.
bool WritesZero;
+ // True if this write has been eliminated at register renaming stage.
+ // Example: a register move doesn't consume scheduler/pipeline resources if
+ // it is eliminated at register renaming stage. It still consumes
+ // decode bandwidth, and ROB entries.
+ bool IsEliminated;
+
// This field is set if this is a partial register write, and it has a false
// dependency on any previous write of the same register (or a portion of it).
// DependentWrite must be able to complete before this write completes, so
bool clearsSuperRegs = false, bool writesZero = false)
: WD(Desc), CyclesLeft(UNKNOWN_CYCLES), RegisterID(RegID),
ClearsSuperRegs(clearsSuperRegs), WritesZero(writesZero),
- DependentWrite(nullptr), NumWriteUsers(0U) {}
+ IsEliminated(false), DependentWrite(nullptr), NumWriteUsers(0U) {}
WriteState(const WriteState &Other) = delete;
WriteState &operator=(const WriteState &Other) = delete;
unsigned getNumUsers() const { return Users.size() + NumWriteUsers; }
bool clearsSuperRegisters() const { return ClearsSuperRegs; }
bool isWriteZero() const { return WritesZero; }
+ bool isEliminated() const { return IsEliminated; }
bool isExecuted() const {
return CyclesLeft != UNKNOWN_CYCLES && CyclesLeft <= 0;
}
DependentWrite = Other;
++Other->NumWriteUsers;
}
+ void setWriteZero() { WritesZero = true; }
+ // Flags this write as eliminated at register renaming stage and zeroes
+ // CyclesLeft. Asserts that no user has been registered on this write yet.
+ void setEliminated() {
+   assert(Users.empty() && "Write is in an inconsistent state.");
+   IsEliminated = true;
+   CyclesLeft = 0;
+ }
// On every cycle, update CyclesLeft and notify dependent users.
void cycleEvent();
// Retire Unit token ID for this instruction.
unsigned RCUTokenID;
+ // This field is set for instructions that are candidates for move
+ // elimination. For more information about move elimination, see the
+ // definition of RegisterMappingTracker in RegisterFile.h
+ //
+ // TODO: Teach subtargets how to describe optimizable register moves.
+ bool IsOptimizableMove;
+
using UniqueDef = std::unique_ptr<WriteState>;
using UniqueUse = std::unique_ptr<ReadState>;
using VecDefs = std::vector<UniqueDef>;
public:
Instruction(const InstrDesc &D)
- : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0) {}
+ : Desc(D), Stage(IS_INVALID), CyclesLeft(UNKNOWN_CYCLES), RCUTokenID(0),
+ IsOptimizableMove(false) {}
Instruction(const Instruction &Other) = delete;
Instruction &operator=(const Instruction &Other) = delete;
bool isExecuted() const { return Stage == IS_EXECUTED; }
bool isRetired() const { return Stage == IS_RETIRED; }
+ // Returns true if this instruction is a candidate for move elimination.
+ bool isOptimizableMove() const { return IsOptimizableMove; }
+ void setOptimizableMove() { IsOptimizableMove = true; }
+ // Returns true only if this instruction is ready, has at least one register
+ // definition, and every register definition has been flagged as eliminated.
+ bool isEliminated() const {
+   if (!isReady() || Defs.empty())
+     return false;
+   return llvm::all_of(
+       Defs, [](const UniqueDef &Def) { return Def->isEliminated(); });
+ }
+
+ // Forces a transition from state IS_READY to state IS_EXECUTED.
+ void forceExecuted();
+
void retire() {
assert(isExecuted() && "Instruction is in an invalid state!");
Stage = IS_RETIRED;
// instructions to the underlying pipelines.
llvm::Error issueReadyInstructions();
+ // Used to notify instructions eliminated at register renaming stage.
+ llvm::Error handleInstructionEliminated(InstRef &IR);
+
ExecuteStage(const ExecuteStage &Other) = delete;
ExecuteStage &operator=(const ExecuteStage &Other) = delete;
}
}
+// Start-of-cycle hook: resets the per-cycle count of eliminated moves on every
+// register file descriptor.
+void RegisterFile::cycleStart() {
+  for (unsigned I = 0, E = RegisterFiles.size(); I != E; ++I)
+    RegisterFiles[I].NumMoveEliminated = 0;
+}
+
void RegisterFile::addRegisterFile(ArrayRef<MCRegisterCostEntry> Entries,
unsigned NumPhysRegs) {
// A default register file is always allocated at index #0. That register file
}
}
+// Attempts to eliminate the register move that reads RS and writes WS.
+//
+// Returns false, leaving WS untouched, when any of the following holds:
+//  - the destination register is not flagged with AllowMoveElimination;
+//  - source and destination are owned by different register files;
+//  - the owning register file has a non-zero per-cycle limit and has already
+//    eliminated that many moves this cycle;
+//  - the destination only allows zero-move elimination and the source is not
+//    a known zero register.
+//
+// Otherwise it bumps the per-cycle counter of the owning register file, marks
+// WS as a zero write if the source is a zero register, marks WS as eliminated,
+// and returns true.
+bool RegisterFile::tryEliminateMove(WriteState &WS, const ReadState &RS) {
+  const RegisterRenamingInfo &SrcInfo =
+      RegisterMappings[RS.getRegisterID()].second;
+  const RegisterRenamingInfo &DstInfo =
+      RegisterMappings[WS.getRegisterID()].second;
+
+  // The destination must opt in, and both registers must share a PRF.
+  unsigned PRFIndex = SrcInfo.IndexPlusCost.first;
+  if (!DstInfo.AllowMoveElimination ||
+      PRFIndex != DstInfo.IndexPlusCost.first)
+    return false;
+
+  // A limit of zero means "unlimited"; otherwise respect the per-cycle budget.
+  RegisterMappingTracker &Tracker = RegisterFiles[PRFIndex];
+  const unsigned Limit = Tracker.MaxMoveEliminatedPerCycle;
+  if (Limit != 0 && Tracker.NumMoveEliminated == Limit)
+    return false;
+
+  const bool SourceIsZero = ZeroRegisters[RS.getRegisterID()];
+  if (!SourceIsZero && DstInfo.AllowZeroMoveEliminationOnly)
+    return false;
+
+  ++Tracker.NumMoveEliminated;
+  if (SourceIsZero)
+    WS.setWriteZero();
+  WS.setEliminated();
+  return true;
+}
+
void RegisterFile::collectWrites(SmallVectorImpl<WriteRef> &Writes,
unsigned RegID) const {
assert(RegID && RegID < RegisterMappings.size());
Stage = IS_EXECUTED;
}
+// Moves this instruction directly to IS_EXECUTED and zeroes CyclesLeft.
+// Asserts that the instruction is currently in the IS_READY stage.
+void Instruction::forceExecuted() {
+  assert(Stage == IS_READY && "Invalid internal state!");
+  Stage = IS_EXECUTED;
+  CyclesLeft = 0;
+}
+
void Instruction::update() {
assert(isDispatched() && "Unexpected instruction stage found!");
AvailableEntries -= NumMicroOps;
}
+ // Check if this is an optimizable reg-reg move. If so, ask the register file
+ // to eliminate it. On success tryEliminateMove() flags the write as
+ // eliminated, so its boolean result does not need to be inspected here.
+ if (IS.isOptimizableMove()) {
+   // Defs are the register writes (outputs); Uses are the register reads
+   // (inputs). A candidate move must have exactly one of each.
+   assert(IS.getDefs().size() == 1 && "Expected a single output!");
+   assert(IS.getUses().size() == 1 && "Expected a single input!");
+   PRF.tryEliminateMove(*IS.getDefs()[0], *IS.getUses()[0]);
+ }
+
// A dependency-breaking instruction doesn't have to wait on the register
// input operands, and it is often optimized at register renaming stage.
// Update RAW dependencies if this instruction is not a dependency-breaking
}
Error DispatchStage::cycleStart() {
+ PRF.cycleStart();
+
if (!CarryOver) {
AvailableEntries = DispatchWidth;
return ErrorSuccess();
return issueReadyInstructions();
}
+
+#ifndef NDEBUG
+// Debug-only consistency checks for an instruction that was eliminated at
+// register renaming stage: it must report itself as eliminated and ready, and
+// its descriptor must not be flagged as a load or a store.
+static void verifyInstructionEliminated(const InstRef &IR) {
+  const Instruction &Eliminated = *IR.getInstruction();
+  assert(Eliminated.isEliminated() && "Instruction was not eliminated!");
+  assert(Eliminated.isReady() && "Instruction in an inconsistent state!");
+
+  // Memory operations are never expected to be eliminated.
+  const InstrDesc &D = Eliminated.getDesc();
+  assert(!(D.MayLoad || D.MayStore) && "Cannot eliminate a memory op!");
+}
+#endif
+
+
+// Handles an instruction that was eliminated at register renaming stage.
+//
+// The ready, issued (with an empty argument) and executed notifications are
+// sent in that order, the instruction is forced into the executed state just
+// before the executed notification, and it is then handed to the next stage.
+// Returns whatever moveToTheNextStage() returns.
+Error ExecuteStage::handleInstructionEliminated(InstRef &IR) {
+#ifndef NDEBUG
+  // Debug builds only: check that IR really is in the eliminated state.
+  verifyInstructionEliminated(IR);
+#endif
+
+  notifyInstructionReady(IR);
+  notifyInstructionIssued(IR, {});
+
+  Instruction &Inst = *IR.getInstruction();
+  Inst.forceExecuted();
+  notifyInstructionExecuted(IR);
+
+  return moveToTheNextStage(IR);
+}
+
// Schedule the instruction for execution on the hardware.
Error ExecuteStage::execute(InstRef &IR) {
assert(isAvailable(IR) && "Scheduler is not available!");
// Ensure that the HWS has not stored this instruction in its queues.
HWS.sanityCheck(IR);
#endif
+
+ if (IR.getInstruction()->isEliminated())
+ return handleInstructionEliminated(IR);
+
// Reserve a slot in each buffered resource. Also, mark units with
// BufferSize=0 as reserved. Resources with a buffer size of zero will only
// be released after MCIS is issued, and all the ResourceCycles for those