OPCODE(UDIVREM32);
OPCODE(UDIVREM64);
OPCODE(MVC);
+ OPCODE(MVC_LOOP);
OPCODE(CLC);
+ OPCODE(CLC_LOOP);
OPCODE(STRCMP);
OPCODE(STPCPY);
OPCODE(SEARCH_STRING);
return NewMBB;
}
-// Split MBB after MI and return the new block (the one that contains
-// instructions after MI).
-static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
- MachineBasicBlock *MBB) {
+// Split MBB before MI and return the new block (the one that contains MI).
+static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
- NewMBB->splice(NewMBB->begin(), MBB,
- llvm::next(MachineBasicBlock::iterator(MI)),
- MBB->end());
+ NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
return NewMBB;
}
+// Force base value Base into a register before MI. Return the register.
+static unsigned forceReg(MachineInstr *MI, MachineOperand &Base,
+ const SystemZInstrInfo *TII) {
+ if (Base.isReg())
+ return Base.getReg();
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LA), Reg)
+ .addOperand(Base).addImm(0).addReg(0);
+ return Reg;
+}
+
// Implement EmitInstrWithCustomInserter for pseudo Select* instruction MI.
MachineBasicBlock *
SystemZTargetLowering::emitSelect(MachineInstr *MI,
DebugLoc DL = MI->getDebugLoc();
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
// StartMBB:
// %Result = phi [ %FalseReg, FalseMBB ], [ %TrueReg, StartMBB ]
// ...
MBB = JoinMBB;
- BuildMI(*MBB, MBB->begin(), DL, TII->get(SystemZ::PHI), DestReg)
+ BuildMI(*MBB, MI, DL, TII->get(SystemZ::PHI), DestReg)
.addReg(TrueReg).addMBB(StartMBB)
.addReg(FalseReg).addMBB(FalseMBB);
CCMask ^= CCValid;
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
// StartMBB:
// Insert a basic block for the main loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
// StartMBB:
// Insert 3 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
// Insert 2 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
MachineBasicBlock *MBB,
unsigned Opcode) const {
const SystemZInstrInfo *TII = TM.getInstrInfo();
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
DebugLoc DL = MI->getDebugLoc();
- MachineOperand DestBase = MI->getOperand(0);
+ MachineOperand DestBase = earlyUseOperand(MI->getOperand(0));
uint64_t DestDisp = MI->getOperand(1).getImm();
- MachineOperand SrcBase = MI->getOperand(2);
+ MachineOperand SrcBase = earlyUseOperand(MI->getOperand(2));
uint64_t SrcDisp = MI->getOperand(3).getImm();
uint64_t Length = MI->getOperand(4).getImm();
- BuildMI(*MBB, MI, DL, TII->get(Opcode))
- .addOperand(DestBase).addImm(DestDisp).addImm(Length)
- .addOperand(SrcBase).addImm(SrcDisp);
+ // Check for the loop form, in which operand 5 is the trip count.
+ if (MI->getNumExplicitOperands() > 5) {
+ bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
+
+ uint64_t StartCountReg = MI->getOperand(5).getReg();
+ uint64_t StartSrcReg = forceReg(MI, SrcBase, TII);
+ uint64_t StartDestReg = (HaveSingleBase ? StartSrcReg :
+ forceReg(MI, DestBase, TII));
+
+ const TargetRegisterClass *RC = &SystemZ::ADDR64BitRegClass;
+ uint64_t ThisSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t ThisDestReg = (HaveSingleBase ? ThisSrcReg :
+ MRI.createVirtualRegister(RC));
+ uint64_t NextSrcReg = MRI.createVirtualRegister(RC);
+ uint64_t NextDestReg = (HaveSingleBase ? NextSrcReg :
+ MRI.createVirtualRegister(RC));
+
+ RC = &SystemZ::GR64BitRegClass;
+ uint64_t ThisCountReg = MRI.createVirtualRegister(RC);
+ uint64_t NextCountReg = MRI.createVirtualRegister(RC);
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+
+ // StartMBB:
+ // # fall through to LoopMMB
+ MBB->addSuccessor(LoopMBB);
+
+ // LoopMBB:
+ // %ThisDestReg = phi [ %StartDestReg, StartMBB ],
+ // [ %NextDestReg, LoopMBB ]
+ // %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
+ // [ %NextSrcReg, LoopMBB ]
+ // %ThisCountReg = phi [ %StartCountReg, StartMBB ],
+ // [ %NextCountReg, LoopMBB ]
+ // PFD 2, 768+DestDisp(%ThisDestReg)
+ // Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
+ // %NextDestReg = LA 256(%ThisDestReg)
+ // %NextSrcReg = LA 256(%ThisSrcReg)
+ // %NextCountReg = AGHI %ThisCountReg, -1
+ // CGHI %NextCountReg, 0
+ // JLH LoopMBB
+ // # fall through to DoneMMB
+ //
+ // The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
+ MBB = LoopMBB;
+
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
+ .addReg(StartDestReg).addMBB(StartMBB)
+ .addReg(NextDestReg).addMBB(LoopMBB);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
+ .addReg(StartSrcReg).addMBB(StartMBB)
+ .addReg(NextSrcReg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
+ .addReg(StartCountReg).addMBB(StartMBB)
+ .addReg(NextCountReg).addMBB(LoopMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::PFD))
+ .addImm(SystemZ::PFD_WRITE)
+ .addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
+ BuildMI(MBB, DL, TII->get(Opcode))
+ .addReg(ThisDestReg).addImm(DestDisp).addImm(256)
+ .addReg(ThisSrcReg).addImm(SrcDisp);
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
+ .addReg(ThisDestReg).addImm(256).addReg(0);
+ if (!HaveSingleBase)
+ BuildMI(MBB, DL, TII->get(SystemZ::LA), NextSrcReg)
+ .addReg(ThisSrcReg).addImm(256).addReg(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::AGHI), NextCountReg)
+ .addReg(ThisCountReg).addImm(-1);
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(NextCountReg).addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
+ .addMBB(LoopMBB);
+ MBB->addSuccessor(LoopMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ DestBase = MachineOperand::CreateReg(NextDestReg, false);
+ SrcBase = MachineOperand::CreateReg(NextSrcReg, false);
+ Length &= 255;
+ MBB = DoneMBB;
+ }
+ // Handle any remaining bytes with straight-line code.
+ while (Length > 0) {
+ uint64_t ThisLength = std::min(Length, uint64_t(256));
+ // The previous iteration might have created out-of-range displacements.
+ // Apply them using LAY if so.
+ if (!isUInt<12>(DestDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(DestBase).addImm(DestDisp).addReg(0);
+ DestBase = MachineOperand::CreateReg(Reg, false);
+ DestDisp = 0;
+ }
+ if (!isUInt<12>(SrcDisp)) {
+ unsigned Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(SystemZ::LAY), Reg)
+ .addOperand(SrcBase).addImm(SrcDisp).addReg(0);
+ SrcBase = MachineOperand::CreateReg(Reg, false);
+ SrcDisp = 0;
+ }
+ BuildMI(*MBB, MI, DL, TII->get(Opcode))
+ .addOperand(DestBase).addImm(DestDisp).addImm(ThisLength)
+ .addOperand(SrcBase).addImm(SrcDisp);
+ DestDisp += ThisLength;
+ SrcDisp += ThisLength;
+ Length -= ThisLength;
+ }
MI->eraseFromParent();
return MBB;
uint64_t End2Reg = MRI.createVirtualRegister(RC);
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockAfter(MI, MBB);
+ MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
// StartMBB:
case SystemZ::ATOMIC_CMP_SWAPW:
return emitAtomicCmpSwapW(MI, MBB);
- case SystemZ::MVCWrapper:
+ case SystemZ::MVCSequence:
+ case SystemZ::MVCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::MVC);
- case SystemZ::CLCWrapper:
+ case SystemZ::CLCSequence:
+ case SystemZ::CLCLoop:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
UDIVREM32,
UDIVREM64,
- // Use MVC to copy bytes from one memory location to another.
- // The first operand is the target address, the second operand is the
- // source address, and the third operand is the constant length.
+ // Use a series of MVCs to copy bytes from one memory location to another.
+ // The operands are:
+ // - the target address
+ // - the source address
+ // - the constant length
+ //
// This isn't a memory opcode because we'd need to attach two
// MachineMemOperands rather than one.
MVC,
+ // Like MVC, but implemented as a loop that handles X*256 bytes
+ // followed by straight-line code to handle the rest (if any).
+ // The value of X is passed as an additional operand.
+ MVC_LOOP,
+
// Use CLC to compare two blocks of memory, with the same comments
- // as for MVC.
+ // as for MVC and MVC_LOOP.
CLC,
+ CLC_LOOP,
// Use an MVST-based sequence to implement stpcpy().
STPCPY,
def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_high),
(EXTRACT_SUBREG FP128:$src2, subreg_high))>;
-defm LoadStoreF32 : MVCLoadStore<load, store, f32, MVCWrapper, 4>;
-defm LoadStoreF64 : MVCLoadStore<load, store, f64, MVCWrapper, 8>;
-defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCWrapper, 16>;
+defm LoadStoreF32 : MVCLoadStore<load, store, f32, MVCSequence, 4>;
+defm LoadStoreF64 : MVCLoadStore<load, store, f64, MVCSequence, 8>;
+defm LoadStoreF128 : MVCLoadStore<load, store, f128, MVCSequence, 16>;
//===----------------------------------------------------------------------===//
// Load instructions
class AtomicLoadWBinaryImm<SDPatternOperator operator, Immediate imm>
: AtomicLoadWBinary<operator, (i32 imm:$src2), imm>;
-// Define an instruction that operates on two fixed-length blocks of memory.
-// The real instruction uses a bdladdr12onlylen8 for the first operand and a
-// bdaddr12only for the second, with the length of the second operand being
-// implicitly the same as the first. This arrangement matches the underlying
-// assembly syntax. However, for instruction selection it's easier to have
-// two normal bdaddr12onlys and a separate length operand, so define a pseudo
-// instruction for that too.
+// Define an instruction that operates on two fixed-length blocks of memory,
+// and associated pseudo instructions for operating on blocks of any size.
+// The Sequence form uses a straight-line sequence of instructions and
+// the Loop form uses a loop of length-256 instructions followed by
+// another instruction to handle the excess.
multiclass MemorySS<string mnemonic, bits<8> opcode,
- SDPatternOperator operator> {
+ SDPatternOperator sequence, SDPatternOperator loop> {
def "" : InstSS<opcode, (outs), (ins bdladdr12onlylen8:$BDL1,
bdaddr12only:$BD2),
mnemonic##"\t$BDL1, $BD2", []>;
- let usesCustomInserter = 1 in
- def Wrapper : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
- imm32len8:$length),
- [(operator bdaddr12only:$dest, bdaddr12only:$src,
- imm32len8:$length)]>;
+ let usesCustomInserter = 1 in {
+ def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length),
+ [(sequence bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length)]>;
+ def Loop : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256),
+ [(loop bdaddr12only:$dest, bdaddr12only:$src,
+ imm64:$length, GR64:$count256)]>;
+ }
}
// Define an instruction that operates on two strings, both terminated
// Memory-to-memory moves.
let mayLoad = 1, mayStore = 1 in
- defm MVC : MemorySS<"mvc", 0xD2, z_mvc>;
+ defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>;
// String moves.
let mayLoad = 1, mayStore = 1, Defs = [CC], Uses = [R0W] in
defm MVST : StringRRE<"mvst", 0xB255, z_stpcpy>;
defm LoadStore8_32 : MVCLoadStore<anyextloadi8, truncstorei8, i32,
- MVCWrapper, 1>;
+ MVCSequence, 1>;
defm LoadStore16_32 : MVCLoadStore<anyextloadi16, truncstorei16, i32,
- MVCWrapper, 2>;
-defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCWrapper, 4>;
+ MVCSequence, 2>;
+defm LoadStore32_32 : MVCLoadStore<load, store, i32, MVCSequence, 4>;
defm LoadStore8 : MVCLoadStore<anyextloadi8, truncstorei8, i64,
- MVCWrapper, 1>;
+ MVCSequence, 1>;
defm LoadStore16 : MVCLoadStore<anyextloadi16, truncstorei16, i64,
- MVCWrapper, 2>;
+ MVCSequence, 2>;
defm LoadStore32 : MVCLoadStore<anyextloadi32, truncstorei32, i64,
- MVCWrapper, 4>;
-defm LoadStore64 : MVCLoadStore<load, store, i64, MVCWrapper, 8>;
+ MVCSequence, 4>;
+defm LoadStore64 : MVCLoadStore<load, store, i64, MVCSequence, 8>;
//===----------------------------------------------------------------------===//
// Sign extensions
// Memory-to-memory comparison.
let mayLoad = 1, Defs = [CC] in
- defm CLC : MemorySS<"clc", 0xD5, z_clc>;
+ defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>;
// String comparison.
let mayLoad = 1, Defs = [CC], Uses = [R0W] in
// i32 immediates
//===----------------------------------------------------------------------===//
-// Immediates for 8-bit lengths.
-def imm32len8 : Immediate<i32, [{
- return isUInt<8>(N->getZExtValue() - 1);
-}], NOOP_SDNodeXForm, "U32Imm">;
-
// Immediates for the lower and upper 16 bits of an i32, with the other
// bits of the i32 being zero.
def imm32ll16 : Immediate<i32, [{
return isUInt<32>(-N->getSExtValue());
}], NEGIMM32, "U32Imm">;
-def imm64 : ImmLeaf<i64, [{}]>;
+def imm64 : ImmLeaf<i64, [{}]>, Operand<i64>;
//===----------------------------------------------------------------------===//
// Floating-point immediates
def SDT_ZMemMemLength : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
- SDTCisVT<2, i32>]>;
+ SDTCisVT<2, i64>]>;
+def SDT_ZMemMemLoop : SDTypeProfile<0, 4,
+ [SDTCisPtrTy<0>,
+ SDTCisPtrTy<1>,
+ SDTCisVT<2, i64>,
+ SDTCisVT<3, i64>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
def z_mvc : SDNode<"SystemZISD::MVC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
+def z_mvc_loop : SDNode<"SystemZISD::MVC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
+def z_clc_loop : SDNode<"SystemZISD::CLC_LOOP", SDT_ZMemMemLoop,
+ [SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZString,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
SystemZSelectionDAGInfo::~SystemZSelectionDAGInfo() {
}
+// Use MVC to copy Size bytes from Src to Dest, deciding whether to use
+// a loop or straight-line code.
+static SDValue emitMVC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
+ SDValue Dst, SDValue Src, uint64_t Size) {
+ EVT PtrVT = Src.getValueType();
+ // The heuristic we use is to prefer loops for anything that would
+ // require 7 or more MVCs. With these kinds of sizes there isn't
+ // much to choose between straight-line code and looping code,
+ // since the time will be dominated by the MVCs themselves.
+ // However, the loop has 4 or 5 instructions (depending on whether
+ // the base addresses can be proved equal), so there doesn't seem
+ // much point using a loop for 5 * 256 bytes or fewer. Anything in
+ // the range (5 * 256, 6 * 256) will need another instruction after
+ // the loop, so it doesn't seem worth using a loop then either.
+ // The next value up, 6 * 256, can be implemented in the same
+ // number of straight-line MVCs as 6 * 256 - 1.
+ if (Size > 6 * 256)
+ return DAG.getNode(SystemZISD::MVC_LOOP, DL, MVT::Other, Chain, Dst, Src,
+ DAG.getConstant(Size, PtrVT),
+ DAG.getConstant(Size / 256, PtrVT));
+ return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst, Src,
+ DAG.getConstant(Size, PtrVT));
+}
+
SDValue SystemZSelectionDAGInfo::
EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size, unsigned Align,
if (IsVolatile)
return SDValue();
- if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
- uint64_t Bytes = CSize->getZExtValue();
- if (Bytes >= 1 && Bytes <= 0x100) {
- // A single MVC.
- return DAG.getNode(SystemZISD::MVC, DL, MVT::Other,
- Chain, Dst, Src, Size);
- }
- }
+ if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size))
+ return emitMVC(DAG, DL, Chain, Dst, Src, CSize->getZExtValue());
return SDValue();
}
SDValue Dst, SDValue Byte, SDValue Size,
unsigned Align, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const {
- EVT DstVT = Dst.getValueType();
+ EVT PtrVT = Dst.getValueType();
if (IsVolatile)
return SDValue();
Align, DstPtrInfo);
if (Size2 == 0)
return Chain1;
- Dst = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(Size1, DstVT));
+ Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(Size1, PtrVT));
DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
std::min(Align, Size1), DstPtrInfo);
false, false, Align);
if (Bytes == 1)
return Chain1;
- SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(1, DstVT));
+ SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, PtrVT));
SDValue Chain2 = DAG.getStore(Chain, DL, Byte, Dst2,
DstPtrInfo.getWithOffset(1),
false, false, 1);
}
}
assert(Bytes >= 2 && "Should have dealt with 0- and 1-byte cases already");
- if (Bytes <= 0x101) {
- // Copy the byte to the first location and then use MVC to copy
- // it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
- false, false, Align);
- SDValue Dst2 = DAG.getNode(ISD::ADD, DL, DstVT, Dst,
- DAG.getConstant(1, DstVT));
- return DAG.getNode(SystemZISD::MVC, DL, MVT::Other, Chain, Dst2, Dst,
- DAG.getConstant(Bytes - 1, MVT::i32));
- }
+ // Copy the byte to the first location and then use MVC to copy
+ // it to the rest.
+ Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo,
+ false, false, Align);
+ SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
+ DAG.getConstant(1, PtrVT));
+ return emitMVC(DAG, DL, Chain, DstPlus1, Dst, Bytes - 1);
}
return SDValue();
}
SDValue Src1, SDValue Src2, SDValue Size,
MachinePointerInfo Op1PtrInfo,
MachinePointerInfo Op2PtrInfo) const {
+ EVT PtrVT = Src1.getValueType();
if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
uint64_t Bytes = CSize->getZExtValue();
if (Bytes >= 1 && Bytes <= 0x100) {
// A single CLC.
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
- Src1, Src2, Size);
+ Src1, Src2, Size, DAG.getConstant(0, PtrVT));
SDValue Glue = Chain.getValue(1);
return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
}
declare void @llvm.memcpy.p0i8.p0i8.i32(i8 *nocapture, i8 *nocapture, i32, i32, i1) nounwind
declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i32, i1) nounwind
+declare void @foo(i8 *, i8 *)
+; Test a no-op move, i32 version.
define void @f1(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f1:
; CHECK-NOT: %r2
ret void
}
+; Test a no-op move, i64 version.
define void @f2(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f2:
; CHECK-NOT: %r2
ret void
}
+; Test a 1-byte move, i32 version.
define void @f3(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f3:
; CHECK: mvc 0(1,%r2), 0(%r3)
ret void
}
+; Test a 1-byte move, i64 version.
define void @f4(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f4:
; CHECK: mvc 0(1,%r2), 0(%r3)
ret void
}
+; Test the upper range of a single MVC, i32 version.
define void @f5(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f5:
; CHECK: mvc 0(256,%r2), 0(%r3)
ret void
}
+; Test the upper range of a single MVC, i64 version.
define void @f6(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f6:
; CHECK: mvc 0(256,%r2), 0(%r3)
ret void
}
-; 257 bytes is too big for a single MVC. For now expect none, so that
-; the test fails and gets updated when large copies are implemented.
+; Test the first case that needs two MVCs.
define void @f7(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f7:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(1,%r2), 256(%r3)
; CHECK: br %r14
call void @llvm.memcpy.p0i8.p0i8.i32(i8 *%dest, i8 *%src, i32 257, i32 1,
i1 false)
ret void
}
+; Test the last-but-one case that needs two MVCs.
define void @f8(i8 *%dest, i8 *%src) {
; CHECK-LABEL: f8:
-; CHECK-NOT: mvc
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(255,%r2), 256(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 511, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the last case that needs two MVCs.
+define void @f9(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f9:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 512, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test an arbitrary value that uses straight-line code.
+define void @f10(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f10:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(255,%r2), 1024(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again in cases where not all parts are in range of MVC.
+define void @f11(i8 *%srcbase, i8 *%destbase) {
+; CHECK-LABEL: f11:
+; CHECK: mvc 4000(256,%r2), 3500(%r3)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4256(%r2)
+; CHECK: mvc 0(256,[[NEWDEST]]), 3756(%r3)
+; CHECK: mvc 256(256,[[NEWDEST]]), 4012(%r3)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4268(%r3)
+; CHECK: mvc 512(256,[[NEWDEST]]), 0([[NEWSRC]])
+; CHECK: mvc 768(255,[[NEWDEST]]), 256([[NEWSRC]])
+; CHECK: br %r14
+ %dest = getelementptr i8 *%srcbase, i64 4000
+ %src = getelementptr i8* %destbase, i64 3500
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again with a destination frame base that goes out of range.
+define void @f12() {
+; CHECK-LABEL: f12:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 4076(256,%r15), 2100(%r15)
+; CHECK: lay [[NEWDEST:%r[1-5]]], 4332(%r15)
+; CHECK: mvc 0(256,[[NEWDEST]]), 2356(%r15)
+; CHECK: mvc 256(256,[[NEWDEST]]), 2612(%r15)
+; CHECK: mvc 512(256,[[NEWDEST]]), 2868(%r15)
+; CHECK: mvc 768(255,[[NEWDEST]]), 3124(%r15)
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+ %arr = alloca [6000 x i8]
+ %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 3900
+ %src = getelementptr [6000 x i8] *%arr, i64 0, i64 1924
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
+ ret void
+}
+
+; ...and again with a source frame base that goes out of range.
+define void @f13() {
+; CHECK-LABEL: f13:
+; CHECK: brasl %r14, foo@PLT
+; CHECK: mvc 200(256,%r15), 3826(%r15)
+; CHECK: mvc 456(256,%r15), 4082(%r15)
+; CHECK: lay [[NEWSRC:%r[1-5]]], 4338(%r15)
+; CHECK: mvc 712(256,%r15), 0([[NEWSRC]])
+; CHECK: mvc 968(256,%r15), 256([[NEWSRC]])
+; CHECK: mvc 1224(255,%r15), 512([[NEWSRC]])
+; CHECK: brasl %r14, foo@PLT
+; CHECK: br %r14
+ %arr = alloca [6000 x i8]
+ %dest = getelementptr [6000 x i8] *%arr, i64 0, i64 24
+ %src = getelementptr [6000 x i8] *%arr, i64 0, i64 3650
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1279, i32 1,
+ i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
+ ret void
+}
+
+; Test the last case that is done using straight-line code.
+define void @f14(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f14:
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: mvc 256(256,%r2), 256(%r3)
+; CHECK: mvc 512(256,%r2), 512(%r3)
+; CHECK: mvc 768(256,%r2), 768(%r3)
+; CHECK: mvc 1024(256,%r2), 1024(%r3)
+; CHECK: mvc 1280(256,%r2), 1280(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1536, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the first case that is done using a loop.
+define void @f15(i8 *%dest, i8 *%src) {
+; CHECK-LABEL: f15:
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 768(%r2)
+; CHECK: mvc 0(256,%r2), 0(%r3)
+; CHECK: la %r2, 256(%r2)
+; CHECK: la %r3, 256(%r3)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 0(1,%r2), 0(%r3)
+; CHECK: br %r14
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
+ i1 false)
+ ret void
+}
+
+; ...and again with frame bases, where the base must be loaded into a
+; register before the loop.
+define void @f16() {
+; CHECK-LABEL: f16:
+; CHECK: brasl %r14, foo@PLT
+; CHECK-DAG: lghi [[COUNT:%r[0-5]]], 6
+; CHECK-DAG: la [[BASE:%r[0-5]]], 160(%r15)
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 2368([[BASE]])
+; CHECK: mvc 1600(256,[[BASE]]), 0([[BASE]])
+; CHECK: la [[BASE]], 256([[BASE]])
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1600(1,[[BASE]]), 0([[BASE]])
+; CHECK: brasl %r14, foo@PLT
; CHECK: br %r14
- call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 257, i32 1,
+ %arr = alloca [3200 x i8]
+ %dest = getelementptr [3200 x i8] *%arr, i64 0, i64 1600
+ %src = getelementptr [3200 x i8] *%arr, i64 0, i64 0
+ call void @foo(i8 *%dest, i8 *%src)
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8 *%dest, i8 *%src, i64 1537, i32 1,
i1 false)
+ call void @foo(i8 *%dest, i8 *%src)
ret void
}
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f11(i8 *%dest, i8 %val) {
; CHECK-LABEL: f11:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 %val, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f12(i8 *%dest, i8 %val) {
; CHECK-LABEL: f12:
-; CHECK-NOT: mvc
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 258, i32 1, i1 false)
ret void
}
+
+; Test the largest case for which straight-line code is used.
+define void @f13(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f13:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(256,%r2), 256(%r2)
+; CHECK: mvc 513(256,%r2), 512(%r2)
+; CHECK: mvc 769(256,%r2), 768(%r2)
+; CHECK: mvc 1025(256,%r2), 1024(%r2)
+; CHECK: mvc 1281(256,%r2), 1280(%r2)
+; CHECK: br %r14
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1537, i32 1,
+ i1 false)
+ ret void
+}
+
+; Test the next size up, which uses a loop. We leave the other corner
+; cases to memcpy-01.ll.
+define void @f14(i8 *%dest, i8 %val) {
+; CHECK-LABEL: f14:
+; CHECK: stc %r3, 0(%r2)
+; CHECK: lghi [[COUNT:%r[0-5]]], 6
+; CHECK: [[LABEL:\.L[^:]*]]:
+; CHECK: pfd 2, 769(%r2)
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: la %r2, 256(%r2)
+; CHECK: brctg [[COUNT]], [[LABEL]]
+; CHECK: mvc 1(1,%r2), 0(%r2)
+; CHECK: br %r14
+ call void @llvm.memset.p0i8.i64(i8 *%dest, i8 %val, i64 1538, i32 1,
+ i1 false)
+ ret void
+}
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f15(i8 *%dest) {
; CHECK-LABEL: f15:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 128, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f16(i8 *%dest) {
; CHECK-LABEL: f16:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 128
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 128, i64 258, i32 1, i1 false)
ret void
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f39(i8 *%dest) {
; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 0
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 0, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f40(i8 *%dest) {
; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 0
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 0, i64 258, i32 1, i1 false)
ret void
ret void
}
-; 258 bytes, i32 version. 258 bytes is too big for a single MVC.
-; For now expect none, so that the test fails and gets updated when
-; large copies are implemented.
+; 258 bytes, i32 version. We need two MVCs.
define void @f39(i8 *%dest) {
; CHECK-LABEL: f39:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8 *%dest, i8 -1, i32 258, i32 1, i1 false)
ret void
}
-; 258 bytes, i64 version, with the same comments as above.
+; 258 bytes, i64 version.
define void @f40(i8 *%dest) {
; CHECK-LABEL: f40:
-; CHECK-NOT: mvc
+; CHECK: mvi 0(%r2), 255
+; CHECK: mvc 1(256,%r2), 0(%r2)
+; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8 *%dest, i8 -1, i64 258, i32 1, i1 false)
ret void